{ "best_metric": 0.2612117528915405, "best_model_checkpoint": "xblock-social-screenshots-5/checkpoint-6738", "epoch": 3.0, "eval_steps": 500, "global_step": 6738, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 14.132967948913574, "learning_rate": 1.8545994065281898e-06, "loss": 0.6958, "step": 25 }, { "epoch": 0.02, "grad_norm": 8.136152267456055, "learning_rate": 3.6350148367952525e-06, "loss": 0.6975, "step": 50 }, { "epoch": 0.03, "grad_norm": 0.9680777788162231, "learning_rate": 5.489614243323442e-06, "loss": 0.5213, "step": 75 }, { "epoch": 0.04, "grad_norm": 4.900412559509277, "learning_rate": 7.270029673590505e-06, "loss": 0.4804, "step": 100 }, { "epoch": 0.06, "grad_norm": 3.952802896499634, "learning_rate": 9.124629080118695e-06, "loss": 0.4743, "step": 125 }, { "epoch": 0.07, "grad_norm": 14.785197257995605, "learning_rate": 1.0979228486646884e-05, "loss": 0.5276, "step": 150 }, { "epoch": 0.08, "grad_norm": 13.082988739013672, "learning_rate": 1.2833827893175073e-05, "loss": 0.5019, "step": 175 }, { "epoch": 0.09, "grad_norm": 8.51102352142334, "learning_rate": 1.4688427299703264e-05, "loss": 0.6297, "step": 200 }, { "epoch": 0.1, "grad_norm": 5.031483173370361, "learning_rate": 1.6543026706231455e-05, "loss": 0.4062, "step": 225 }, { "epoch": 0.11, "grad_norm": 7.973475456237793, "learning_rate": 1.8397626112759644e-05, "loss": 0.5997, "step": 250 }, { "epoch": 0.12, "grad_norm": 11.180716514587402, "learning_rate": 2.0252225519287833e-05, "loss": 0.478, "step": 275 }, { "epoch": 0.13, "grad_norm": 4.953334331512451, "learning_rate": 2.2106824925816026e-05, "loss": 0.3911, "step": 300 }, { "epoch": 0.14, "grad_norm": 7.498977184295654, "learning_rate": 2.3961424332344215e-05, "loss": 0.477, "step": 325 }, { "epoch": 0.16, "grad_norm": 0.19468317925930023, "learning_rate": 2.58160237388724e-05, "loss": 0.338, "step": 350 }, { "epoch": 0.17, "grad_norm": 0.9361104965209961, "learning_rate": 2.7670623145400593e-05, "loss": 0.5228, "step": 375 }, { "epoch": 0.18, "grad_norm": 1.6056278944015503, "learning_rate": 2.9525222551928783e-05, "loss": 0.526, "step": 400 }, { "epoch": 0.19, "grad_norm": 6.435601234436035, "learning_rate": 3.137982195845697e-05, "loss": 0.3893, "step": 425 }, { "epoch": 0.2, "grad_norm": 0.40519052743911743, "learning_rate": 3.323442136498516e-05, "loss": 0.3945, "step": 450 }, { "epoch": 0.21, "grad_norm": 0.11996802687644958, "learning_rate": 3.508902077151335e-05, "loss": 0.5464, "step": 475 }, { "epoch": 0.22, "grad_norm": 5.791567325592041, "learning_rate": 3.6943620178041546e-05, "loss": 0.6363, "step": 500 }, { "epoch": 0.23, "grad_norm": 0.3942394256591797, "learning_rate": 3.8798219584569735e-05, "loss": 0.368, "step": 525 }, { "epoch": 0.24, "grad_norm": 11.67949390411377, "learning_rate": 4.0652818991097924e-05, "loss": 0.4232, "step": 550 }, { "epoch": 0.26, "grad_norm": 6.389534950256348, "learning_rate": 4.2507418397626114e-05, "loss": 0.4988, "step": 575 }, { "epoch": 0.27, "grad_norm": 22.40272331237793, "learning_rate": 4.43620178041543e-05, "loss": 0.707, "step": 600 }, { "epoch": 0.28, "grad_norm": 3.8825833797454834, "learning_rate": 4.621661721068249e-05, "loss": 0.576, "step": 625 }, { "epoch": 0.29, "grad_norm": 3.9929873943328857, "learning_rate": 4.807121661721069e-05, "loss": 0.4785, "step": 650 }, { "epoch": 0.3, "grad_norm": 8.819046974182129, "learning_rate": 4.992581602373888e-05, "loss": 0.5884, "step": 675 }, { "epoch": 0.31, "grad_norm": 10.694670677185059, "learning_rate": 4.9802110817941956e-05, "loss": 0.5243, "step": 700 }, { "epoch": 0.32, "grad_norm": 3.784719944000244, "learning_rate": 4.9595976253298154e-05, "loss": 0.6428, "step": 725 }, { "epoch": 0.33, "grad_norm": 7.733468055725098, "learning_rate": 4.938984168865435e-05, "loss": 0.612, "step": 750 }, { "epoch": 0.35, "grad_norm": 7.867452144622803, "learning_rate": 4.918370712401056e-05, "loss": 0.582, "step": 775 }, { "epoch": 0.36, "grad_norm": 12.518653869628906, "learning_rate": 4.8977572559366755e-05, "loss": 0.569, "step": 800 }, { "epoch": 0.37, "grad_norm": 7.27546501159668, "learning_rate": 4.877143799472296e-05, "loss": 0.6449, "step": 825 }, { "epoch": 0.38, "grad_norm": 16.084596633911133, "learning_rate": 4.856530343007916e-05, "loss": 0.6471, "step": 850 }, { "epoch": 0.39, "grad_norm": 4.305793285369873, "learning_rate": 4.8359168865435357e-05, "loss": 0.5507, "step": 875 }, { "epoch": 0.4, "grad_norm": 17.53583335876465, "learning_rate": 4.815303430079156e-05, "loss": 0.4701, "step": 900 }, { "epoch": 0.41, "grad_norm": 6.12671422958374, "learning_rate": 4.794689973614776e-05, "loss": 0.551, "step": 925 }, { "epoch": 0.42, "grad_norm": 2.3855419158935547, "learning_rate": 4.774076517150396e-05, "loss": 0.6347, "step": 950 }, { "epoch": 0.43, "grad_norm": 7.226621627807617, "learning_rate": 4.7534630606860156e-05, "loss": 0.3186, "step": 975 }, { "epoch": 0.45, "grad_norm": 7.834379196166992, "learning_rate": 4.732849604221636e-05, "loss": 0.5085, "step": 1000 }, { "epoch": 0.46, "grad_norm": 0.51530522108078, "learning_rate": 4.7122361477572566e-05, "loss": 0.3593, "step": 1025 }, { "epoch": 0.47, "grad_norm": 1.3713924884796143, "learning_rate": 4.6916226912928764e-05, "loss": 0.5763, "step": 1050 }, { "epoch": 0.48, "grad_norm": 1.3441976308822632, "learning_rate": 4.671009234828496e-05, "loss": 0.6034, "step": 1075 }, { "epoch": 0.49, "grad_norm": 14.422416687011719, "learning_rate": 4.650395778364116e-05, "loss": 0.4459, "step": 1100 }, { "epoch": 0.5, "grad_norm": 4.849581241607666, "learning_rate": 4.6297823218997365e-05, "loss": 0.4681, "step": 1125 }, { "epoch": 0.51, "grad_norm": 11.974592208862305, "learning_rate": 4.6091688654353563e-05, "loss": 0.5693, "step": 1150 }, { "epoch": 0.52, "grad_norm": 8.925352096557617, "learning_rate": 4.588555408970976e-05, "loss": 0.4427, "step": 1175 }, { "epoch": 0.53, "grad_norm": 7.595994472503662, "learning_rate": 4.5679419525065967e-05, "loss": 0.3736, "step": 1200 }, { "epoch": 0.55, "grad_norm": 8.393537521362305, "learning_rate": 4.5473284960422165e-05, "loss": 0.6534, "step": 1225 }, { "epoch": 0.56, "grad_norm": 6.042502403259277, "learning_rate": 4.526715039577837e-05, "loss": 0.4326, "step": 1250 }, { "epoch": 0.57, "grad_norm": 13.696694374084473, "learning_rate": 4.506101583113457e-05, "loss": 0.5176, "step": 1275 }, { "epoch": 0.58, "grad_norm": 4.518089771270752, "learning_rate": 4.4854881266490766e-05, "loss": 0.4211, "step": 1300 }, { "epoch": 0.59, "grad_norm": 11.04053783416748, "learning_rate": 4.4648746701846964e-05, "loss": 0.5664, "step": 1325 }, { "epoch": 0.6, "grad_norm": 16.009056091308594, "learning_rate": 4.444261213720316e-05, "loss": 0.5545, "step": 1350 }, { "epoch": 0.61, "grad_norm": 4.595952033996582, "learning_rate": 4.423647757255937e-05, "loss": 0.4318, "step": 1375 }, { "epoch": 0.62, "grad_norm": 5.188466548919678, "learning_rate": 4.403034300791557e-05, "loss": 0.4286, "step": 1400 }, { "epoch": 0.63, "grad_norm": 6.234115123748779, "learning_rate": 4.382420844327177e-05, "loss": 0.5876, "step": 1425 }, { "epoch": 0.65, "grad_norm": 8.778355598449707, "learning_rate": 4.361807387862797e-05, "loss": 0.6304, "step": 1450 }, { "epoch": 0.66, "grad_norm": 9.595151901245117, "learning_rate": 4.3411939313984173e-05, "loss": 0.5844, "step": 1475 }, { "epoch": 0.67, "grad_norm": 9.795525550842285, "learning_rate": 4.320580474934037e-05, "loss": 0.4714, "step": 1500 }, { "epoch": 0.68, "grad_norm": 14.277398109436035, "learning_rate": 4.299967018469657e-05, "loss": 0.5627, "step": 1525 }, { "epoch": 0.69, "grad_norm": 5.245518684387207, "learning_rate": 4.2793535620052775e-05, "loss": 0.5474, "step": 1550 }, { "epoch": 0.7, "grad_norm": 6.895930290222168, "learning_rate": 4.258740105540897e-05, "loss": 0.4686, "step": 1575 }, { "epoch": 0.71, "grad_norm": 7.74411153793335, "learning_rate": 4.238126649076518e-05, "loss": 0.5021, "step": 1600 }, { "epoch": 0.72, "grad_norm": 2.97990083694458, "learning_rate": 4.2175131926121376e-05, "loss": 0.4173, "step": 1625 }, { "epoch": 0.73, "grad_norm": 8.026514053344727, "learning_rate": 4.1968997361477574e-05, "loss": 0.3728, "step": 1650 }, { "epoch": 0.75, "grad_norm": 0.3547731339931488, "learning_rate": 4.176286279683377e-05, "loss": 0.4983, "step": 1675 }, { "epoch": 0.76, "grad_norm": 4.483277797698975, "learning_rate": 4.155672823218997e-05, "loss": 0.5235, "step": 1700 }, { "epoch": 0.77, "grad_norm": 7.000768184661865, "learning_rate": 4.1350593667546175e-05, "loss": 0.4955, "step": 1725 }, { "epoch": 0.78, "grad_norm": 4.905660152435303, "learning_rate": 4.114445910290238e-05, "loss": 0.4803, "step": 1750 }, { "epoch": 0.79, "grad_norm": 4.236353397369385, "learning_rate": 4.093832453825858e-05, "loss": 0.5105, "step": 1775 }, { "epoch": 0.8, "grad_norm": 8.685340881347656, "learning_rate": 4.073218997361478e-05, "loss": 0.6345, "step": 1800 }, { "epoch": 0.81, "grad_norm": 2.6251416206359863, "learning_rate": 4.052605540897098e-05, "loss": 0.6859, "step": 1825 }, { "epoch": 0.82, "grad_norm": 13.661340713500977, "learning_rate": 4.031992084432718e-05, "loss": 0.3122, "step": 1850 }, { "epoch": 0.83, "grad_norm": 1.7971667051315308, "learning_rate": 4.011378627968338e-05, "loss": 0.5103, "step": 1875 }, { "epoch": 0.85, "grad_norm": 4.921170234680176, "learning_rate": 3.9907651715039576e-05, "loss": 0.4259, "step": 1900 }, { "epoch": 0.86, "grad_norm": 6.340487480163574, "learning_rate": 3.970151715039578e-05, "loss": 0.5088, "step": 1925 }, { "epoch": 0.87, "grad_norm": 8.182121276855469, "learning_rate": 3.9495382585751986e-05, "loss": 0.5638, "step": 1950 }, { "epoch": 0.88, "grad_norm": 3.3480095863342285, "learning_rate": 3.9289248021108184e-05, "loss": 0.5788, "step": 1975 }, { "epoch": 0.89, "grad_norm": 1.820002555847168, "learning_rate": 3.908311345646438e-05, "loss": 0.5043, "step": 2000 }, { "epoch": 0.9, "grad_norm": 5.409160614013672, "learning_rate": 3.887697889182058e-05, "loss": 0.3171, "step": 2025 }, { "epoch": 0.91, "grad_norm": 6.675960063934326, "learning_rate": 3.867084432717678e-05, "loss": 0.4536, "step": 2050 }, { "epoch": 0.92, "grad_norm": 2.4080889225006104, "learning_rate": 3.8464709762532984e-05, "loss": 0.5149, "step": 2075 }, { "epoch": 0.93, "grad_norm": 1.0316126346588135, "learning_rate": 3.825857519788918e-05, "loss": 0.6131, "step": 2100 }, { "epoch": 0.95, "grad_norm": 6.637559413909912, "learning_rate": 3.805244063324539e-05, "loss": 0.4527, "step": 2125 }, { "epoch": 0.96, "grad_norm": 5.249868869781494, "learning_rate": 3.7846306068601585e-05, "loss": 0.3764, "step": 2150 }, { "epoch": 0.97, "grad_norm": 8.488882064819336, "learning_rate": 3.764017150395778e-05, "loss": 0.4237, "step": 2175 }, { "epoch": 0.98, "grad_norm": 1.0089231729507446, "learning_rate": 3.743403693931399e-05, "loss": 0.5939, "step": 2200 }, { "epoch": 0.99, "grad_norm": 0.9522852301597595, "learning_rate": 3.7227902374670186e-05, "loss": 0.4297, "step": 2225 }, { "epoch": 1.0, "eval_accuracy": 0.9071889606053861, "eval_f1_macro": 0.4640938055477737, "eval_f1_micro": 0.9071889606053861, "eval_f1_weighted": 0.8894145383985912, "eval_loss": 0.35839545726776123, "eval_precision_macro": 0.6916894534747811, "eval_precision_micro": 0.9071889606053861, "eval_precision_weighted": 0.8955058437354184, "eval_recall_macro": 0.4208919396551037, "eval_recall_micro": 0.9071889606053861, "eval_recall_weighted": 0.9071889606053861, "eval_runtime": 391.8611, "eval_samples_per_second": 11.466, "eval_steps_per_second": 0.717, "step": 2246 }, { "epoch": 1.0, "grad_norm": 4.51116943359375, "learning_rate": 3.7021767810026384e-05, "loss": 0.5182, "step": 2250 }, { "epoch": 1.01, "grad_norm": 4.062203884124756, "learning_rate": 3.681563324538258e-05, "loss": 0.4014, "step": 2275 }, { "epoch": 1.02, "grad_norm": 4.1213884353637695, "learning_rate": 3.660949868073879e-05, "loss": 0.3564, "step": 2300 }, { "epoch": 1.04, "grad_norm": 4.061648845672607, "learning_rate": 3.640336411609499e-05, "loss": 0.5185, "step": 2325 }, { "epoch": 1.05, "grad_norm": 5.865363597869873, "learning_rate": 3.619722955145119e-05, "loss": 0.5669, "step": 2350 }, { "epoch": 1.06, "grad_norm": 1.3118332624435425, "learning_rate": 3.599109498680739e-05, "loss": 0.3586, "step": 2375 }, { "epoch": 1.07, "grad_norm": 8.255626678466797, "learning_rate": 3.578496042216359e-05, "loss": 0.5938, "step": 2400 }, { "epoch": 1.08, "grad_norm": 11.17790412902832, "learning_rate": 3.557882585751979e-05, "loss": 0.3843, "step": 2425 }, { "epoch": 1.09, "grad_norm": 9.643479347229004, "learning_rate": 3.5380936675461745e-05, "loss": 0.5595, "step": 2450 }, { "epoch": 1.1, "grad_norm": 0.35103797912597656, "learning_rate": 3.5174802110817943e-05, "loss": 0.413, "step": 2475 }, { "epoch": 1.11, "grad_norm": 0.7915975451469421, "learning_rate": 3.496866754617414e-05, "loss": 0.3686, "step": 2500 }, { "epoch": 1.12, "grad_norm": 5.222273826599121, "learning_rate": 3.4762532981530347e-05, "loss": 0.3873, "step": 2525 }, { "epoch": 1.14, "grad_norm": 6.224874019622803, "learning_rate": 3.4556398416886545e-05, "loss": 0.5069, "step": 2550 }, { "epoch": 1.15, "grad_norm": 12.55453109741211, "learning_rate": 3.435026385224275e-05, "loss": 0.255, "step": 2575 }, { "epoch": 1.16, "grad_norm": 11.516471862792969, "learning_rate": 3.414412928759895e-05, "loss": 0.4712, "step": 2600 }, { "epoch": 1.17, "grad_norm": 1.0335161685943604, "learning_rate": 3.3937994722955146e-05, "loss": 0.4652, "step": 2625 }, { "epoch": 1.18, "grad_norm": 5.176596641540527, "learning_rate": 3.3731860158311344e-05, "loss": 0.3918, "step": 2650 }, { "epoch": 1.19, "grad_norm": 3.3881676197052, "learning_rate": 3.352572559366754e-05, "loss": 0.3869, "step": 2675 }, { "epoch": 1.2, "grad_norm": 4.0972514152526855, "learning_rate": 3.331959102902375e-05, "loss": 0.44, "step": 2700 }, { "epoch": 1.21, "grad_norm": 0.4464218020439148, "learning_rate": 3.311345646437995e-05, "loss": 0.3373, "step": 2725 }, { "epoch": 1.22, "grad_norm": 6.553329944610596, "learning_rate": 3.290732189973615e-05, "loss": 0.3585, "step": 2750 }, { "epoch": 1.24, "grad_norm": 6.4266157150268555, "learning_rate": 3.270118733509235e-05, "loss": 0.4454, "step": 2775 }, { "epoch": 1.25, "grad_norm": 0.8840596079826355, "learning_rate": 3.2495052770448553e-05, "loss": 0.396, "step": 2800 }, { "epoch": 1.26, "grad_norm": 6.651895523071289, "learning_rate": 3.228891820580475e-05, "loss": 0.4029, "step": 2825 }, { "epoch": 1.27, "grad_norm": 8.035750389099121, "learning_rate": 3.208278364116095e-05, "loss": 0.5253, "step": 2850 }, { "epoch": 1.28, "grad_norm": 0.7661600112915039, "learning_rate": 3.187664907651715e-05, "loss": 0.498, "step": 2875 }, { "epoch": 1.29, "grad_norm": 3.3388662338256836, "learning_rate": 3.167051451187335e-05, "loss": 0.4511, "step": 2900 }, { "epoch": 1.3, "grad_norm": 4.266098499298096, "learning_rate": 3.146437994722956e-05, "loss": 0.5038, "step": 2925 }, { "epoch": 1.31, "grad_norm": 9.547815322875977, "learning_rate": 3.1258245382585756e-05, "loss": 0.3055, "step": 2950 }, { "epoch": 1.32, "grad_norm": 5.78660774230957, "learning_rate": 3.1052110817941954e-05, "loss": 0.3645, "step": 2975 }, { "epoch": 1.34, "grad_norm": 0.6312762498855591, "learning_rate": 3.084597625329815e-05, "loss": 0.5396, "step": 3000 }, { "epoch": 1.35, "grad_norm": 9.60580825805664, "learning_rate": 3.063984168865435e-05, "loss": 0.3903, "step": 3025 }, { "epoch": 1.36, "grad_norm": 8.78200626373291, "learning_rate": 3.043370712401056e-05, "loss": 0.2628, "step": 3050 }, { "epoch": 1.37, "grad_norm": 0.5894768834114075, "learning_rate": 3.0227572559366757e-05, "loss": 0.3829, "step": 3075 }, { "epoch": 1.38, "grad_norm": 4.919884204864502, "learning_rate": 3.0021437994722955e-05, "loss": 0.4657, "step": 3100 }, { "epoch": 1.39, "grad_norm": 4.6643290519714355, "learning_rate": 2.9815303430079157e-05, "loss": 0.4419, "step": 3125 }, { "epoch": 1.4, "grad_norm": 4.99531888961792, "learning_rate": 2.9609168865435355e-05, "loss": 0.6053, "step": 3150 }, { "epoch": 1.41, "grad_norm": 2.8033461570739746, "learning_rate": 2.940303430079156e-05, "loss": 0.3048, "step": 3175 }, { "epoch": 1.42, "grad_norm": 10.75854206085205, "learning_rate": 2.9196899736147758e-05, "loss": 0.3911, "step": 3200 }, { "epoch": 1.44, "grad_norm": 1.392392635345459, "learning_rate": 2.899076517150396e-05, "loss": 0.43, "step": 3225 }, { "epoch": 1.45, "grad_norm": 4.793901443481445, "learning_rate": 2.8784630606860158e-05, "loss": 0.4389, "step": 3250 }, { "epoch": 1.46, "grad_norm": 6.22283935546875, "learning_rate": 2.8578496042216363e-05, "loss": 0.3379, "step": 3275 }, { "epoch": 1.47, "grad_norm": 2.443415403366089, "learning_rate": 2.837236147757256e-05, "loss": 0.4967, "step": 3300 }, { "epoch": 1.48, "grad_norm": 4.5518059730529785, "learning_rate": 2.8166226912928762e-05, "loss": 0.5465, "step": 3325 }, { "epoch": 1.49, "grad_norm": 6.078768253326416, "learning_rate": 2.796009234828496e-05, "loss": 0.4807, "step": 3350 }, { "epoch": 1.5, "grad_norm": 0.6977243423461914, "learning_rate": 2.775395778364116e-05, "loss": 0.5693, "step": 3375 }, { "epoch": 1.51, "grad_norm": 13.189055442810059, "learning_rate": 2.7547823218997364e-05, "loss": 0.2355, "step": 3400 }, { "epoch": 1.52, "grad_norm": 11.238237380981445, "learning_rate": 2.7341688654353565e-05, "loss": 0.5586, "step": 3425 }, { "epoch": 1.54, "grad_norm": 8.910079956054688, "learning_rate": 2.7135554089709763e-05, "loss": 0.4066, "step": 3450 }, { "epoch": 1.55, "grad_norm": 1.7681870460510254, "learning_rate": 2.692941952506596e-05, "loss": 0.5567, "step": 3475 }, { "epoch": 1.56, "grad_norm": 0.8575474619865417, "learning_rate": 2.6723284960422163e-05, "loss": 0.2942, "step": 3500 }, { "epoch": 1.57, "grad_norm": 6.890367031097412, "learning_rate": 2.6517150395778368e-05, "loss": 0.2915, "step": 3525 }, { "epoch": 1.58, "grad_norm": 0.6504009962081909, "learning_rate": 2.6311015831134566e-05, "loss": 0.4993, "step": 3550 }, { "epoch": 1.59, "grad_norm": 5.363488674163818, "learning_rate": 2.6104881266490768e-05, "loss": 0.4317, "step": 3575 }, { "epoch": 1.6, "grad_norm": 0.34819716215133667, "learning_rate": 2.5898746701846966e-05, "loss": 0.3354, "step": 3600 }, { "epoch": 1.61, "grad_norm": 0.5899884104728699, "learning_rate": 2.5692612137203164e-05, "loss": 0.3711, "step": 3625 }, { "epoch": 1.63, "grad_norm": 3.2884409427642822, "learning_rate": 2.548647757255937e-05, "loss": 0.3231, "step": 3650 }, { "epoch": 1.64, "grad_norm": 10.364724159240723, "learning_rate": 2.528034300791557e-05, "loss": 0.5173, "step": 3675 }, { "epoch": 1.65, "grad_norm": 5.121739864349365, "learning_rate": 2.507420844327177e-05, "loss": 0.4939, "step": 3700 }, { "epoch": 1.66, "grad_norm": 6.0736589431762695, "learning_rate": 2.486807387862797e-05, "loss": 0.4922, "step": 3725 }, { "epoch": 1.67, "grad_norm": 11.522198677062988, "learning_rate": 2.466193931398417e-05, "loss": 0.4233, "step": 3750 }, { "epoch": 1.68, "grad_norm": 2.02380633354187, "learning_rate": 2.4455804749340373e-05, "loss": 0.3657, "step": 3775 }, { "epoch": 1.69, "grad_norm": 7.379997730255127, "learning_rate": 2.424967018469657e-05, "loss": 0.4719, "step": 3800 }, { "epoch": 1.7, "grad_norm": 9.087469100952148, "learning_rate": 2.404353562005277e-05, "loss": 0.4912, "step": 3825 }, { "epoch": 1.71, "grad_norm": 6.239768028259277, "learning_rate": 2.383740105540897e-05, "loss": 0.4496, "step": 3850 }, { "epoch": 1.73, "grad_norm": 0.5235075354576111, "learning_rate": 2.3631266490765173e-05, "loss": 0.3338, "step": 3875 }, { "epoch": 1.74, "grad_norm": 4.942290782928467, "learning_rate": 2.3425131926121374e-05, "loss": 0.5681, "step": 3900 }, { "epoch": 1.75, "grad_norm": 0.7366746068000793, "learning_rate": 2.3218997361477572e-05, "loss": 0.4026, "step": 3925 }, { "epoch": 1.76, "grad_norm": 6.261937141418457, "learning_rate": 2.3012862796833774e-05, "loss": 0.3486, "step": 3950 }, { "epoch": 1.77, "grad_norm": 4.27028751373291, "learning_rate": 2.2806728232189976e-05, "loss": 0.2636, "step": 3975 }, { "epoch": 1.78, "grad_norm": 1.5691050291061401, "learning_rate": 2.260883905013193e-05, "loss": 0.4031, "step": 4000 }, { "epoch": 1.79, "grad_norm": 8.969446182250977, "learning_rate": 2.2402704485488127e-05, "loss": 0.3943, "step": 4025 }, { "epoch": 1.8, "grad_norm": 0.4185885190963745, "learning_rate": 2.219656992084433e-05, "loss": 0.4099, "step": 4050 }, { "epoch": 1.81, "grad_norm": 3.5010409355163574, "learning_rate": 2.1990435356200527e-05, "loss": 0.5192, "step": 4075 }, { "epoch": 1.83, "grad_norm": 0.3482280969619751, "learning_rate": 2.1784300791556732e-05, "loss": 0.4143, "step": 4100 }, { "epoch": 1.84, "grad_norm": 0.623389720916748, "learning_rate": 2.157816622691293e-05, "loss": 0.3566, "step": 4125 }, { "epoch": 1.85, "grad_norm": 11.738636016845703, "learning_rate": 2.1372031662269128e-05, "loss": 0.4869, "step": 4150 }, { "epoch": 1.86, "grad_norm": 18.07844352722168, "learning_rate": 2.116589709762533e-05, "loss": 0.2863, "step": 4175 }, { "epoch": 1.87, "grad_norm": 7.016993522644043, "learning_rate": 2.095976253298153e-05, "loss": 0.3461, "step": 4200 }, { "epoch": 1.88, "grad_norm": 0.18062768876552582, "learning_rate": 2.0753627968337733e-05, "loss": 0.2242, "step": 4225 }, { "epoch": 1.89, "grad_norm": 8.72131061553955, "learning_rate": 2.054749340369393e-05, "loss": 0.4769, "step": 4250 }, { "epoch": 1.9, "grad_norm": 0.8265817165374756, "learning_rate": 2.0341358839050133e-05, "loss": 0.4334, "step": 4275 }, { "epoch": 1.91, "grad_norm": 2.490007162094116, "learning_rate": 2.0135224274406334e-05, "loss": 0.457, "step": 4300 }, { "epoch": 1.93, "grad_norm": 0.18633964657783508, "learning_rate": 1.9929089709762532e-05, "loss": 0.4192, "step": 4325 }, { "epoch": 1.94, "grad_norm": 5.816620826721191, "learning_rate": 1.9722955145118734e-05, "loss": 0.3111, "step": 4350 }, { "epoch": 1.95, "grad_norm": 5.391794204711914, "learning_rate": 1.9516820580474935e-05, "loss": 0.276, "step": 4375 }, { "epoch": 1.96, "grad_norm": 0.8628760576248169, "learning_rate": 1.9310686015831137e-05, "loss": 0.3737, "step": 4400 }, { "epoch": 1.97, "grad_norm": 4.97860050201416, "learning_rate": 1.9104551451187335e-05, "loss": 0.376, "step": 4425 }, { "epoch": 1.98, "grad_norm": 4.89571475982666, "learning_rate": 1.8898416886543537e-05, "loss": 0.3772, "step": 4450 }, { "epoch": 1.99, "grad_norm": 0.35662633180618286, "learning_rate": 1.8692282321899738e-05, "loss": 0.4631, "step": 4475 }, { "epoch": 2.0, "eval_accuracy": 0.9203204985533051, "eval_f1_macro": 0.49143994642521266, "eval_f1_micro": 0.9203204985533051, "eval_f1_weighted": 0.9061028561941445, "eval_loss": 0.2875824272632599, "eval_precision_macro": 0.6640335095209342, "eval_precision_micro": 0.9203204985533051, "eval_precision_weighted": 0.9054269421170664, "eval_recall_macro": 0.445379050503802, "eval_recall_micro": 0.9203204985533051, "eval_recall_weighted": 0.9203204985533051, "eval_runtime": 404.3838, "eval_samples_per_second": 11.111, "eval_steps_per_second": 0.695, "step": 4492 }, { "epoch": 2.0, "grad_norm": 10.790550231933594, "learning_rate": 1.8486147757255936e-05, "loss": 0.536, "step": 4500 }, { "epoch": 2.01, "grad_norm": 8.45910930633545, "learning_rate": 1.8280013192612138e-05, "loss": 0.4415, "step": 4525 }, { "epoch": 2.03, "grad_norm": 5.119017124176025, "learning_rate": 1.807387862796834e-05, "loss": 0.3585, "step": 4550 }, { "epoch": 2.04, "grad_norm": 1.2371793985366821, "learning_rate": 1.786774406332454e-05, "loss": 0.3951, "step": 4575 }, { "epoch": 2.05, "grad_norm": 4.3467607498168945, "learning_rate": 1.766160949868074e-05, "loss": 0.4048, "step": 4600 }, { "epoch": 2.06, "grad_norm": 0.7943634986877441, "learning_rate": 1.7455474934036937e-05, "loss": 0.3604, "step": 4625 }, { "epoch": 2.07, "grad_norm": 1.8456642627716064, "learning_rate": 1.7249340369393142e-05, "loss": 0.4308, "step": 4650 }, { "epoch": 2.08, "grad_norm": 15.450132369995117, "learning_rate": 1.704320580474934e-05, "loss": 0.2849, "step": 4675 }, { "epoch": 2.09, "grad_norm": 0.7097306251525879, "learning_rate": 1.6837071240105542e-05, "loss": 0.3189, "step": 4700 }, { "epoch": 2.1, "grad_norm": 0.0466163270175457, "learning_rate": 1.663093667546174e-05, "loss": 0.3747, "step": 4725 }, { "epoch": 2.11, "grad_norm": 17.914644241333008, "learning_rate": 1.6424802110817945e-05, "loss": 0.4635, "step": 4750 }, { "epoch": 2.13, "grad_norm": 5.257259845733643, "learning_rate": 1.6218667546174143e-05, "loss": 0.3882, "step": 4775 }, { "epoch": 2.14, "grad_norm": 0.1565193384885788, "learning_rate": 1.601253298153034e-05, "loss": 0.4073, "step": 4800 }, { "epoch": 2.15, "grad_norm": 13.001235008239746, "learning_rate": 1.5806398416886546e-05, "loss": 0.3174, "step": 4825 }, { "epoch": 2.16, "grad_norm": 11.252735137939453, "learning_rate": 1.5600263852242745e-05, "loss": 0.4794, "step": 4850 }, { "epoch": 2.17, "grad_norm": 8.534846305847168, "learning_rate": 1.5394129287598946e-05, "loss": 0.2951, "step": 4875 }, { "epoch": 2.18, "grad_norm": 4.504175662994385, "learning_rate": 1.5187994722955146e-05, "loss": 0.5374, "step": 4900 }, { "epoch": 2.19, "grad_norm": 4.395377159118652, "learning_rate": 1.4981860158311347e-05, "loss": 0.3697, "step": 4925 }, { "epoch": 2.2, "grad_norm": 11.302129745483398, "learning_rate": 1.4775725593667547e-05, "loss": 0.2116, "step": 4950 }, { "epoch": 2.22, "grad_norm": 0.7363251447677612, "learning_rate": 1.4569591029023747e-05, "loss": 0.327, "step": 4975 }, { "epoch": 2.23, "grad_norm": 4.721381664276123, "learning_rate": 1.4363456464379949e-05, "loss": 0.3867, "step": 5000 }, { "epoch": 2.24, "grad_norm": 2.9688565731048584, "learning_rate": 1.4157321899736149e-05, "loss": 0.4413, "step": 5025 }, { "epoch": 2.25, "grad_norm": 0.40781280398368835, "learning_rate": 1.395118733509235e-05, "loss": 0.3637, "step": 5050 }, { "epoch": 2.26, "grad_norm": 1.3190653324127197, "learning_rate": 1.374505277044855e-05, "loss": 0.3532, "step": 5075 }, { "epoch": 2.27, "grad_norm": 0.6920987963676453, "learning_rate": 1.3538918205804748e-05, "loss": 0.3523, "step": 5100 }, { "epoch": 2.28, "grad_norm": 10.344520568847656, "learning_rate": 1.3332783641160951e-05, "loss": 0.4375, "step": 5125 }, { "epoch": 2.29, "grad_norm": 12.584922790527344, "learning_rate": 1.312664907651715e-05, "loss": 0.4301, "step": 5150 }, { "epoch": 2.3, "grad_norm": 0.4461086094379425, "learning_rate": 1.2920514511873353e-05, "loss": 0.3158, "step": 5175 }, { "epoch": 2.32, "grad_norm": 9.555744171142578, "learning_rate": 1.2714379947229551e-05, "loss": 0.3468, "step": 5200 }, { "epoch": 2.33, "grad_norm": 12.140357971191406, "learning_rate": 1.2508245382585754e-05, "loss": 0.4215, "step": 5225 }, { "epoch": 2.34, "grad_norm": 4.640113830566406, "learning_rate": 1.2302110817941952e-05, "loss": 0.3538, "step": 5250 }, { "epoch": 2.35, "grad_norm": 8.983073234558105, "learning_rate": 1.2095976253298154e-05, "loss": 0.2868, "step": 5275 }, { "epoch": 2.36, "grad_norm": 6.697389125823975, "learning_rate": 1.1889841688654354e-05, "loss": 0.4148, "step": 5300 }, { "epoch": 2.37, "grad_norm": 9.571817398071289, "learning_rate": 1.1683707124010555e-05, "loss": 0.4891, "step": 5325 }, { "epoch": 2.38, "grad_norm": 6.644136905670166, "learning_rate": 1.1477572559366755e-05, "loss": 0.2858, "step": 5350 }, { "epoch": 2.39, "grad_norm": 19.55898666381836, "learning_rate": 1.1271437994722955e-05, "loss": 0.4059, "step": 5375 }, { "epoch": 2.4, "grad_norm": 6.888569355010986, "learning_rate": 1.1065303430079157e-05, "loss": 0.3415, "step": 5400 }, { "epoch": 2.42, "grad_norm": 7.856134414672852, "learning_rate": 1.0859168865435356e-05, "loss": 0.3327, "step": 5425 }, { "epoch": 2.43, "grad_norm": 15.82084846496582, "learning_rate": 1.0653034300791558e-05, "loss": 0.4321, "step": 5450 }, { "epoch": 2.44, "grad_norm": 14.98440170288086, "learning_rate": 1.0446899736147758e-05, "loss": 0.4672, "step": 5475 }, { "epoch": 2.45, "grad_norm": 0.872367799282074, "learning_rate": 1.0240765171503958e-05, "loss": 0.2711, "step": 5500 }, { "epoch": 2.46, "grad_norm": 5.635341167449951, "learning_rate": 1.0034630606860158e-05, "loss": 0.3636, "step": 5525 }, { "epoch": 2.47, "grad_norm": 12.989480972290039, "learning_rate": 9.82849604221636e-06, "loss": 0.3524, "step": 5550 }, { "epoch": 2.48, "grad_norm": 16.43426513671875, "learning_rate": 9.622361477572559e-06, "loss": 0.2381, "step": 5575 }, { "epoch": 2.49, "grad_norm": 1.0547945499420166, "learning_rate": 9.41622691292876e-06, "loss": 0.3293, "step": 5600 }, { "epoch": 2.5, "grad_norm": 13.630729675292969, "learning_rate": 9.210092348284962e-06, "loss": 0.3658, "step": 5625 }, { "epoch": 2.52, "grad_norm": 12.972505569458008, "learning_rate": 9.003957783641162e-06, "loss": 0.2829, "step": 5650 }, { "epoch": 2.53, "grad_norm": 10.048601150512695, "learning_rate": 8.797823218997362e-06, "loss": 0.3867, "step": 5675 }, { "epoch": 2.54, "grad_norm": 1.120229721069336, "learning_rate": 8.591688654353562e-06, "loss": 0.4166, "step": 5700 }, { "epoch": 2.55, "grad_norm": 0.9482748508453369, "learning_rate": 8.385554089709763e-06, "loss": 0.4281, "step": 5725 }, { "epoch": 2.56, "grad_norm": 0.29686295986175537, "learning_rate": 8.179419525065963e-06, "loss": 0.3187, "step": 5750 }, { "epoch": 2.57, "grad_norm": 9.385336875915527, "learning_rate": 7.973284960422165e-06, "loss": 0.3553, "step": 5775 }, { "epoch": 2.58, "grad_norm": 5.910414695739746, "learning_rate": 7.767150395778365e-06, "loss": 0.2821, "step": 5800 }, { "epoch": 2.59, "grad_norm": 5.932247161865234, "learning_rate": 7.561015831134564e-06, "loss": 0.2706, "step": 5825 }, { "epoch": 2.6, "grad_norm": 1.826149821281433, "learning_rate": 7.354881266490765e-06, "loss": 0.3742, "step": 5850 }, { "epoch": 2.62, "grad_norm": 9.548162460327148, "learning_rate": 7.148746701846966e-06, "loss": 0.3124, "step": 5875 }, { "epoch": 2.63, "grad_norm": 10.59200668334961, "learning_rate": 6.9426121372031665e-06, "loss": 0.2541, "step": 5900 }, { "epoch": 2.64, "grad_norm": 6.801640033721924, "learning_rate": 6.736477572559367e-06, "loss": 0.336, "step": 5925 }, { "epoch": 2.65, "grad_norm": 6.312964916229248, "learning_rate": 6.530343007915568e-06, "loss": 0.5251, "step": 5950 }, { "epoch": 2.66, "grad_norm": 10.121294975280762, "learning_rate": 6.324208443271768e-06, "loss": 0.3999, "step": 5975 }, { "epoch": 2.67, "grad_norm": 11.066811561584473, "learning_rate": 6.1180738786279684e-06, "loss": 0.3101, "step": 6000 }, { "epoch": 2.68, "grad_norm": 0.14530642330646515, "learning_rate": 5.911939313984169e-06, "loss": 0.2483, "step": 6025 }, { "epoch": 2.69, "grad_norm": 8.127425193786621, "learning_rate": 5.70580474934037e-06, "loss": 0.2684, "step": 6050 }, { "epoch": 2.7, "grad_norm": 4.671697616577148, "learning_rate": 5.4996701846965706e-06, "loss": 0.3339, "step": 6075 }, { "epoch": 2.72, "grad_norm": 7.663967609405518, "learning_rate": 5.29353562005277e-06, "loss": 0.2823, "step": 6100 }, { "epoch": 2.73, "grad_norm": 4.24953556060791, "learning_rate": 5.087401055408971e-06, "loss": 0.2481, "step": 6125 }, { "epoch": 2.74, "grad_norm": 6.942299842834473, "learning_rate": 4.881266490765172e-06, "loss": 0.3467, "step": 6150 }, { "epoch": 2.75, "grad_norm": 2.23897123336792, "learning_rate": 4.6751319261213725e-06, "loss": 0.4663, "step": 6175 }, { "epoch": 2.76, "grad_norm": 21.0181827545166, "learning_rate": 4.468997361477572e-06, "loss": 0.2012, "step": 6200 }, { "epoch": 2.77, "grad_norm": 6.582679748535156, "learning_rate": 4.262862796833773e-06, "loss": 0.3391, "step": 6225 }, { "epoch": 2.78, "grad_norm": 0.8921090960502625, "learning_rate": 4.056728232189974e-06, "loss": 0.2925, "step": 6250 }, { "epoch": 2.79, "grad_norm": 0.729013204574585, "learning_rate": 3.8505936675461745e-06, "loss": 0.2918, "step": 6275 }, { "epoch": 2.8, "grad_norm": 0.47634056210517883, "learning_rate": 3.6527044854881267e-06, "loss": 0.3309, "step": 6300 }, { "epoch": 2.82, "grad_norm": 0.7455437183380127, "learning_rate": 3.4465699208443274e-06, "loss": 0.2839, "step": 6325 }, { "epoch": 2.83, "grad_norm": 17.21619415283203, "learning_rate": 3.240435356200528e-06, "loss": 0.3023, "step": 6350 }, { "epoch": 2.84, "grad_norm": 0.5437944531440735, "learning_rate": 3.0343007915567284e-06, "loss": 0.2888, "step": 6375 }, { "epoch": 2.85, "grad_norm": 4.063761234283447, "learning_rate": 2.8281662269129287e-06, "loss": 0.4373, "step": 6400 }, { "epoch": 2.86, "grad_norm": 6.74634313583374, "learning_rate": 2.6220316622691294e-06, "loss": 0.4396, "step": 6425 }, { "epoch": 2.87, "grad_norm": 3.216498613357544, "learning_rate": 2.41589709762533e-06, "loss": 0.3116, "step": 6450 }, { "epoch": 2.88, "grad_norm": 1.6705697774887085, "learning_rate": 2.2097625329815304e-06, "loss": 0.3643, "step": 6475 }, { "epoch": 2.89, "grad_norm": 4.8092851638793945, "learning_rate": 2.003627968337731e-06, "loss": 0.2643, "step": 6500 }, { "epoch": 2.91, "grad_norm": 16.015827178955078, "learning_rate": 1.7974934036939316e-06, "loss": 0.4048, "step": 6525 }, { "epoch": 2.92, "grad_norm": 2.7783210277557373, "learning_rate": 1.5913588390501319e-06, "loss": 0.3777, "step": 6550 }, { "epoch": 2.93, "grad_norm": 0.9853120446205139, "learning_rate": 1.3852242744063324e-06, "loss": 0.2335, "step": 6575 }, { "epoch": 2.94, "grad_norm": 0.24746793508529663, "learning_rate": 1.179089709762533e-06, "loss": 0.1812, "step": 6600 }, { "epoch": 2.95, "grad_norm": 2.739319324493408, "learning_rate": 9.729551451187335e-07, "loss": 0.2132, "step": 6625 }, { "epoch": 2.96, "grad_norm": 0.38551005721092224, "learning_rate": 7.66820580474934e-07, "loss": 0.2686, "step": 6650 }, { "epoch": 2.97, "grad_norm": 6.976538181304932, "learning_rate": 5.606860158311346e-07, "loss": 0.507, "step": 6675 }, { "epoch": 2.98, "grad_norm": 6.835049152374268, "learning_rate": 3.5455145118733513e-07, "loss": 0.4169, "step": 6700 }, { "epoch": 2.99, "grad_norm": 0.45143523812294006, "learning_rate": 1.4841688654353562e-07, "loss": 0.3045, "step": 6725 }, { "epoch": 3.0, "eval_accuracy": 0.9278878255063432, "eval_f1_macro": 0.5509939923795275, "eval_f1_micro": 0.9278878255063432, "eval_f1_weighted": 0.915533252030031, "eval_loss": 0.2612117528915405, "eval_precision_macro": 0.7507175360173887, "eval_precision_micro": 0.9278878255063432, "eval_precision_weighted": 0.9184859185112592, "eval_recall_macro": 0.4872137731200702, "eval_recall_micro": 0.9278878255063432, "eval_recall_weighted": 0.9278878255063432, "eval_runtime": 408.2523, "eval_samples_per_second": 11.005, "eval_steps_per_second": 0.688, "step": 6738 } ], "logging_steps": 25, "max_steps": 6738, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 4.1760701843670835e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }