{ "best_metric": 0.5471687316894531, "best_model_checkpoint": "../experiments_checkpoints/LoRA/mistralai/Mistral_7B_v0.1_LoRA_coastalcph/lex_glue_ledgar_2/checkpoint-1750", "epoch": 3.0, "eval_steps": 250, "global_step": 2814, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 184.91021728515625, "learning_rate": 4.98223169864961e-06, "loss": 14.5062, "step": 10 }, { "epoch": 0.02, "grad_norm": 135.1532440185547, "learning_rate": 4.964463397299218e-06, "loss": 10.8531, "step": 20 }, { "epoch": 0.03, "grad_norm": 127.01141357421875, "learning_rate": 4.946695095948828e-06, "loss": 8.4406, "step": 30 }, { "epoch": 0.04, "grad_norm": 94.51043701171875, "learning_rate": 4.928926794598437e-06, "loss": 6.9266, "step": 40 }, { "epoch": 0.05, "grad_norm": 83.99446868896484, "learning_rate": 4.911158493248046e-06, "loss": 5.118, "step": 50 }, { "epoch": 0.06, "grad_norm": 70.8001937866211, "learning_rate": 4.893390191897655e-06, "loss": 4.2539, "step": 60 }, { "epoch": 0.07, "grad_norm": 62.497379302978516, "learning_rate": 4.875621890547264e-06, "loss": 3.8484, "step": 70 }, { "epoch": 0.09, "grad_norm": 66.69869995117188, "learning_rate": 4.857853589196873e-06, "loss": 3.3156, "step": 80 }, { "epoch": 0.1, "grad_norm": 63.162601470947266, "learning_rate": 4.8400852878464825e-06, "loss": 2.9258, "step": 90 }, { "epoch": 0.11, "grad_norm": 73.47547149658203, "learning_rate": 4.822316986496091e-06, "loss": 2.4395, "step": 100 }, { "epoch": 0.12, "grad_norm": 78.7245864868164, "learning_rate": 4.8045486851457005e-06, "loss": 2.0648, "step": 110 }, { "epoch": 0.13, "grad_norm": 60.332908630371094, "learning_rate": 4.78678038379531e-06, "loss": 1.6922, "step": 120 }, { "epoch": 0.14, "grad_norm": 77.41142272949219, "learning_rate": 4.7690120824449184e-06, "loss": 1.6127, "step": 130 }, { "epoch": 0.15, "grad_norm": 49.3652458190918, "learning_rate": 4.751243781094528e-06, "loss": 1.3486, "step": 140 }, { "epoch": 0.16, "grad_norm": 69.75550079345703, "learning_rate": 4.733475479744136e-06, "loss": 1.3898, "step": 150 }, { "epoch": 0.17, "grad_norm": 53.679718017578125, "learning_rate": 4.715707178393746e-06, "loss": 1.0984, "step": 160 }, { "epoch": 0.18, "grad_norm": 62.76721954345703, "learning_rate": 4.697938877043355e-06, "loss": 1.2896, "step": 170 }, { "epoch": 0.19, "grad_norm": 64.14358520507812, "learning_rate": 4.680170575692965e-06, "loss": 1.1088, "step": 180 }, { "epoch": 0.2, "grad_norm": 44.99474334716797, "learning_rate": 4.662402274342573e-06, "loss": 0.9713, "step": 190 }, { "epoch": 0.21, "grad_norm": 41.41924285888672, "learning_rate": 4.644633972992183e-06, "loss": 0.9322, "step": 200 }, { "epoch": 0.22, "grad_norm": 42.26039505004883, "learning_rate": 4.626865671641791e-06, "loss": 1.0075, "step": 210 }, { "epoch": 0.23, "grad_norm": 43.47175598144531, "learning_rate": 4.6090973702914006e-06, "loss": 0.7709, "step": 220 }, { "epoch": 0.25, "grad_norm": 42.199302673339844, "learning_rate": 4.591329068941009e-06, "loss": 0.9961, "step": 230 }, { "epoch": 0.26, "grad_norm": 40.571651458740234, "learning_rate": 4.5735607675906185e-06, "loss": 1.0277, "step": 240 }, { "epoch": 0.27, "grad_norm": 42.54713821411133, "learning_rate": 4.555792466240228e-06, "loss": 0.8576, "step": 250 }, { "epoch": 0.27, "eval_accuracy": 0.7704, "eval_f1_macro": 0.6393290151721265, "eval_f1_micro": 0.7704, "eval_loss": 0.9223937392234802, "eval_runtime": 122.4438, "eval_samples_per_second": 81.67, "eval_steps_per_second": 1.282, "step": 250 }, { "epoch": 0.28, "grad_norm": 43.578731536865234, "learning_rate": 4.538024164889837e-06, "loss": 0.8196, "step": 260 }, { "epoch": 0.29, "grad_norm": 47.72262191772461, "learning_rate": 4.520255863539446e-06, "loss": 0.9203, "step": 270 }, { "epoch": 0.3, "grad_norm": 43.31187438964844, "learning_rate": 4.502487562189055e-06, "loss": 0.8279, "step": 280 }, { "epoch": 0.31, "grad_norm": 35.86613845825195, "learning_rate": 4.484719260838664e-06, "loss": 0.8604, "step": 290 }, { "epoch": 0.32, "grad_norm": 41.125911712646484, "learning_rate": 4.466950959488273e-06, "loss": 0.8001, "step": 300 }, { "epoch": 0.33, "grad_norm": 34.05392837524414, "learning_rate": 4.449182658137882e-06, "loss": 0.8045, "step": 310 }, { "epoch": 0.34, "grad_norm": 36.995079040527344, "learning_rate": 4.431414356787491e-06, "loss": 0.8603, "step": 320 }, { "epoch": 0.35, "grad_norm": 35.64155578613281, "learning_rate": 4.413646055437101e-06, "loss": 0.8364, "step": 330 }, { "epoch": 0.36, "grad_norm": 31.89205551147461, "learning_rate": 4.39587775408671e-06, "loss": 0.7354, "step": 340 }, { "epoch": 0.37, "grad_norm": 38.92559814453125, "learning_rate": 4.378109452736319e-06, "loss": 0.9146, "step": 350 }, { "epoch": 0.38, "grad_norm": 50.537532806396484, "learning_rate": 4.360341151385928e-06, "loss": 0.7017, "step": 360 }, { "epoch": 0.39, "grad_norm": 42.99150466918945, "learning_rate": 4.3425728500355365e-06, "loss": 0.7124, "step": 370 }, { "epoch": 0.41, "grad_norm": 38.320091247558594, "learning_rate": 4.324804548685146e-06, "loss": 0.7801, "step": 380 }, { "epoch": 0.42, "grad_norm": 36.334617614746094, "learning_rate": 4.3070362473347545e-06, "loss": 0.7673, "step": 390 }, { "epoch": 0.43, "grad_norm": 36.68587112426758, "learning_rate": 4.289267945984365e-06, "loss": 0.72, "step": 400 }, { "epoch": 0.44, "grad_norm": 42.27124786376953, "learning_rate": 4.271499644633973e-06, "loss": 0.8178, "step": 410 }, { "epoch": 0.45, "grad_norm": 35.67120361328125, "learning_rate": 4.253731343283583e-06, "loss": 0.7879, "step": 420 }, { "epoch": 0.46, "grad_norm": 39.28106689453125, "learning_rate": 4.235963041933191e-06, "loss": 0.8659, "step": 430 }, { "epoch": 0.47, "grad_norm": 27.943252563476562, "learning_rate": 4.218194740582801e-06, "loss": 0.7978, "step": 440 }, { "epoch": 0.48, "grad_norm": 37.121009826660156, "learning_rate": 4.200426439232409e-06, "loss": 0.8314, "step": 450 }, { "epoch": 0.49, "grad_norm": 35.44685363769531, "learning_rate": 4.182658137882019e-06, "loss": 0.8476, "step": 460 }, { "epoch": 0.5, "grad_norm": 41.0649299621582, "learning_rate": 4.164889836531628e-06, "loss": 0.7087, "step": 470 }, { "epoch": 0.51, "grad_norm": 32.82563781738281, "learning_rate": 4.1471215351812375e-06, "loss": 0.7729, "step": 480 }, { "epoch": 0.52, "grad_norm": 33.89826965332031, "learning_rate": 4.129353233830846e-06, "loss": 0.7834, "step": 490 }, { "epoch": 0.53, "grad_norm": 40.666786193847656, "learning_rate": 4.1115849324804554e-06, "loss": 0.7735, "step": 500 }, { "epoch": 0.53, "eval_accuracy": 0.806, "eval_f1_macro": 0.6941190873652304, "eval_f1_micro": 0.806, "eval_loss": 0.7367187738418579, "eval_runtime": 122.8094, "eval_samples_per_second": 81.427, "eval_steps_per_second": 1.278, "step": 500 }, { "epoch": 0.54, "grad_norm": 31.39685821533203, "learning_rate": 4.093816631130064e-06, "loss": 0.7248, "step": 510 }, { "epoch": 0.55, "grad_norm": 41.70730972290039, "learning_rate": 4.076048329779673e-06, "loss": 0.7138, "step": 520 }, { "epoch": 0.57, "grad_norm": 36.442710876464844, "learning_rate": 4.058280028429282e-06, "loss": 0.6738, "step": 530 }, { "epoch": 0.58, "grad_norm": 29.00288963317871, "learning_rate": 4.040511727078892e-06, "loss": 0.6751, "step": 540 }, { "epoch": 0.59, "grad_norm": 26.012950897216797, "learning_rate": 4.022743425728501e-06, "loss": 0.6725, "step": 550 }, { "epoch": 0.6, "grad_norm": 46.581459045410156, "learning_rate": 4.00497512437811e-06, "loss": 0.6228, "step": 560 }, { "epoch": 0.61, "grad_norm": 29.79856300354004, "learning_rate": 3.987206823027719e-06, "loss": 0.6897, "step": 570 }, { "epoch": 0.62, "grad_norm": 30.918678283691406, "learning_rate": 3.969438521677328e-06, "loss": 0.616, "step": 580 }, { "epoch": 0.63, "grad_norm": 36.94145202636719, "learning_rate": 3.951670220326937e-06, "loss": 0.8165, "step": 590 }, { "epoch": 0.64, "grad_norm": 33.49089050292969, "learning_rate": 3.933901918976546e-06, "loss": 0.6647, "step": 600 }, { "epoch": 0.65, "grad_norm": 28.986488342285156, "learning_rate": 3.9161336176261555e-06, "loss": 0.6968, "step": 610 }, { "epoch": 0.66, "grad_norm": 39.39780807495117, "learning_rate": 3.898365316275765e-06, "loss": 0.6555, "step": 620 }, { "epoch": 0.67, "grad_norm": 28.334272384643555, "learning_rate": 3.8805970149253735e-06, "loss": 0.7416, "step": 630 }, { "epoch": 0.68, "grad_norm": 37.35606002807617, "learning_rate": 3.862828713574983e-06, "loss": 0.6981, "step": 640 }, { "epoch": 0.69, "grad_norm": 32.14896011352539, "learning_rate": 3.8450604122245914e-06, "loss": 0.6259, "step": 650 }, { "epoch": 0.7, "grad_norm": 30.312746047973633, "learning_rate": 3.827292110874201e-06, "loss": 0.6883, "step": 660 }, { "epoch": 0.71, "grad_norm": 48.4543342590332, "learning_rate": 3.80952380952381e-06, "loss": 0.6789, "step": 670 }, { "epoch": 0.72, "grad_norm": 27.0550479888916, "learning_rate": 3.7917555081734192e-06, "loss": 0.6157, "step": 680 }, { "epoch": 0.74, "grad_norm": 28.200292587280273, "learning_rate": 3.773987206823028e-06, "loss": 0.6625, "step": 690 }, { "epoch": 0.75, "grad_norm": 30.76053810119629, "learning_rate": 3.756218905472637e-06, "loss": 0.647, "step": 700 }, { "epoch": 0.76, "grad_norm": 35.41666030883789, "learning_rate": 3.738450604122246e-06, "loss": 0.6247, "step": 710 }, { "epoch": 0.77, "grad_norm": 35.479347229003906, "learning_rate": 3.720682302771855e-06, "loss": 0.6052, "step": 720 }, { "epoch": 0.78, "grad_norm": 31.4807071685791, "learning_rate": 3.702914001421464e-06, "loss": 0.6712, "step": 730 }, { "epoch": 0.79, "grad_norm": 33.04950714111328, "learning_rate": 3.685145700071073e-06, "loss": 0.6652, "step": 740 }, { "epoch": 0.8, "grad_norm": 35.26613235473633, "learning_rate": 3.667377398720683e-06, "loss": 0.7498, "step": 750 }, { "epoch": 0.8, "eval_accuracy": 0.8211, "eval_f1_macro": 0.7186967362129784, "eval_f1_micro": 0.8211, "eval_loss": 0.6500375270843506, "eval_runtime": 122.9945, "eval_samples_per_second": 81.304, "eval_steps_per_second": 1.276, "step": 750 }, { "epoch": 0.81, "grad_norm": 30.194623947143555, "learning_rate": 3.649609097370292e-06, "loss": 0.7143, "step": 760 }, { "epoch": 0.82, "grad_norm": 27.68548583984375, "learning_rate": 3.631840796019901e-06, "loss": 0.6216, "step": 770 }, { "epoch": 0.83, "grad_norm": 32.01395034790039, "learning_rate": 3.61407249466951e-06, "loss": 0.7295, "step": 780 }, { "epoch": 0.84, "grad_norm": 28.13996696472168, "learning_rate": 3.596304193319119e-06, "loss": 0.6409, "step": 790 }, { "epoch": 0.85, "grad_norm": 27.557226181030273, "learning_rate": 3.578535891968728e-06, "loss": 0.6091, "step": 800 }, { "epoch": 0.86, "grad_norm": 30.858646392822266, "learning_rate": 3.560767590618337e-06, "loss": 0.595, "step": 810 }, { "epoch": 0.87, "grad_norm": 33.09157943725586, "learning_rate": 3.542999289267946e-06, "loss": 0.6691, "step": 820 }, { "epoch": 0.88, "grad_norm": 30.814367294311523, "learning_rate": 3.5252309879175556e-06, "loss": 0.6816, "step": 830 }, { "epoch": 0.9, "grad_norm": 39.046241760253906, "learning_rate": 3.5074626865671646e-06, "loss": 0.6195, "step": 840 }, { "epoch": 0.91, "grad_norm": 25.62079620361328, "learning_rate": 3.4896943852167736e-06, "loss": 0.5747, "step": 850 }, { "epoch": 0.92, "grad_norm": 36.18880081176758, "learning_rate": 3.4719260838663826e-06, "loss": 0.6543, "step": 860 }, { "epoch": 0.93, "grad_norm": 31.487077713012695, "learning_rate": 3.4541577825159916e-06, "loss": 0.6274, "step": 870 }, { "epoch": 0.94, "grad_norm": 36.93439865112305, "learning_rate": 3.4363894811656006e-06, "loss": 0.5673, "step": 880 }, { "epoch": 0.95, "grad_norm": 28.636613845825195, "learning_rate": 3.4186211798152095e-06, "loss": 0.5821, "step": 890 }, { "epoch": 0.96, "grad_norm": 34.320106506347656, "learning_rate": 3.4008528784648194e-06, "loss": 0.5812, "step": 900 }, { "epoch": 0.97, "grad_norm": 23.637840270996094, "learning_rate": 3.3830845771144283e-06, "loss": 0.6283, "step": 910 }, { "epoch": 0.98, "grad_norm": 44.09649658203125, "learning_rate": 3.3653162757640373e-06, "loss": 0.5835, "step": 920 }, { "epoch": 0.99, "grad_norm": 31.48293113708496, "learning_rate": 3.3475479744136463e-06, "loss": 0.5355, "step": 930 }, { "epoch": 1.0, "grad_norm": 29.28251075744629, "learning_rate": 3.3297796730632553e-06, "loss": 0.6126, "step": 940 }, { "epoch": 1.01, "grad_norm": 31.779373168945312, "learning_rate": 3.3120113717128643e-06, "loss": 0.4801, "step": 950 }, { "epoch": 1.02, "grad_norm": 27.87462043762207, "learning_rate": 3.2942430703624733e-06, "loss": 0.5277, "step": 960 }, { "epoch": 1.03, "grad_norm": 29.835203170776367, "learning_rate": 3.276474769012083e-06, "loss": 0.3841, "step": 970 }, { "epoch": 1.04, "grad_norm": 26.056089401245117, "learning_rate": 3.258706467661692e-06, "loss": 0.435, "step": 980 }, { "epoch": 1.06, "grad_norm": 28.252172470092773, "learning_rate": 3.240938166311301e-06, "loss": 0.4596, "step": 990 }, { "epoch": 1.07, "grad_norm": 22.689332962036133, "learning_rate": 3.22316986496091e-06, "loss": 0.4705, "step": 1000 }, { "epoch": 1.07, "eval_accuracy": 0.8341, "eval_f1_macro": 0.748374559334373, "eval_f1_micro": 0.8341, "eval_loss": 0.6080421805381775, "eval_runtime": 123.272, "eval_samples_per_second": 81.121, "eval_steps_per_second": 1.274, "step": 1000 }, { "epoch": 1.08, "grad_norm": 37.917781829833984, "learning_rate": 3.205401563610519e-06, "loss": 0.5036, "step": 1010 }, { "epoch": 1.09, "grad_norm": 32.30621337890625, "learning_rate": 3.187633262260128e-06, "loss": 0.4905, "step": 1020 }, { "epoch": 1.1, "grad_norm": 29.364919662475586, "learning_rate": 3.169864960909737e-06, "loss": 0.5097, "step": 1030 }, { "epoch": 1.11, "grad_norm": 26.60013771057129, "learning_rate": 3.152096659559347e-06, "loss": 0.4585, "step": 1040 }, { "epoch": 1.12, "grad_norm": 39.39649963378906, "learning_rate": 3.1343283582089558e-06, "loss": 0.5252, "step": 1050 }, { "epoch": 1.13, "grad_norm": 33.575172424316406, "learning_rate": 3.1165600568585648e-06, "loss": 0.4577, "step": 1060 }, { "epoch": 1.14, "grad_norm": 25.627819061279297, "learning_rate": 3.0987917555081737e-06, "loss": 0.4131, "step": 1070 }, { "epoch": 1.15, "grad_norm": 32.13376998901367, "learning_rate": 3.0810234541577827e-06, "loss": 0.465, "step": 1080 }, { "epoch": 1.16, "grad_norm": 38.22120666503906, "learning_rate": 3.0632551528073917e-06, "loss": 0.4414, "step": 1090 }, { "epoch": 1.17, "grad_norm": 34.123008728027344, "learning_rate": 3.0454868514570007e-06, "loss": 0.4426, "step": 1100 }, { "epoch": 1.18, "grad_norm": 33.672584533691406, "learning_rate": 3.0277185501066105e-06, "loss": 0.5089, "step": 1110 }, { "epoch": 1.19, "grad_norm": 24.971105575561523, "learning_rate": 3.0099502487562195e-06, "loss": 0.4054, "step": 1120 }, { "epoch": 1.2, "grad_norm": 26.803415298461914, "learning_rate": 2.9921819474058285e-06, "loss": 0.4724, "step": 1130 }, { "epoch": 1.22, "grad_norm": 24.77806854248047, "learning_rate": 2.9744136460554375e-06, "loss": 0.4646, "step": 1140 }, { "epoch": 1.23, "grad_norm": 35.862186431884766, "learning_rate": 2.9566453447050464e-06, "loss": 0.4406, "step": 1150 }, { "epoch": 1.24, "grad_norm": 35.596832275390625, "learning_rate": 2.9388770433546554e-06, "loss": 0.4725, "step": 1160 }, { "epoch": 1.25, "grad_norm": 26.640377044677734, "learning_rate": 2.9211087420042644e-06, "loss": 0.4631, "step": 1170 }, { "epoch": 1.26, "grad_norm": 34.9013786315918, "learning_rate": 2.903340440653874e-06, "loss": 0.4376, "step": 1180 }, { "epoch": 1.27, "grad_norm": 32.519752502441406, "learning_rate": 2.885572139303483e-06, "loss": 0.508, "step": 1190 }, { "epoch": 1.28, "grad_norm": 31.03801918029785, "learning_rate": 2.867803837953092e-06, "loss": 0.4786, "step": 1200 }, { "epoch": 1.29, "grad_norm": 32.7837028503418, "learning_rate": 2.850035536602701e-06, "loss": 0.5303, "step": 1210 }, { "epoch": 1.3, "grad_norm": 37.83240509033203, "learning_rate": 2.83226723525231e-06, "loss": 0.4639, "step": 1220 }, { "epoch": 1.31, "grad_norm": 32.98924255371094, "learning_rate": 2.814498933901919e-06, "loss": 0.4683, "step": 1230 }, { "epoch": 1.32, "grad_norm": 34.40327835083008, "learning_rate": 2.796730632551528e-06, "loss": 0.4534, "step": 1240 }, { "epoch": 1.33, "grad_norm": 35.260684967041016, "learning_rate": 2.7789623312011375e-06, "loss": 0.4717, "step": 1250 }, { "epoch": 1.33, "eval_accuracy": 0.8364, "eval_f1_macro": 0.7470036323468886, "eval_f1_micro": 0.8364, "eval_loss": 0.6026765704154968, "eval_runtime": 123.6209, "eval_samples_per_second": 80.892, "eval_steps_per_second": 1.27, "step": 1250 }, { "epoch": 1.34, "grad_norm": 23.839731216430664, "learning_rate": 2.7611940298507465e-06, "loss": 0.5112, "step": 1260 }, { "epoch": 1.35, "grad_norm": 22.432899475097656, "learning_rate": 2.7434257285003555e-06, "loss": 0.4406, "step": 1270 }, { "epoch": 1.36, "grad_norm": 31.171859741210938, "learning_rate": 2.725657427149965e-06, "loss": 0.4376, "step": 1280 }, { "epoch": 1.38, "grad_norm": 31.905155181884766, "learning_rate": 2.707889125799574e-06, "loss": 0.462, "step": 1290 }, { "epoch": 1.39, "grad_norm": 22.738901138305664, "learning_rate": 2.690120824449183e-06, "loss": 0.4725, "step": 1300 }, { "epoch": 1.4, "grad_norm": 44.007511138916016, "learning_rate": 2.672352523098792e-06, "loss": 0.5446, "step": 1310 }, { "epoch": 1.41, "grad_norm": 31.273822784423828, "learning_rate": 2.654584221748401e-06, "loss": 0.4514, "step": 1320 }, { "epoch": 1.42, "grad_norm": 26.88573455810547, "learning_rate": 2.6368159203980102e-06, "loss": 0.4686, "step": 1330 }, { "epoch": 1.43, "grad_norm": 30.358346939086914, "learning_rate": 2.6190476190476192e-06, "loss": 0.38, "step": 1340 }, { "epoch": 1.44, "grad_norm": 35.21214294433594, "learning_rate": 2.601279317697228e-06, "loss": 0.4771, "step": 1350 }, { "epoch": 1.45, "grad_norm": 29.396312713623047, "learning_rate": 2.5835110163468376e-06, "loss": 0.4399, "step": 1360 }, { "epoch": 1.46, "grad_norm": 24.683927536010742, "learning_rate": 2.5657427149964466e-06, "loss": 0.4132, "step": 1370 }, { "epoch": 1.47, "grad_norm": 27.939329147338867, "learning_rate": 2.5479744136460556e-06, "loss": 0.4197, "step": 1380 }, { "epoch": 1.48, "grad_norm": 25.23176383972168, "learning_rate": 2.5302061122956646e-06, "loss": 0.5097, "step": 1390 }, { "epoch": 1.49, "grad_norm": 28.161165237426758, "learning_rate": 2.512437810945274e-06, "loss": 0.4091, "step": 1400 }, { "epoch": 1.5, "grad_norm": 21.97450065612793, "learning_rate": 2.494669509594883e-06, "loss": 0.3651, "step": 1410 }, { "epoch": 1.51, "grad_norm": 36.418678283691406, "learning_rate": 2.476901208244492e-06, "loss": 0.4551, "step": 1420 }, { "epoch": 1.52, "grad_norm": 32.430763244628906, "learning_rate": 2.459132906894101e-06, "loss": 0.4634, "step": 1430 }, { "epoch": 1.54, "grad_norm": 29.694974899291992, "learning_rate": 2.4413646055437103e-06, "loss": 0.4075, "step": 1440 }, { "epoch": 1.55, "grad_norm": 31.458261489868164, "learning_rate": 2.4235963041933193e-06, "loss": 0.3728, "step": 1450 }, { "epoch": 1.56, "grad_norm": 37.986698150634766, "learning_rate": 2.4058280028429283e-06, "loss": 0.4159, "step": 1460 }, { "epoch": 1.57, "grad_norm": 31.07583236694336, "learning_rate": 2.3880597014925373e-06, "loss": 0.4144, "step": 1470 }, { "epoch": 1.58, "grad_norm": 26.518775939941406, "learning_rate": 2.3702914001421467e-06, "loss": 0.4801, "step": 1480 }, { "epoch": 1.59, "grad_norm": 28.71346092224121, "learning_rate": 2.3525230987917556e-06, "loss": 0.4013, "step": 1490 }, { "epoch": 1.6, "grad_norm": 24.5445613861084, "learning_rate": 2.3347547974413646e-06, "loss": 0.4793, "step": 1500 }, { "epoch": 1.6, "eval_accuracy": 0.8418, "eval_f1_macro": 0.753668406230964, "eval_f1_micro": 0.8418, "eval_loss": 0.5638077855110168, "eval_runtime": 123.8628, "eval_samples_per_second": 80.734, "eval_steps_per_second": 1.268, "step": 1500 }, { "epoch": 1.61, "grad_norm": 32.141387939453125, "learning_rate": 2.3169864960909736e-06, "loss": 0.4447, "step": 1510 }, { "epoch": 1.62, "grad_norm": 20.818496704101562, "learning_rate": 2.299218194740583e-06, "loss": 0.3934, "step": 1520 }, { "epoch": 1.63, "grad_norm": 32.60322189331055, "learning_rate": 2.281449893390192e-06, "loss": 0.4603, "step": 1530 }, { "epoch": 1.64, "grad_norm": 37.379878997802734, "learning_rate": 2.263681592039801e-06, "loss": 0.4765, "step": 1540 }, { "epoch": 1.65, "grad_norm": 28.236251831054688, "learning_rate": 2.24591329068941e-06, "loss": 0.4381, "step": 1550 }, { "epoch": 1.66, "grad_norm": 34.84467315673828, "learning_rate": 2.2281449893390194e-06, "loss": 0.4811, "step": 1560 }, { "epoch": 1.67, "grad_norm": 19.580827713012695, "learning_rate": 2.2103766879886283e-06, "loss": 0.4381, "step": 1570 }, { "epoch": 1.68, "grad_norm": 34.670841217041016, "learning_rate": 2.1926083866382373e-06, "loss": 0.4011, "step": 1580 }, { "epoch": 1.7, "grad_norm": 27.111051559448242, "learning_rate": 2.1748400852878467e-06, "loss": 0.4097, "step": 1590 }, { "epoch": 1.71, "grad_norm": 32.83036804199219, "learning_rate": 2.1570717839374557e-06, "loss": 0.4505, "step": 1600 }, { "epoch": 1.72, "grad_norm": 24.941875457763672, "learning_rate": 2.1393034825870647e-06, "loss": 0.35, "step": 1610 }, { "epoch": 1.73, "grad_norm": 26.671207427978516, "learning_rate": 2.1215351812366737e-06, "loss": 0.4436, "step": 1620 }, { "epoch": 1.74, "grad_norm": 41.64507293701172, "learning_rate": 2.103766879886283e-06, "loss": 0.4664, "step": 1630 }, { "epoch": 1.75, "grad_norm": 27.915386199951172, "learning_rate": 2.085998578535892e-06, "loss": 0.3988, "step": 1640 }, { "epoch": 1.76, "grad_norm": 24.587438583374023, "learning_rate": 2.068230277185501e-06, "loss": 0.4, "step": 1650 }, { "epoch": 1.77, "grad_norm": 27.6597900390625, "learning_rate": 2.05046197583511e-06, "loss": 0.4339, "step": 1660 }, { "epoch": 1.78, "grad_norm": 35.30537796020508, "learning_rate": 2.0326936744847194e-06, "loss": 0.4648, "step": 1670 }, { "epoch": 1.79, "grad_norm": 22.710094451904297, "learning_rate": 2.0149253731343284e-06, "loss": 0.3969, "step": 1680 }, { "epoch": 1.8, "grad_norm": 29.3568172454834, "learning_rate": 1.9971570717839374e-06, "loss": 0.4616, "step": 1690 }, { "epoch": 1.81, "grad_norm": 33.942832946777344, "learning_rate": 1.979388770433547e-06, "loss": 0.4729, "step": 1700 }, { "epoch": 1.82, "grad_norm": 30.611745834350586, "learning_rate": 1.9616204690831558e-06, "loss": 0.4058, "step": 1710 }, { "epoch": 1.83, "grad_norm": 31.35557746887207, "learning_rate": 1.9438521677327648e-06, "loss": 0.364, "step": 1720 }, { "epoch": 1.84, "grad_norm": 32.7161865234375, "learning_rate": 1.9260838663823737e-06, "loss": 0.4231, "step": 1730 }, { "epoch": 1.86, "grad_norm": 26.588394165039062, "learning_rate": 1.908315565031983e-06, "loss": 0.4263, "step": 1740 }, { "epoch": 1.87, "grad_norm": 32.95501708984375, "learning_rate": 1.8905472636815921e-06, "loss": 0.4884, "step": 1750 }, { "epoch": 1.87, "eval_accuracy": 0.846, "eval_f1_macro": 0.7622184138097792, "eval_f1_micro": 0.846, "eval_loss": 0.5471687316894531, "eval_runtime": 124.0456, "eval_samples_per_second": 80.616, "eval_steps_per_second": 1.266, "step": 1750 }, { "epoch": 1.88, "grad_norm": 28.180747985839844, "learning_rate": 1.8727789623312011e-06, "loss": 0.4191, "step": 1760 }, { "epoch": 1.89, "grad_norm": 31.246604919433594, "learning_rate": 1.8550106609808105e-06, "loss": 0.3957, "step": 1770 }, { "epoch": 1.9, "grad_norm": 28.893356323242188, "learning_rate": 1.8372423596304195e-06, "loss": 0.3977, "step": 1780 }, { "epoch": 1.91, "grad_norm": 24.465343475341797, "learning_rate": 1.8194740582800285e-06, "loss": 0.455, "step": 1790 }, { "epoch": 1.92, "grad_norm": 26.49956703186035, "learning_rate": 1.8017057569296375e-06, "loss": 0.4111, "step": 1800 }, { "epoch": 1.93, "grad_norm": 35.164100646972656, "learning_rate": 1.7839374555792469e-06, "loss": 0.4566, "step": 1810 }, { "epoch": 1.94, "grad_norm": 30.47753143310547, "learning_rate": 1.7661691542288559e-06, "loss": 0.5157, "step": 1820 }, { "epoch": 1.95, "grad_norm": 30.594600677490234, "learning_rate": 1.7484008528784648e-06, "loss": 0.3793, "step": 1830 }, { "epoch": 1.96, "grad_norm": 27.72709083557129, "learning_rate": 1.7306325515280742e-06, "loss": 0.4352, "step": 1840 }, { "epoch": 1.97, "grad_norm": 32.357486724853516, "learning_rate": 1.7128642501776832e-06, "loss": 0.4319, "step": 1850 }, { "epoch": 1.98, "grad_norm": 35.13471984863281, "learning_rate": 1.6950959488272922e-06, "loss": 0.4315, "step": 1860 }, { "epoch": 1.99, "grad_norm": 28.141923904418945, "learning_rate": 1.6773276474769012e-06, "loss": 0.394, "step": 1870 }, { "epoch": 2.0, "grad_norm": 19.38408660888672, "learning_rate": 1.6595593461265106e-06, "loss": 0.2904, "step": 1880 }, { "epoch": 2.01, "grad_norm": 23.068687438964844, "learning_rate": 1.6417910447761196e-06, "loss": 0.2626, "step": 1890 }, { "epoch": 2.03, "grad_norm": 29.862808227539062, "learning_rate": 1.6240227434257286e-06, "loss": 0.2445, "step": 1900 }, { "epoch": 2.04, "grad_norm": 39.06084442138672, "learning_rate": 1.6062544420753375e-06, "loss": 0.2081, "step": 1910 }, { "epoch": 2.05, "grad_norm": 41.129886627197266, "learning_rate": 1.588486140724947e-06, "loss": 0.2222, "step": 1920 }, { "epoch": 2.06, "grad_norm": 24.22159767150879, "learning_rate": 1.570717839374556e-06, "loss": 0.209, "step": 1930 }, { "epoch": 2.07, "grad_norm": 32.13199996948242, "learning_rate": 1.552949538024165e-06, "loss": 0.2422, "step": 1940 }, { "epoch": 2.08, "grad_norm": 37.64527893066406, "learning_rate": 1.5351812366737743e-06, "loss": 0.2254, "step": 1950 }, { "epoch": 2.09, "grad_norm": 26.231842041015625, "learning_rate": 1.5174129353233833e-06, "loss": 0.1824, "step": 1960 }, { "epoch": 2.1, "grad_norm": 27.84870147705078, "learning_rate": 1.4996446339729923e-06, "loss": 0.2109, "step": 1970 }, { "epoch": 2.11, "grad_norm": 29.698753356933594, "learning_rate": 1.4818763326226013e-06, "loss": 0.1918, "step": 1980 }, { "epoch": 2.12, "grad_norm": 37.34313201904297, "learning_rate": 1.4641080312722107e-06, "loss": 0.2207, "step": 1990 }, { "epoch": 2.13, "grad_norm": 39.03969192504883, "learning_rate": 1.4463397299218196e-06, "loss": 0.2172, "step": 2000 }, { "epoch": 2.13, "eval_accuracy": 0.8515, "eval_f1_macro": 0.7692693439574276, "eval_f1_micro": 0.8515, "eval_loss": 0.579771101474762, "eval_runtime": 124.1567, "eval_samples_per_second": 80.543, "eval_steps_per_second": 1.265, "step": 2000 }, { "epoch": 2.14, "grad_norm": 42.35063552856445, "learning_rate": 1.4285714285714286e-06, "loss": 0.3061, "step": 2010 }, { "epoch": 2.15, "grad_norm": 29.83649253845215, "learning_rate": 1.4108031272210378e-06, "loss": 0.1979, "step": 2020 }, { "epoch": 2.16, "grad_norm": 25.261943817138672, "learning_rate": 1.393034825870647e-06, "loss": 0.2296, "step": 2030 }, { "epoch": 2.17, "grad_norm": 35.01914596557617, "learning_rate": 1.375266524520256e-06, "loss": 0.2009, "step": 2040 }, { "epoch": 2.19, "grad_norm": 23.898014068603516, "learning_rate": 1.357498223169865e-06, "loss": 0.1889, "step": 2050 }, { "epoch": 2.2, "grad_norm": 40.306312561035156, "learning_rate": 1.3397299218194742e-06, "loss": 0.2237, "step": 2060 }, { "epoch": 2.21, "grad_norm": 37.627296447753906, "learning_rate": 1.3219616204690834e-06, "loss": 0.2034, "step": 2070 }, { "epoch": 2.22, "grad_norm": 23.1574649810791, "learning_rate": 1.3041933191186923e-06, "loss": 0.1523, "step": 2080 }, { "epoch": 2.23, "grad_norm": 29.777870178222656, "learning_rate": 1.2864250177683015e-06, "loss": 0.1977, "step": 2090 }, { "epoch": 2.24, "grad_norm": 37.081851959228516, "learning_rate": 1.2686567164179105e-06, "loss": 0.2006, "step": 2100 }, { "epoch": 2.25, "grad_norm": 22.427270889282227, "learning_rate": 1.2508884150675197e-06, "loss": 0.2112, "step": 2110 }, { "epoch": 2.26, "grad_norm": 18.139812469482422, "learning_rate": 1.2331201137171287e-06, "loss": 0.1985, "step": 2120 }, { "epoch": 2.27, "grad_norm": 31.232402801513672, "learning_rate": 1.2153518123667379e-06, "loss": 0.2408, "step": 2130 }, { "epoch": 2.28, "grad_norm": 36.46913146972656, "learning_rate": 1.1975835110163469e-06, "loss": 0.2214, "step": 2140 }, { "epoch": 2.29, "grad_norm": 18.826915740966797, "learning_rate": 1.179815209665956e-06, "loss": 0.2392, "step": 2150 }, { "epoch": 2.3, "grad_norm": 37.9354362487793, "learning_rate": 1.162046908315565e-06, "loss": 0.1993, "step": 2160 }, { "epoch": 2.31, "grad_norm": 34.12331771850586, "learning_rate": 1.1442786069651742e-06, "loss": 0.2242, "step": 2170 }, { "epoch": 2.32, "grad_norm": 28.576717376708984, "learning_rate": 1.1265103056147832e-06, "loss": 0.2374, "step": 2180 }, { "epoch": 2.33, "grad_norm": 23.52765655517578, "learning_rate": 1.1087420042643924e-06, "loss": 0.2549, "step": 2190 }, { "epoch": 2.35, "grad_norm": 21.565595626831055, "learning_rate": 1.0909737029140014e-06, "loss": 0.2351, "step": 2200 }, { "epoch": 2.36, "grad_norm": 28.789283752441406, "learning_rate": 1.0732054015636106e-06, "loss": 0.2405, "step": 2210 }, { "epoch": 2.37, "grad_norm": 24.8309268951416, "learning_rate": 1.0554371002132196e-06, "loss": 0.1679, "step": 2220 }, { "epoch": 2.38, "grad_norm": 33.16305160522461, "learning_rate": 1.0376687988628288e-06, "loss": 0.1967, "step": 2230 }, { "epoch": 2.39, "grad_norm": 22.096860885620117, "learning_rate": 1.0199004975124378e-06, "loss": 0.1921, "step": 2240 }, { "epoch": 2.4, "grad_norm": 49.326988220214844, "learning_rate": 1.002132196162047e-06, "loss": 0.224, "step": 2250 }, { "epoch": 2.4, "eval_accuracy": 0.8525, "eval_f1_macro": 0.7700158737448057, "eval_f1_micro": 0.8525, "eval_loss": 0.603921115398407, "eval_runtime": 124.3333, "eval_samples_per_second": 80.429, "eval_steps_per_second": 1.263, "step": 2250 }, { "epoch": 2.41, "grad_norm": 40.09466552734375, "learning_rate": 9.843638948116561e-07, "loss": 0.219, "step": 2260 }, { "epoch": 2.42, "grad_norm": 47.25840759277344, "learning_rate": 9.665955934612651e-07, "loss": 0.2664, "step": 2270 }, { "epoch": 2.43, "grad_norm": 43.087562561035156, "learning_rate": 9.488272921108743e-07, "loss": 0.2254, "step": 2280 }, { "epoch": 2.44, "grad_norm": 33.870330810546875, "learning_rate": 9.310589907604833e-07, "loss": 0.1671, "step": 2290 }, { "epoch": 2.45, "grad_norm": 17.758787155151367, "learning_rate": 9.132906894100925e-07, "loss": 0.2053, "step": 2300 }, { "epoch": 2.46, "grad_norm": 28.629383087158203, "learning_rate": 8.955223880597015e-07, "loss": 0.2097, "step": 2310 }, { "epoch": 2.47, "grad_norm": 26.55512809753418, "learning_rate": 8.777540867093107e-07, "loss": 0.1834, "step": 2320 }, { "epoch": 2.48, "grad_norm": 39.31850051879883, "learning_rate": 8.599857853589196e-07, "loss": 0.1942, "step": 2330 }, { "epoch": 2.49, "grad_norm": 35.81980895996094, "learning_rate": 8.422174840085288e-07, "loss": 0.1962, "step": 2340 }, { "epoch": 2.51, "grad_norm": 33.496185302734375, "learning_rate": 8.24449182658138e-07, "loss": 0.1977, "step": 2350 }, { "epoch": 2.52, "grad_norm": 35.06413650512695, "learning_rate": 8.06680881307747e-07, "loss": 0.2, "step": 2360 }, { "epoch": 2.53, "grad_norm": 15.073473930358887, "learning_rate": 7.889125799573562e-07, "loss": 0.1956, "step": 2370 }, { "epoch": 2.54, "grad_norm": 19.61820411682129, "learning_rate": 7.711442786069652e-07, "loss": 0.1857, "step": 2380 }, { "epoch": 2.55, "grad_norm": 35.14012145996094, "learning_rate": 7.533759772565744e-07, "loss": 0.1883, "step": 2390 }, { "epoch": 2.56, "grad_norm": 30.28993797302246, "learning_rate": 7.356076759061834e-07, "loss": 0.2359, "step": 2400 }, { "epoch": 2.57, "grad_norm": 38.90215301513672, "learning_rate": 7.178393745557926e-07, "loss": 0.208, "step": 2410 }, { "epoch": 2.58, "grad_norm": 25.41356086730957, "learning_rate": 7.000710732054016e-07, "loss": 0.2341, "step": 2420 }, { "epoch": 2.59, "grad_norm": 41.500911712646484, "learning_rate": 6.823027718550107e-07, "loss": 0.1771, "step": 2430 }, { "epoch": 2.6, "grad_norm": 20.687131881713867, "learning_rate": 6.645344705046198e-07, "loss": 0.1613, "step": 2440 }, { "epoch": 2.61, "grad_norm": 38.510581970214844, "learning_rate": 6.467661691542289e-07, "loss": 0.221, "step": 2450 }, { "epoch": 2.62, "grad_norm": 22.35893440246582, "learning_rate": 6.28997867803838e-07, "loss": 0.2147, "step": 2460 }, { "epoch": 2.63, "grad_norm": 32.44358444213867, "learning_rate": 6.112295664534471e-07, "loss": 0.1817, "step": 2470 }, { "epoch": 2.64, "grad_norm": 31.902772903442383, "learning_rate": 5.934612651030562e-07, "loss": 0.1753, "step": 2480 }, { "epoch": 2.65, "grad_norm": 28.205106735229492, "learning_rate": 5.756929637526653e-07, "loss": 0.2022, "step": 2490 }, { "epoch": 2.67, "grad_norm": 24.798738479614258, "learning_rate": 5.579246624022743e-07, "loss": 0.1555, "step": 2500 }, { "epoch": 2.67, "eval_accuracy": 0.8557, "eval_f1_macro": 0.7763562809202748, "eval_f1_micro": 0.8557, "eval_loss": 0.5900101661682129, "eval_runtime": 124.5215, "eval_samples_per_second": 80.307, "eval_steps_per_second": 1.261, "step": 2500 }, { "epoch": 2.68, "grad_norm": 31.424488067626953, "learning_rate": 5.401563610518834e-07, "loss": 0.1898, "step": 2510 }, { "epoch": 2.69, "grad_norm": 20.661287307739258, "learning_rate": 5.223880597014925e-07, "loss": 0.1742, "step": 2520 }, { "epoch": 2.7, "grad_norm": 36.49563980102539, "learning_rate": 5.046197583511016e-07, "loss": 0.2145, "step": 2530 }, { "epoch": 2.71, "grad_norm": 26.153505325317383, "learning_rate": 4.868514570007108e-07, "loss": 0.1909, "step": 2540 }, { "epoch": 2.72, "grad_norm": 31.629478454589844, "learning_rate": 4.690831556503199e-07, "loss": 0.1951, "step": 2550 }, { "epoch": 2.73, "grad_norm": 29.434505462646484, "learning_rate": 4.51314854299929e-07, "loss": 0.1804, "step": 2560 }, { "epoch": 2.74, "grad_norm": 31.663951873779297, "learning_rate": 4.3354655294953807e-07, "loss": 0.1797, "step": 2570 }, { "epoch": 2.75, "grad_norm": 41.432926177978516, "learning_rate": 4.1577825159914716e-07, "loss": 0.15, "step": 2580 }, { "epoch": 2.76, "grad_norm": 31.196048736572266, "learning_rate": 3.9800995024875624e-07, "loss": 0.2014, "step": 2590 }, { "epoch": 2.77, "grad_norm": 32.634159088134766, "learning_rate": 3.8024164889836533e-07, "loss": 0.1464, "step": 2600 }, { "epoch": 2.78, "grad_norm": 23.545093536376953, "learning_rate": 3.624733475479744e-07, "loss": 0.2264, "step": 2610 }, { "epoch": 2.79, "grad_norm": 46.752872467041016, "learning_rate": 3.447050461975835e-07, "loss": 0.2048, "step": 2620 }, { "epoch": 2.8, "grad_norm": 27.10477638244629, "learning_rate": 3.2693674484719265e-07, "loss": 0.1519, "step": 2630 }, { "epoch": 2.81, "grad_norm": 17.948461532592773, "learning_rate": 3.0916844349680174e-07, "loss": 0.219, "step": 2640 }, { "epoch": 2.83, "grad_norm": 27.001426696777344, "learning_rate": 2.914001421464108e-07, "loss": 0.2196, "step": 2650 }, { "epoch": 2.84, "grad_norm": 40.51960754394531, "learning_rate": 2.736318407960199e-07, "loss": 0.1746, "step": 2660 }, { "epoch": 2.85, "grad_norm": 28.898805618286133, "learning_rate": 2.55863539445629e-07, "loss": 0.1638, "step": 2670 }, { "epoch": 2.86, "grad_norm": 40.67920684814453, "learning_rate": 2.3809523809523811e-07, "loss": 0.1677, "step": 2680 }, { "epoch": 2.87, "grad_norm": 30.66012191772461, "learning_rate": 2.2032693674484723e-07, "loss": 0.1528, "step": 2690 }, { "epoch": 2.88, "grad_norm": 32.79899978637695, "learning_rate": 2.0255863539445632e-07, "loss": 0.212, "step": 2700 }, { "epoch": 2.89, "grad_norm": 34.37156295776367, "learning_rate": 1.847903340440654e-07, "loss": 0.2158, "step": 2710 }, { "epoch": 2.9, "grad_norm": 25.229320526123047, "learning_rate": 1.670220326936745e-07, "loss": 0.2365, "step": 2720 }, { "epoch": 2.91, "grad_norm": 22.94424819946289, "learning_rate": 1.4925373134328358e-07, "loss": 0.2094, "step": 2730 }, { "epoch": 2.92, "grad_norm": 28.624725341796875, "learning_rate": 1.314854299928927e-07, "loss": 0.1651, "step": 2740 }, { "epoch": 2.93, "grad_norm": 35.16642761230469, "learning_rate": 1.1371712864250178e-07, "loss": 0.1949, "step": 2750 }, { "epoch": 2.93, "eval_accuracy": 0.8578, "eval_f1_macro": 0.7806882044914581, "eval_f1_micro": 0.8578, "eval_loss": 0.5838390588760376, "eval_runtime": 124.7446, "eval_samples_per_second": 80.164, "eval_steps_per_second": 1.259, "step": 2750 }, { "epoch": 2.94, "grad_norm": 23.88597869873047, "learning_rate": 9.59488272921109e-08, "loss": 0.1841, "step": 2760 }, { "epoch": 2.95, "grad_norm": 21.020296096801758, "learning_rate": 7.818052594171998e-08, "loss": 0.2079, "step": 2770 }, { "epoch": 2.96, "grad_norm": 36.09236526489258, "learning_rate": 6.041222459132907e-08, "loss": 0.1826, "step": 2780 }, { "epoch": 2.97, "grad_norm": 14.783782958984375, "learning_rate": 4.264392324093817e-08, "loss": 0.1753, "step": 2790 }, { "epoch": 2.99, "grad_norm": 37.81224060058594, "learning_rate": 2.4875621890547265e-08, "loss": 0.1692, "step": 2800 }, { "epoch": 3.0, "grad_norm": 24.018884658813477, "learning_rate": 7.107320540156361e-09, "loss": 0.1753, "step": 2810 }, { "epoch": 3.0, "step": 2814, "total_flos": 1.011896579591766e+18, "train_loss": 0.6761682072512716, "train_runtime": 6696.8995, "train_samples_per_second": 26.878, "train_steps_per_second": 0.42 } ], "logging_steps": 10, "max_steps": 2814, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 250, "total_flos": 1.011896579591766e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }