{
"best_metric": 0.29854172468185425,
"best_model_checkpoint": "./results/checkpoint-2026",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 2026,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 15.60984992980957,
"learning_rate": 4.990128331688055e-05,
"loss": 0.5627,
"step": 10
},
{
"epoch": 0.02,
"grad_norm": 1.2870399951934814,
"learning_rate": 4.9802566633761114e-05,
"loss": 0.2714,
"step": 20
},
{
"epoch": 0.03,
"grad_norm": 123.16035461425781,
"learning_rate": 4.970384995064166e-05,
"loss": 0.5078,
"step": 30
},
{
"epoch": 0.04,
"grad_norm": 11.994287490844727,
"learning_rate": 4.960513326752221e-05,
"loss": 0.4197,
"step": 40
},
{
"epoch": 0.05,
"grad_norm": 3.41953182220459,
"learning_rate": 4.950641658440277e-05,
"loss": 0.441,
"step": 50
},
{
"epoch": 0.06,
"grad_norm": 26.95296287536621,
"learning_rate": 4.940769990128332e-05,
"loss": 0.6958,
"step": 60
},
{
"epoch": 0.07,
"grad_norm": 28.05646324157715,
"learning_rate": 4.930898321816387e-05,
"loss": 0.3109,
"step": 70
},
{
"epoch": 0.08,
"grad_norm": 28.755550384521484,
"learning_rate": 4.921026653504443e-05,
"loss": 0.5321,
"step": 80
},
{
"epoch": 0.09,
"grad_norm": 46.09171676635742,
"learning_rate": 4.9111549851924976e-05,
"loss": 0.4216,
"step": 90
},
{
"epoch": 0.1,
"grad_norm": 3.234527587890625,
"learning_rate": 4.901283316880553e-05,
"loss": 0.3427,
"step": 100
},
{
"epoch": 0.11,
"grad_norm": 36.78240966796875,
"learning_rate": 4.891411648568609e-05,
"loss": 0.5259,
"step": 110
},
{
"epoch": 0.12,
"grad_norm": 2.4952964782714844,
"learning_rate": 4.8815399802566636e-05,
"loss": 0.525,
"step": 120
},
{
"epoch": 0.13,
"grad_norm": 6.107447147369385,
"learning_rate": 4.8716683119447184e-05,
"loss": 0.5349,
"step": 130
},
{
"epoch": 0.14,
"grad_norm": 17.599472045898438,
"learning_rate": 4.861796643632775e-05,
"loss": 0.3194,
"step": 140
},
{
"epoch": 0.15,
"grad_norm": 67.53023529052734,
"learning_rate": 4.8519249753208296e-05,
"loss": 0.4738,
"step": 150
},
{
"epoch": 0.16,
"grad_norm": 61.95085525512695,
"learning_rate": 4.8420533070088844e-05,
"loss": 0.4151,
"step": 160
},
{
"epoch": 0.17,
"grad_norm": 377.9793701171875,
"learning_rate": 4.83218163869694e-05,
"loss": 0.3219,
"step": 170
},
{
"epoch": 0.18,
"grad_norm": 9.71474838256836,
"learning_rate": 4.8223099703849955e-05,
"loss": 0.2931,
"step": 180
},
{
"epoch": 0.19,
"grad_norm": 54.442691802978516,
"learning_rate": 4.8124383020730504e-05,
"loss": 0.3802,
"step": 190
},
{
"epoch": 0.2,
"grad_norm": 111.41837310791016,
"learning_rate": 4.802566633761106e-05,
"loss": 0.4909,
"step": 200
},
{
"epoch": 0.21,
"grad_norm": 28.207542419433594,
"learning_rate": 4.792694965449161e-05,
"loss": 0.392,
"step": 210
},
{
"epoch": 0.22,
"grad_norm": 65.766357421875,
"learning_rate": 4.7828232971372164e-05,
"loss": 0.3002,
"step": 220
},
{
"epoch": 0.23,
"grad_norm": 127.14469909667969,
"learning_rate": 4.772951628825272e-05,
"loss": 0.3654,
"step": 230
},
{
"epoch": 0.24,
"grad_norm": 0.19254250824451447,
"learning_rate": 4.763079960513327e-05,
"loss": 0.2953,
"step": 240
},
{
"epoch": 0.25,
"grad_norm": 30.106840133666992,
"learning_rate": 4.753208292201382e-05,
"loss": 0.4019,
"step": 250
},
{
"epoch": 0.26,
"grad_norm": 4.884279727935791,
"learning_rate": 4.743336623889438e-05,
"loss": 0.4152,
"step": 260
},
{
"epoch": 0.27,
"grad_norm": 71.0513916015625,
"learning_rate": 4.733464955577493e-05,
"loss": 0.6281,
"step": 270
},
{
"epoch": 0.28,
"grad_norm": 1.381753921508789,
"learning_rate": 4.723593287265548e-05,
"loss": 0.3224,
"step": 280
},
{
"epoch": 0.29,
"grad_norm": 1.0361205339431763,
"learning_rate": 4.713721618953603e-05,
"loss": 0.6375,
"step": 290
},
{
"epoch": 0.3,
"grad_norm": 1.2360197305679321,
"learning_rate": 4.703849950641659e-05,
"loss": 0.419,
"step": 300
},
{
"epoch": 0.31,
"grad_norm": 188.26495361328125,
"learning_rate": 4.693978282329714e-05,
"loss": 0.5303,
"step": 310
},
{
"epoch": 0.32,
"grad_norm": 0.14256739616394043,
"learning_rate": 4.684106614017769e-05,
"loss": 0.2615,
"step": 320
},
{
"epoch": 0.33,
"grad_norm": 63.93450927734375,
"learning_rate": 4.674234945705824e-05,
"loss": 0.4678,
"step": 330
},
{
"epoch": 0.34,
"grad_norm": 31.07522201538086,
"learning_rate": 4.66436327739388e-05,
"loss": 0.607,
"step": 340
},
{
"epoch": 0.35,
"grad_norm": 12.982345581054688,
"learning_rate": 4.654491609081935e-05,
"loss": 0.189,
"step": 350
},
{
"epoch": 0.36,
"grad_norm": 14.37088394165039,
"learning_rate": 4.64461994076999e-05,
"loss": 0.4341,
"step": 360
},
{
"epoch": 0.37,
"grad_norm": 8.185881614685059,
"learning_rate": 4.634748272458046e-05,
"loss": 0.5132,
"step": 370
},
{
"epoch": 0.38,
"grad_norm": 1.082980751991272,
"learning_rate": 4.624876604146101e-05,
"loss": 0.3491,
"step": 380
},
{
"epoch": 0.38,
"grad_norm": 12.5576753616333,
"learning_rate": 4.615004935834156e-05,
"loss": 0.7587,
"step": 390
},
{
"epoch": 0.39,
"grad_norm": 5.933102130889893,
"learning_rate": 4.605133267522212e-05,
"loss": 0.579,
"step": 400
},
{
"epoch": 0.4,
"grad_norm": 1.3454967737197876,
"learning_rate": 4.5952615992102666e-05,
"loss": 0.1932,
"step": 410
},
{
"epoch": 0.41,
"grad_norm": 17.171228408813477,
"learning_rate": 4.585389930898322e-05,
"loss": 0.3217,
"step": 420
},
{
"epoch": 0.42,
"grad_norm": 3.5590412616729736,
"learning_rate": 4.575518262586377e-05,
"loss": 0.2279,
"step": 430
},
{
"epoch": 0.43,
"grad_norm": 17.661069869995117,
"learning_rate": 4.5656465942744326e-05,
"loss": 0.2676,
"step": 440
},
{
"epoch": 0.44,
"grad_norm": 48.93571853637695,
"learning_rate": 4.5557749259624875e-05,
"loss": 0.5449,
"step": 450
},
{
"epoch": 0.45,
"grad_norm": 12.7286376953125,
"learning_rate": 4.545903257650543e-05,
"loss": 0.5127,
"step": 460
},
{
"epoch": 0.46,
"grad_norm": 51.88860321044922,
"learning_rate": 4.5360315893385986e-05,
"loss": 0.4794,
"step": 470
},
{
"epoch": 0.47,
"grad_norm": 18.063552856445312,
"learning_rate": 4.5261599210266535e-05,
"loss": 0.3728,
"step": 480
},
{
"epoch": 0.48,
"grad_norm": 2.861877918243408,
"learning_rate": 4.516288252714709e-05,
"loss": 0.3038,
"step": 490
},
{
"epoch": 0.49,
"grad_norm": 6.476074695587158,
"learning_rate": 4.5064165844027646e-05,
"loss": 0.2592,
"step": 500
},
{
"epoch": 0.5,
"grad_norm": 62.48997497558594,
"learning_rate": 4.4965449160908195e-05,
"loss": 0.4779,
"step": 510
},
{
"epoch": 0.51,
"grad_norm": 0.5959272384643555,
"learning_rate": 4.486673247778875e-05,
"loss": 0.3865,
"step": 520
},
{
"epoch": 0.52,
"grad_norm": 32.949684143066406,
"learning_rate": 4.47680157946693e-05,
"loss": 0.5077,
"step": 530
},
{
"epoch": 0.53,
"grad_norm": 8.09738826751709,
"learning_rate": 4.4669299111549855e-05,
"loss": 0.3352,
"step": 540
},
{
"epoch": 0.54,
"grad_norm": 23.277297973632812,
"learning_rate": 4.457058242843041e-05,
"loss": 0.5204,
"step": 550
},
{
"epoch": 0.55,
"grad_norm": 89.32869720458984,
"learning_rate": 4.447186574531096e-05,
"loss": 0.3888,
"step": 560
},
{
"epoch": 0.56,
"grad_norm": 2.6795363426208496,
"learning_rate": 4.437314906219151e-05,
"loss": 0.5252,
"step": 570
},
{
"epoch": 0.57,
"grad_norm": 37.583744049072266,
"learning_rate": 4.427443237907207e-05,
"loss": 0.3881,
"step": 580
},
{
"epoch": 0.58,
"grad_norm": 1.256844162940979,
"learning_rate": 4.417571569595262e-05,
"loss": 0.1872,
"step": 590
},
{
"epoch": 0.59,
"grad_norm": 4.5737786293029785,
"learning_rate": 4.407699901283317e-05,
"loss": 0.2536,
"step": 600
},
{
"epoch": 0.6,
"grad_norm": 45.64347839355469,
"learning_rate": 4.3978282329713724e-05,
"loss": 0.3777,
"step": 610
},
{
"epoch": 0.61,
"grad_norm": 0.4227633774280548,
"learning_rate": 4.387956564659428e-05,
"loss": 0.2028,
"step": 620
},
{
"epoch": 0.62,
"grad_norm": 4.602664947509766,
"learning_rate": 4.378084896347483e-05,
"loss": 0.5563,
"step": 630
},
{
"epoch": 0.63,
"grad_norm": 0.7803702354431152,
"learning_rate": 4.3682132280355384e-05,
"loss": 0.3636,
"step": 640
},
{
"epoch": 0.64,
"grad_norm": 70.02734375,
"learning_rate": 4.358341559723593e-05,
"loss": 0.4558,
"step": 650
},
{
"epoch": 0.65,
"grad_norm": 39.45964050292969,
"learning_rate": 4.348469891411649e-05,
"loss": 0.4592,
"step": 660
},
{
"epoch": 0.66,
"grad_norm": 22.5675106048584,
"learning_rate": 4.3385982230997044e-05,
"loss": 0.3082,
"step": 670
},
{
"epoch": 0.67,
"grad_norm": 4.789850234985352,
"learning_rate": 4.328726554787759e-05,
"loss": 0.2404,
"step": 680
},
{
"epoch": 0.68,
"grad_norm": 4.671356678009033,
"learning_rate": 4.318854886475814e-05,
"loss": 0.2864,
"step": 690
},
{
"epoch": 0.69,
"grad_norm": 1.803113341331482,
"learning_rate": 4.3089832181638704e-05,
"loss": 0.2627,
"step": 700
},
{
"epoch": 0.7,
"grad_norm": 0.38143932819366455,
"learning_rate": 4.299111549851925e-05,
"loss": 0.1678,
"step": 710
},
{
"epoch": 0.71,
"grad_norm": 0.396694540977478,
"learning_rate": 4.28923988153998e-05,
"loss": 0.181,
"step": 720
},
{
"epoch": 0.72,
"grad_norm": 26.724634170532227,
"learning_rate": 4.279368213228036e-05,
"loss": 0.5595,
"step": 730
},
{
"epoch": 0.73,
"grad_norm": 179.3428497314453,
"learning_rate": 4.269496544916091e-05,
"loss": 0.3613,
"step": 740
},
{
"epoch": 0.74,
"grad_norm": 4.721936225891113,
"learning_rate": 4.259624876604146e-05,
"loss": 0.4182,
"step": 750
},
{
"epoch": 0.75,
"grad_norm": 1.8950241804122925,
"learning_rate": 4.249753208292202e-05,
"loss": 0.3623,
"step": 760
},
{
"epoch": 0.76,
"grad_norm": 5.388864994049072,
"learning_rate": 4.2398815399802566e-05,
"loss": 0.4246,
"step": 770
},
{
"epoch": 0.77,
"grad_norm": 0.41123124957084656,
"learning_rate": 4.230009871668312e-05,
"loss": 0.2425,
"step": 780
},
{
"epoch": 0.78,
"grad_norm": 0.3556106388568878,
"learning_rate": 4.220138203356368e-05,
"loss": 0.3751,
"step": 790
},
{
"epoch": 0.79,
"grad_norm": 0.899945080280304,
"learning_rate": 4.2102665350444226e-05,
"loss": 0.3994,
"step": 800
},
{
"epoch": 0.8,
"grad_norm": 4.583869934082031,
"learning_rate": 4.2003948667324774e-05,
"loss": 0.3681,
"step": 810
},
{
"epoch": 0.81,
"grad_norm": 0.3905455768108368,
"learning_rate": 4.190523198420534e-05,
"loss": 0.1491,
"step": 820
},
{
"epoch": 0.82,
"grad_norm": 36.8359260559082,
"learning_rate": 4.1806515301085886e-05,
"loss": 0.2609,
"step": 830
},
{
"epoch": 0.83,
"grad_norm": 34.53616714477539,
"learning_rate": 4.1707798617966434e-05,
"loss": 0.5495,
"step": 840
},
{
"epoch": 0.84,
"grad_norm": 14.104715347290039,
"learning_rate": 4.160908193484699e-05,
"loss": 0.33,
"step": 850
},
{
"epoch": 0.85,
"grad_norm": 30.295068740844727,
"learning_rate": 4.1510365251727546e-05,
"loss": 1.0008,
"step": 860
},
{
"epoch": 0.86,
"grad_norm": 93.3653793334961,
"learning_rate": 4.1411648568608094e-05,
"loss": 1.0401,
"step": 870
},
{
"epoch": 0.87,
"grad_norm": 114.31365966796875,
"learning_rate": 4.131293188548865e-05,
"loss": 0.4156,
"step": 880
},
{
"epoch": 0.88,
"grad_norm": 134.54774475097656,
"learning_rate": 4.12142152023692e-05,
"loss": 0.5463,
"step": 890
},
{
"epoch": 0.89,
"grad_norm": 3.021076202392578,
"learning_rate": 4.1115498519249754e-05,
"loss": 0.2947,
"step": 900
},
{
"epoch": 0.9,
"grad_norm": 9.884215354919434,
"learning_rate": 4.101678183613031e-05,
"loss": 0.3674,
"step": 910
},
{
"epoch": 0.91,
"grad_norm": 167.9898223876953,
"learning_rate": 4.091806515301086e-05,
"loss": 0.4516,
"step": 920
},
{
"epoch": 0.92,
"grad_norm": 34.41691207885742,
"learning_rate": 4.0819348469891414e-05,
"loss": 0.504,
"step": 930
},
{
"epoch": 0.93,
"grad_norm": 10.135024070739746,
"learning_rate": 4.072063178677197e-05,
"loss": 0.2834,
"step": 940
},
{
"epoch": 0.94,
"grad_norm": 1.0688509941101074,
"learning_rate": 4.062191510365252e-05,
"loss": 0.3188,
"step": 950
},
{
"epoch": 0.95,
"grad_norm": 5.052711009979248,
"learning_rate": 4.052319842053307e-05,
"loss": 0.3693,
"step": 960
},
{
"epoch": 0.96,
"grad_norm": 0.37648436427116394,
"learning_rate": 4.042448173741363e-05,
"loss": 0.1054,
"step": 970
},
{
"epoch": 0.97,
"grad_norm": 18.3348445892334,
"learning_rate": 4.032576505429418e-05,
"loss": 0.3397,
"step": 980
},
{
"epoch": 0.98,
"grad_norm": 10.808074951171875,
"learning_rate": 4.022704837117473e-05,
"loss": 0.3628,
"step": 990
},
{
"epoch": 0.99,
"grad_norm": 141.88064575195312,
"learning_rate": 4.012833168805528e-05,
"loss": 0.9269,
"step": 1000
},
{
"epoch": 1.0,
"grad_norm": 0.555182695388794,
"learning_rate": 4.002961500493584e-05,
"loss": 0.1197,
"step": 1010
},
{
"epoch": 1.0,
"eval_balanced accuracy": 0.917760474601409,
"eval_f1": 0.9176981176842771,
"eval_loss": 0.40740078687667847,
"eval_precision": 0.9176448492816227,
"eval_recall": 0.917760474601409,
"eval_runtime": 5.5647,
"eval_samples_per_second": 161.733,
"eval_steps_per_second": 10.243,
"step": 1013
},
{
"epoch": 1.01,
"grad_norm": 4.549361228942871,
"learning_rate": 3.993089832181639e-05,
"loss": 0.5231,
"step": 1020
},
{
"epoch": 1.02,
"grad_norm": 5.699501991271973,
"learning_rate": 3.983218163869694e-05,
"loss": 0.4139,
"step": 1030
},
{
"epoch": 1.03,
"grad_norm": 2.1153147220611572,
"learning_rate": 3.973346495557749e-05,
"loss": 0.2718,
"step": 1040
},
{
"epoch": 1.04,
"grad_norm": 5.258866310119629,
"learning_rate": 3.963474827245805e-05,
"loss": 0.3115,
"step": 1050
},
{
"epoch": 1.05,
"grad_norm": 13.351494789123535,
"learning_rate": 3.95360315893386e-05,
"loss": 0.3992,
"step": 1060
},
{
"epoch": 1.06,
"grad_norm": 9.7189359664917,
"learning_rate": 3.943731490621915e-05,
"loss": 0.1346,
"step": 1070
},
{
"epoch": 1.07,
"grad_norm": 5.006288051605225,
"learning_rate": 3.933859822309971e-05,
"loss": 0.3118,
"step": 1080
},
{
"epoch": 1.08,
"grad_norm": 7.094489574432373,
"learning_rate": 3.923988153998026e-05,
"loss": 0.1807,
"step": 1090
},
{
"epoch": 1.09,
"grad_norm": 4.784492492675781,
"learning_rate": 3.914116485686081e-05,
"loss": 0.3839,
"step": 1100
},
{
"epoch": 1.1,
"grad_norm": 1.5643423795700073,
"learning_rate": 3.904244817374136e-05,
"loss": 0.1729,
"step": 1110
},
{
"epoch": 1.11,
"grad_norm": 33.595703125,
"learning_rate": 3.8943731490621916e-05,
"loss": 0.1749,
"step": 1120
},
{
"epoch": 1.12,
"grad_norm": 0.5887395143508911,
"learning_rate": 3.884501480750247e-05,
"loss": 0.2513,
"step": 1130
},
{
"epoch": 1.13,
"grad_norm": 22.53057289123535,
"learning_rate": 3.874629812438302e-05,
"loss": 0.2858,
"step": 1140
},
{
"epoch": 1.14,
"grad_norm": 52.66212463378906,
"learning_rate": 3.8647581441263576e-05,
"loss": 0.1328,
"step": 1150
},
{
"epoch": 1.15,
"grad_norm": 5.8826117515563965,
"learning_rate": 3.8548864758144125e-05,
"loss": 0.3296,
"step": 1160
},
{
"epoch": 1.15,
"grad_norm": 10.208854675292969,
"learning_rate": 3.845014807502468e-05,
"loss": 0.1743,
"step": 1170
},
{
"epoch": 1.16,
"grad_norm": 5.222922325134277,
"learning_rate": 3.8351431391905236e-05,
"loss": 0.2482,
"step": 1180
},
{
"epoch": 1.17,
"grad_norm": 0.3885471224784851,
"learning_rate": 3.8252714708785785e-05,
"loss": 0.3651,
"step": 1190
},
{
"epoch": 1.18,
"grad_norm": 68.36416625976562,
"learning_rate": 3.815399802566634e-05,
"loss": 0.5256,
"step": 1200
},
{
"epoch": 1.19,
"grad_norm": 103.91950988769531,
"learning_rate": 3.8055281342546896e-05,
"loss": 0.2199,
"step": 1210
},
{
"epoch": 1.2,
"grad_norm": 0.17333897948265076,
"learning_rate": 3.7956564659427445e-05,
"loss": 0.126,
"step": 1220
},
{
"epoch": 1.21,
"grad_norm": 41.487117767333984,
"learning_rate": 3.7857847976308e-05,
"loss": 0.2293,
"step": 1230
},
{
"epoch": 1.22,
"grad_norm": 0.1527445763349533,
"learning_rate": 3.775913129318855e-05,
"loss": 0.2754,
"step": 1240
},
{
"epoch": 1.23,
"grad_norm": 0.3720811605453491,
"learning_rate": 3.7660414610069105e-05,
"loss": 0.1904,
"step": 1250
},
{
"epoch": 1.24,
"grad_norm": 0.2801426947116852,
"learning_rate": 3.756169792694966e-05,
"loss": 0.2894,
"step": 1260
},
{
"epoch": 1.25,
"grad_norm": 0.912218451499939,
"learning_rate": 3.746298124383021e-05,
"loss": 0.4345,
"step": 1270
},
{
"epoch": 1.26,
"grad_norm": 0.25501587986946106,
"learning_rate": 3.736426456071076e-05,
"loss": 0.2249,
"step": 1280
},
{
"epoch": 1.27,
"grad_norm": 19.25888442993164,
"learning_rate": 3.7265547877591314e-05,
"loss": 0.4532,
"step": 1290
},
{
"epoch": 1.28,
"grad_norm": 7.447415351867676,
"learning_rate": 3.716683119447187e-05,
"loss": 0.419,
"step": 1300
},
{
"epoch": 1.29,
"grad_norm": 1.2623952627182007,
"learning_rate": 3.706811451135242e-05,
"loss": 0.3596,
"step": 1310
},
{
"epoch": 1.3,
"grad_norm": 49.27845001220703,
"learning_rate": 3.6969397828232974e-05,
"loss": 0.1807,
"step": 1320
},
{
"epoch": 1.31,
"grad_norm": 8.055280685424805,
"learning_rate": 3.687068114511353e-05,
"loss": 0.1877,
"step": 1330
},
{
"epoch": 1.32,
"grad_norm": 0.24801558256149292,
"learning_rate": 3.677196446199408e-05,
"loss": 0.1906,
"step": 1340
},
{
"epoch": 1.33,
"grad_norm": 0.37148603796958923,
"learning_rate": 3.6673247778874634e-05,
"loss": 0.6613,
"step": 1350
},
{
"epoch": 1.34,
"grad_norm": 2.0603933334350586,
"learning_rate": 3.657453109575518e-05,
"loss": 0.1717,
"step": 1360
},
{
"epoch": 1.35,
"grad_norm": 1.4730746746063232,
"learning_rate": 3.647581441263574e-05,
"loss": 0.3606,
"step": 1370
},
{
"epoch": 1.36,
"grad_norm": 11.129170417785645,
"learning_rate": 3.6377097729516294e-05,
"loss": 0.4668,
"step": 1380
},
{
"epoch": 1.37,
"grad_norm": 107.76866912841797,
"learning_rate": 3.627838104639684e-05,
"loss": 0.4248,
"step": 1390
},
{
"epoch": 1.38,
"grad_norm": 0.4574478566646576,
"learning_rate": 3.617966436327739e-05,
"loss": 0.2463,
"step": 1400
},
{
"epoch": 1.39,
"grad_norm": 9.523133277893066,
"learning_rate": 3.6080947680157954e-05,
"loss": 0.2986,
"step": 1410
},
{
"epoch": 1.4,
"grad_norm": 724.2791137695312,
"learning_rate": 3.59822309970385e-05,
"loss": 0.1994,
"step": 1420
},
{
"epoch": 1.41,
"grad_norm": 0.495822012424469,
"learning_rate": 3.588351431391905e-05,
"loss": 0.405,
"step": 1430
},
{
"epoch": 1.42,
"grad_norm": 0.7077971696853638,
"learning_rate": 3.578479763079961e-05,
"loss": 0.3258,
"step": 1440
},
{
"epoch": 1.43,
"grad_norm": 0.471545934677124,
"learning_rate": 3.568608094768016e-05,
"loss": 0.3381,
"step": 1450
},
{
"epoch": 1.44,
"grad_norm": 160.64279174804688,
"learning_rate": 3.558736426456071e-05,
"loss": 0.4319,
"step": 1460
},
{
"epoch": 1.45,
"grad_norm": 213.93475341796875,
"learning_rate": 3.548864758144127e-05,
"loss": 0.3506,
"step": 1470
},
{
"epoch": 1.46,
"grad_norm": 0.5124903917312622,
"learning_rate": 3.5389930898321816e-05,
"loss": 0.261,
"step": 1480
},
{
"epoch": 1.47,
"grad_norm": 0.2033979296684265,
"learning_rate": 3.529121421520237e-05,
"loss": 0.3329,
"step": 1490
},
{
"epoch": 1.48,
"grad_norm": 0.14042626321315765,
"learning_rate": 3.519249753208293e-05,
"loss": 0.198,
"step": 1500
},
{
"epoch": 1.49,
"grad_norm": 0.052474942058324814,
"learning_rate": 3.5093780848963476e-05,
"loss": 0.3291,
"step": 1510
},
{
"epoch": 1.5,
"grad_norm": 0.7498096823692322,
"learning_rate": 3.4995064165844024e-05,
"loss": 0.5893,
"step": 1520
},
{
"epoch": 1.51,
"grad_norm": 56.467071533203125,
"learning_rate": 3.489634748272459e-05,
"loss": 0.22,
"step": 1530
},
{
"epoch": 1.52,
"grad_norm": 5.047154903411865,
"learning_rate": 3.4797630799605136e-05,
"loss": 0.3128,
"step": 1540
},
{
"epoch": 1.53,
"grad_norm": 0.24173791706562042,
"learning_rate": 3.4698914116485684e-05,
"loss": 0.2632,
"step": 1550
},
{
"epoch": 1.54,
"grad_norm": 0.23745213449001312,
"learning_rate": 3.460019743336624e-05,
"loss": 0.1316,
"step": 1560
},
{
"epoch": 1.55,
"grad_norm": 0.3697431683540344,
"learning_rate": 3.4501480750246796e-05,
"loss": 0.3162,
"step": 1570
},
{
"epoch": 1.56,
"grad_norm": 125.36990356445312,
"learning_rate": 3.4402764067127344e-05,
"loss": 0.7252,
"step": 1580
},
{
"epoch": 1.57,
"grad_norm": 30.01531410217285,
"learning_rate": 3.43040473840079e-05,
"loss": 0.567,
"step": 1590
},
{
"epoch": 1.58,
"grad_norm": 44.524818420410156,
"learning_rate": 3.420533070088845e-05,
"loss": 0.4531,
"step": 1600
},
{
"epoch": 1.59,
"grad_norm": 133.4363555908203,
"learning_rate": 3.4106614017769004e-05,
"loss": 0.4438,
"step": 1610
},
{
"epoch": 1.6,
"grad_norm": 1119.47509765625,
"learning_rate": 3.400789733464956e-05,
"loss": 0.3973,
"step": 1620
},
{
"epoch": 1.61,
"grad_norm": 4.369329929351807,
"learning_rate": 3.390918065153011e-05,
"loss": 0.482,
"step": 1630
},
{
"epoch": 1.62,
"grad_norm": 28.413909912109375,
"learning_rate": 3.381046396841066e-05,
"loss": 0.3454,
"step": 1640
},
{
"epoch": 1.63,
"grad_norm": 76.58002471923828,
"learning_rate": 3.371174728529122e-05,
"loss": 0.2663,
"step": 1650
},
{
"epoch": 1.64,
"grad_norm": 597.3102416992188,
"learning_rate": 3.361303060217177e-05,
"loss": 0.155,
"step": 1660
},
{
"epoch": 1.65,
"grad_norm": 24.984447479248047,
"learning_rate": 3.351431391905232e-05,
"loss": 0.2535,
"step": 1670
},
{
"epoch": 1.66,
"grad_norm": 30.53813934326172,
"learning_rate": 3.341559723593287e-05,
"loss": 0.315,
"step": 1680
},
{
"epoch": 1.67,
"grad_norm": 1.5513701438903809,
"learning_rate": 3.331688055281343e-05,
"loss": 0.3617,
"step": 1690
},
{
"epoch": 1.68,
"grad_norm": 3.676360845565796,
"learning_rate": 3.321816386969398e-05,
"loss": 0.6472,
"step": 1700
},
{
"epoch": 1.69,
"grad_norm": 23.96689796447754,
"learning_rate": 3.311944718657453e-05,
"loss": 0.5382,
"step": 1710
},
{
"epoch": 1.7,
"grad_norm": 18.116992950439453,
"learning_rate": 3.302073050345508e-05,
"loss": 0.345,
"step": 1720
},
{
"epoch": 1.71,
"grad_norm": 4.786412239074707,
"learning_rate": 3.292201382033564e-05,
"loss": 0.3599,
"step": 1730
},
{
"epoch": 1.72,
"grad_norm": 2.5227644443511963,
"learning_rate": 3.282329713721619e-05,
"loss": 0.4313,
"step": 1740
},
{
"epoch": 1.73,
"grad_norm": 4.462274074554443,
"learning_rate": 3.272458045409674e-05,
"loss": 0.5479,
"step": 1750
},
{
"epoch": 1.74,
"grad_norm": 49.19129180908203,
"learning_rate": 3.26258637709773e-05,
"loss": 0.5215,
"step": 1760
},
{
"epoch": 1.75,
"grad_norm": 89.65460968017578,
"learning_rate": 3.252714708785785e-05,
"loss": 0.7555,
"step": 1770
},
{
"epoch": 1.76,
"grad_norm": 0.9293081760406494,
"learning_rate": 3.24284304047384e-05,
"loss": 0.3071,
"step": 1780
},
{
"epoch": 1.77,
"grad_norm": 11.949310302734375,
"learning_rate": 3.232971372161895e-05,
"loss": 0.2182,
"step": 1790
},
{
"epoch": 1.78,
"grad_norm": 15.446320533752441,
"learning_rate": 3.2230997038499506e-05,
"loss": 0.2696,
"step": 1800
},
{
"epoch": 1.79,
"grad_norm": 5.7437567710876465,
"learning_rate": 3.213228035538006e-05,
"loss": 0.3771,
"step": 1810
},
{
"epoch": 1.8,
"grad_norm": 209.53298950195312,
"learning_rate": 3.203356367226061e-05,
"loss": 0.3023,
"step": 1820
},
{
"epoch": 1.81,
"grad_norm": 1.2472151517868042,
"learning_rate": 3.1934846989141166e-05,
"loss": 0.29,
"step": 1830
},
{
"epoch": 1.82,
"grad_norm": 370.38800048828125,
"learning_rate": 3.1836130306021715e-05,
"loss": 0.3409,
"step": 1840
},
{
"epoch": 1.83,
"grad_norm": 145.07717895507812,
"learning_rate": 3.173741362290227e-05,
"loss": 0.3839,
"step": 1850
},
{
"epoch": 1.84,
"grad_norm": 48.441585540771484,
"learning_rate": 3.1638696939782826e-05,
"loss": 0.2765,
"step": 1860
},
{
"epoch": 1.85,
"grad_norm": 8.114079475402832,
"learning_rate": 3.1539980256663375e-05,
"loss": 0.4797,
"step": 1870
},
{
"epoch": 1.86,
"grad_norm": 2.0335161685943604,
"learning_rate": 3.144126357354393e-05,
"loss": 0.3283,
"step": 1880
},
{
"epoch": 1.87,
"grad_norm": 571.5001831054688,
"learning_rate": 3.1342546890424486e-05,
"loss": 0.3749,
"step": 1890
},
{
"epoch": 1.88,
"grad_norm": 26.4891414642334,
"learning_rate": 3.1243830207305035e-05,
"loss": 0.5855,
"step": 1900
},
{
"epoch": 1.89,
"grad_norm": 373.5781555175781,
"learning_rate": 3.114511352418559e-05,
"loss": 0.5779,
"step": 1910
},
{
"epoch": 1.9,
"grad_norm": 12.153056144714355,
"learning_rate": 3.1046396841066146e-05,
"loss": 0.5083,
"step": 1920
},
{
"epoch": 1.91,
"grad_norm": 246.8365936279297,
"learning_rate": 3.0947680157946695e-05,
"loss": 0.2794,
"step": 1930
},
{
"epoch": 1.92,
"grad_norm": 16.38204002380371,
"learning_rate": 3.084896347482725e-05,
"loss": 0.1689,
"step": 1940
},
{
"epoch": 1.92,
"grad_norm": 86.90618896484375,
"learning_rate": 3.07502467917078e-05,
"loss": 0.124,
"step": 1950
},
{
"epoch": 1.93,
"grad_norm": 13.767565727233887,
"learning_rate": 3.0651530108588355e-05,
"loss": 0.4286,
"step": 1960
},
{
"epoch": 1.94,
"grad_norm": 25.554912567138672,
"learning_rate": 3.0552813425468904e-05,
"loss": 0.342,
"step": 1970
},
{
"epoch": 1.95,
"grad_norm": 0.8774542212486267,
"learning_rate": 3.045409674234946e-05,
"loss": 0.2024,
"step": 1980
},
{
"epoch": 1.96,
"grad_norm": 53.33576583862305,
"learning_rate": 3.0355380059230008e-05,
"loss": 0.3032,
"step": 1990
},
{
"epoch": 1.97,
"grad_norm": 6.252757549285889,
"learning_rate": 3.0256663376110567e-05,
"loss": 0.3633,
"step": 2000
},
{
"epoch": 1.98,
"grad_norm": 0.8560687899589539,
"learning_rate": 3.015794669299112e-05,
"loss": 0.2662,
"step": 2010
},
{
"epoch": 1.99,
"grad_norm": 5.560061454772949,
"learning_rate": 3.0059230009871668e-05,
"loss": 0.3057,
"step": 2020
},
{
"epoch": 2.0,
"eval_balanced accuracy": 0.9113459399332592,
"eval_f1": 0.9119394500117044,
"eval_loss": 0.29854172468185425,
"eval_precision": 0.913535516192521,
"eval_recall": 0.9113459399332592,
"eval_runtime": 5.6113,
"eval_samples_per_second": 160.392,
"eval_steps_per_second": 10.158,
"step": 2026
}
],
"logging_steps": 10,
"max_steps": 5065,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"total_flos": 96787312128000.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}