|
{ |
|
"best_metric": 0.29854172468185425, |
|
"best_model_checkpoint": "./results/checkpoint-2026", |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 2026, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 15.60984992980957, |
|
"learning_rate": 4.990128331688055e-05, |
|
"loss": 0.5627, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.2870399951934814, |
|
"learning_rate": 4.9802566633761114e-05, |
|
"loss": 0.2714, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 123.16035461425781, |
|
"learning_rate": 4.970384995064166e-05, |
|
"loss": 0.5078, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 11.994287490844727, |
|
"learning_rate": 4.960513326752221e-05, |
|
"loss": 0.4197, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.41953182220459, |
|
"learning_rate": 4.950641658440277e-05, |
|
"loss": 0.441, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 26.95296287536621, |
|
"learning_rate": 4.940769990128332e-05, |
|
"loss": 0.6958, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 28.05646324157715, |
|
"learning_rate": 4.930898321816387e-05, |
|
"loss": 0.3109, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 28.755550384521484, |
|
"learning_rate": 4.921026653504443e-05, |
|
"loss": 0.5321, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 46.09171676635742, |
|
"learning_rate": 4.9111549851924976e-05, |
|
"loss": 0.4216, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.234527587890625, |
|
"learning_rate": 4.901283316880553e-05, |
|
"loss": 0.3427, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 36.78240966796875, |
|
"learning_rate": 4.891411648568609e-05, |
|
"loss": 0.5259, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.4952964782714844, |
|
"learning_rate": 4.8815399802566636e-05, |
|
"loss": 0.525, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 6.107447147369385, |
|
"learning_rate": 4.8716683119447184e-05, |
|
"loss": 0.5349, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 17.599472045898438, |
|
"learning_rate": 4.861796643632775e-05, |
|
"loss": 0.3194, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 67.53023529052734, |
|
"learning_rate": 4.8519249753208296e-05, |
|
"loss": 0.4738, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 61.95085525512695, |
|
"learning_rate": 4.8420533070088844e-05, |
|
"loss": 0.4151, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 377.9793701171875, |
|
"learning_rate": 4.83218163869694e-05, |
|
"loss": 0.3219, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 9.71474838256836, |
|
"learning_rate": 4.8223099703849955e-05, |
|
"loss": 0.2931, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 54.442691802978516, |
|
"learning_rate": 4.8124383020730504e-05, |
|
"loss": 0.3802, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 111.41837310791016, |
|
"learning_rate": 4.802566633761106e-05, |
|
"loss": 0.4909, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 28.207542419433594, |
|
"learning_rate": 4.792694965449161e-05, |
|
"loss": 0.392, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 65.766357421875, |
|
"learning_rate": 4.7828232971372164e-05, |
|
"loss": 0.3002, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 127.14469909667969, |
|
"learning_rate": 4.772951628825272e-05, |
|
"loss": 0.3654, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.19254250824451447, |
|
"learning_rate": 4.763079960513327e-05, |
|
"loss": 0.2953, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 30.106840133666992, |
|
"learning_rate": 4.753208292201382e-05, |
|
"loss": 0.4019, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.884279727935791, |
|
"learning_rate": 4.743336623889438e-05, |
|
"loss": 0.4152, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 71.0513916015625, |
|
"learning_rate": 4.733464955577493e-05, |
|
"loss": 0.6281, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.381753921508789, |
|
"learning_rate": 4.723593287265548e-05, |
|
"loss": 0.3224, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.0361205339431763, |
|
"learning_rate": 4.713721618953603e-05, |
|
"loss": 0.6375, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.2360197305679321, |
|
"learning_rate": 4.703849950641659e-05, |
|
"loss": 0.419, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 188.26495361328125, |
|
"learning_rate": 4.693978282329714e-05, |
|
"loss": 0.5303, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.14256739616394043, |
|
"learning_rate": 4.684106614017769e-05, |
|
"loss": 0.2615, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 63.93450927734375, |
|
"learning_rate": 4.674234945705824e-05, |
|
"loss": 0.4678, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 31.07522201538086, |
|
"learning_rate": 4.66436327739388e-05, |
|
"loss": 0.607, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 12.982345581054688, |
|
"learning_rate": 4.654491609081935e-05, |
|
"loss": 0.189, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 14.37088394165039, |
|
"learning_rate": 4.64461994076999e-05, |
|
"loss": 0.4341, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 8.185881614685059, |
|
"learning_rate": 4.634748272458046e-05, |
|
"loss": 0.5132, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.082980751991272, |
|
"learning_rate": 4.624876604146101e-05, |
|
"loss": 0.3491, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 12.5576753616333, |
|
"learning_rate": 4.615004935834156e-05, |
|
"loss": 0.7587, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 5.933102130889893, |
|
"learning_rate": 4.605133267522212e-05, |
|
"loss": 0.579, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.3454967737197876, |
|
"learning_rate": 4.5952615992102666e-05, |
|
"loss": 0.1932, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 17.171228408813477, |
|
"learning_rate": 4.585389930898322e-05, |
|
"loss": 0.3217, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 3.5590412616729736, |
|
"learning_rate": 4.575518262586377e-05, |
|
"loss": 0.2279, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 17.661069869995117, |
|
"learning_rate": 4.5656465942744326e-05, |
|
"loss": 0.2676, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 48.93571853637695, |
|
"learning_rate": 4.5557749259624875e-05, |
|
"loss": 0.5449, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 12.7286376953125, |
|
"learning_rate": 4.545903257650543e-05, |
|
"loss": 0.5127, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 51.88860321044922, |
|
"learning_rate": 4.5360315893385986e-05, |
|
"loss": 0.4794, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 18.063552856445312, |
|
"learning_rate": 4.5261599210266535e-05, |
|
"loss": 0.3728, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 2.861877918243408, |
|
"learning_rate": 4.516288252714709e-05, |
|
"loss": 0.3038, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 6.476074695587158, |
|
"learning_rate": 4.5064165844027646e-05, |
|
"loss": 0.2592, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 62.48997497558594, |
|
"learning_rate": 4.4965449160908195e-05, |
|
"loss": 0.4779, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.5959272384643555, |
|
"learning_rate": 4.486673247778875e-05, |
|
"loss": 0.3865, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 32.949684143066406, |
|
"learning_rate": 4.47680157946693e-05, |
|
"loss": 0.5077, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 8.09738826751709, |
|
"learning_rate": 4.4669299111549855e-05, |
|
"loss": 0.3352, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 23.277297973632812, |
|
"learning_rate": 4.457058242843041e-05, |
|
"loss": 0.5204, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 89.32869720458984, |
|
"learning_rate": 4.447186574531096e-05, |
|
"loss": 0.3888, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 2.6795363426208496, |
|
"learning_rate": 4.437314906219151e-05, |
|
"loss": 0.5252, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 37.583744049072266, |
|
"learning_rate": 4.427443237907207e-05, |
|
"loss": 0.3881, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.256844162940979, |
|
"learning_rate": 4.417571569595262e-05, |
|
"loss": 0.1872, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 4.5737786293029785, |
|
"learning_rate": 4.407699901283317e-05, |
|
"loss": 0.2536, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 45.64347839355469, |
|
"learning_rate": 4.3978282329713724e-05, |
|
"loss": 0.3777, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.4227633774280548, |
|
"learning_rate": 4.387956564659428e-05, |
|
"loss": 0.2028, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 4.602664947509766, |
|
"learning_rate": 4.378084896347483e-05, |
|
"loss": 0.5563, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.7803702354431152, |
|
"learning_rate": 4.3682132280355384e-05, |
|
"loss": 0.3636, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 70.02734375, |
|
"learning_rate": 4.358341559723593e-05, |
|
"loss": 0.4558, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 39.45964050292969, |
|
"learning_rate": 4.348469891411649e-05, |
|
"loss": 0.4592, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 22.5675106048584, |
|
"learning_rate": 4.3385982230997044e-05, |
|
"loss": 0.3082, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 4.789850234985352, |
|
"learning_rate": 4.328726554787759e-05, |
|
"loss": 0.2404, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 4.671356678009033, |
|
"learning_rate": 4.318854886475814e-05, |
|
"loss": 0.2864, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.803113341331482, |
|
"learning_rate": 4.3089832181638704e-05, |
|
"loss": 0.2627, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.38143932819366455, |
|
"learning_rate": 4.299111549851925e-05, |
|
"loss": 0.1678, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.396694540977478, |
|
"learning_rate": 4.28923988153998e-05, |
|
"loss": 0.181, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 26.724634170532227, |
|
"learning_rate": 4.279368213228036e-05, |
|
"loss": 0.5595, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 179.3428497314453, |
|
"learning_rate": 4.269496544916091e-05, |
|
"loss": 0.3613, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 4.721936225891113, |
|
"learning_rate": 4.259624876604146e-05, |
|
"loss": 0.4182, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.8950241804122925, |
|
"learning_rate": 4.249753208292202e-05, |
|
"loss": 0.3623, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 5.388864994049072, |
|
"learning_rate": 4.2398815399802566e-05, |
|
"loss": 0.4246, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.41123124957084656, |
|
"learning_rate": 4.230009871668312e-05, |
|
"loss": 0.2425, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.3556106388568878, |
|
"learning_rate": 4.220138203356368e-05, |
|
"loss": 0.3751, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.899945080280304, |
|
"learning_rate": 4.2102665350444226e-05, |
|
"loss": 0.3994, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 4.583869934082031, |
|
"learning_rate": 4.2003948667324774e-05, |
|
"loss": 0.3681, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.3905455768108368, |
|
"learning_rate": 4.190523198420534e-05, |
|
"loss": 0.1491, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 36.8359260559082, |
|
"learning_rate": 4.1806515301085886e-05, |
|
"loss": 0.2609, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 34.53616714477539, |
|
"learning_rate": 4.1707798617966434e-05, |
|
"loss": 0.5495, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 14.104715347290039, |
|
"learning_rate": 4.160908193484699e-05, |
|
"loss": 0.33, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 30.295068740844727, |
|
"learning_rate": 4.1510365251727546e-05, |
|
"loss": 1.0008, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 93.3653793334961, |
|
"learning_rate": 4.1411648568608094e-05, |
|
"loss": 1.0401, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 114.31365966796875, |
|
"learning_rate": 4.131293188548865e-05, |
|
"loss": 0.4156, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 134.54774475097656, |
|
"learning_rate": 4.12142152023692e-05, |
|
"loss": 0.5463, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 3.021076202392578, |
|
"learning_rate": 4.1115498519249754e-05, |
|
"loss": 0.2947, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 9.884215354919434, |
|
"learning_rate": 4.101678183613031e-05, |
|
"loss": 0.3674, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 167.9898223876953, |
|
"learning_rate": 4.091806515301086e-05, |
|
"loss": 0.4516, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 34.41691207885742, |
|
"learning_rate": 4.0819348469891414e-05, |
|
"loss": 0.504, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 10.135024070739746, |
|
"learning_rate": 4.072063178677197e-05, |
|
"loss": 0.2834, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.0688509941101074, |
|
"learning_rate": 4.062191510365252e-05, |
|
"loss": 0.3188, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 5.052711009979248, |
|
"learning_rate": 4.052319842053307e-05, |
|
"loss": 0.3693, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.37648436427116394, |
|
"learning_rate": 4.042448173741363e-05, |
|
"loss": 0.1054, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 18.3348445892334, |
|
"learning_rate": 4.032576505429418e-05, |
|
"loss": 0.3397, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 10.808074951171875, |
|
"learning_rate": 4.022704837117473e-05, |
|
"loss": 0.3628, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 141.88064575195312, |
|
"learning_rate": 4.012833168805528e-05, |
|
"loss": 0.9269, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.555182695388794, |
|
"learning_rate": 4.002961500493584e-05, |
|
"loss": 0.1197, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_balanced accuracy": 0.917760474601409, |
|
"eval_f1": 0.9176981176842771, |
|
"eval_loss": 0.40740078687667847, |
|
"eval_precision": 0.9176448492816227, |
|
"eval_recall": 0.917760474601409, |
|
"eval_runtime": 5.5647, |
|
"eval_samples_per_second": 161.733, |
|
"eval_steps_per_second": 10.243, |
|
"step": 1013 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 4.549361228942871, |
|
"learning_rate": 3.993089832181639e-05, |
|
"loss": 0.5231, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 5.699501991271973, |
|
"learning_rate": 3.983218163869694e-05, |
|
"loss": 0.4139, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.1153147220611572, |
|
"learning_rate": 3.973346495557749e-05, |
|
"loss": 0.2718, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 5.258866310119629, |
|
"learning_rate": 3.963474827245805e-05, |
|
"loss": 0.3115, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 13.351494789123535, |
|
"learning_rate": 3.95360315893386e-05, |
|
"loss": 0.3992, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 9.7189359664917, |
|
"learning_rate": 3.943731490621915e-05, |
|
"loss": 0.1346, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 5.006288051605225, |
|
"learning_rate": 3.933859822309971e-05, |
|
"loss": 0.3118, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 7.094489574432373, |
|
"learning_rate": 3.923988153998026e-05, |
|
"loss": 0.1807, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 4.784492492675781, |
|
"learning_rate": 3.914116485686081e-05, |
|
"loss": 0.3839, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 1.5643423795700073, |
|
"learning_rate": 3.904244817374136e-05, |
|
"loss": 0.1729, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 33.595703125, |
|
"learning_rate": 3.8943731490621916e-05, |
|
"loss": 0.1749, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.5887395143508911, |
|
"learning_rate": 3.884501480750247e-05, |
|
"loss": 0.2513, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 22.53057289123535, |
|
"learning_rate": 3.874629812438302e-05, |
|
"loss": 0.2858, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 52.66212463378906, |
|
"learning_rate": 3.8647581441263576e-05, |
|
"loss": 0.1328, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 5.8826117515563965, |
|
"learning_rate": 3.8548864758144125e-05, |
|
"loss": 0.3296, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 10.208854675292969, |
|
"learning_rate": 3.845014807502468e-05, |
|
"loss": 0.1743, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 5.222922325134277, |
|
"learning_rate": 3.8351431391905236e-05, |
|
"loss": 0.2482, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.3885471224784851, |
|
"learning_rate": 3.8252714708785785e-05, |
|
"loss": 0.3651, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 68.36416625976562, |
|
"learning_rate": 3.815399802566634e-05, |
|
"loss": 0.5256, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 103.91950988769531, |
|
"learning_rate": 3.8055281342546896e-05, |
|
"loss": 0.2199, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.17333897948265076, |
|
"learning_rate": 3.7956564659427445e-05, |
|
"loss": 0.126, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 41.487117767333984, |
|
"learning_rate": 3.7857847976308e-05, |
|
"loss": 0.2293, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.1527445763349533, |
|
"learning_rate": 3.775913129318855e-05, |
|
"loss": 0.2754, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.3720811605453491, |
|
"learning_rate": 3.7660414610069105e-05, |
|
"loss": 0.1904, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.2801426947116852, |
|
"learning_rate": 3.756169792694966e-05, |
|
"loss": 0.2894, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.912218451499939, |
|
"learning_rate": 3.746298124383021e-05, |
|
"loss": 0.4345, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.25501587986946106, |
|
"learning_rate": 3.736426456071076e-05, |
|
"loss": 0.2249, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 19.25888442993164, |
|
"learning_rate": 3.7265547877591314e-05, |
|
"loss": 0.4532, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 7.447415351867676, |
|
"learning_rate": 3.716683119447187e-05, |
|
"loss": 0.419, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 1.2623952627182007, |
|
"learning_rate": 3.706811451135242e-05, |
|
"loss": 0.3596, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 49.27845001220703, |
|
"learning_rate": 3.6969397828232974e-05, |
|
"loss": 0.1807, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 8.055280685424805, |
|
"learning_rate": 3.687068114511353e-05, |
|
"loss": 0.1877, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.24801558256149292, |
|
"learning_rate": 3.677196446199408e-05, |
|
"loss": 0.1906, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.37148603796958923, |
|
"learning_rate": 3.6673247778874634e-05, |
|
"loss": 0.6613, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 2.0603933334350586, |
|
"learning_rate": 3.657453109575518e-05, |
|
"loss": 0.1717, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 1.4730746746063232, |
|
"learning_rate": 3.647581441263574e-05, |
|
"loss": 0.3606, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 11.129170417785645, |
|
"learning_rate": 3.6377097729516294e-05, |
|
"loss": 0.4668, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 107.76866912841797, |
|
"learning_rate": 3.627838104639684e-05, |
|
"loss": 0.4248, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.4574478566646576, |
|
"learning_rate": 3.617966436327739e-05, |
|
"loss": 0.2463, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 9.523133277893066, |
|
"learning_rate": 3.6080947680157954e-05, |
|
"loss": 0.2986, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 724.2791137695312, |
|
"learning_rate": 3.59822309970385e-05, |
|
"loss": 0.1994, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.495822012424469, |
|
"learning_rate": 3.588351431391905e-05, |
|
"loss": 0.405, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.7077971696853638, |
|
"learning_rate": 3.578479763079961e-05, |
|
"loss": 0.3258, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.471545934677124, |
|
"learning_rate": 3.568608094768016e-05, |
|
"loss": 0.3381, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 160.64279174804688, |
|
"learning_rate": 3.558736426456071e-05, |
|
"loss": 0.4319, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 213.93475341796875, |
|
"learning_rate": 3.548864758144127e-05, |
|
"loss": 0.3506, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.5124903917312622, |
|
"learning_rate": 3.5389930898321816e-05, |
|
"loss": 0.261, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.2033979296684265, |
|
"learning_rate": 3.529121421520237e-05, |
|
"loss": 0.3329, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.14042626321315765, |
|
"learning_rate": 3.519249753208293e-05, |
|
"loss": 0.198, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.052474942058324814, |
|
"learning_rate": 3.5093780848963476e-05, |
|
"loss": 0.3291, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.7498096823692322, |
|
"learning_rate": 3.4995064165844024e-05, |
|
"loss": 0.5893, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 56.467071533203125, |
|
"learning_rate": 3.489634748272459e-05, |
|
"loss": 0.22, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 5.047154903411865, |
|
"learning_rate": 3.4797630799605136e-05, |
|
"loss": 0.3128, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.24173791706562042, |
|
"learning_rate": 3.4698914116485684e-05, |
|
"loss": 0.2632, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.23745213449001312, |
|
"learning_rate": 3.460019743336624e-05, |
|
"loss": 0.1316, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.3697431683540344, |
|
"learning_rate": 3.4501480750246796e-05, |
|
"loss": 0.3162, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 125.36990356445312, |
|
"learning_rate": 3.4402764067127344e-05, |
|
"loss": 0.7252, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 30.01531410217285, |
|
"learning_rate": 3.43040473840079e-05, |
|
"loss": 0.567, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 44.524818420410156, |
|
"learning_rate": 3.420533070088845e-05, |
|
"loss": 0.4531, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 133.4363555908203, |
|
"learning_rate": 3.4106614017769004e-05, |
|
"loss": 0.4438, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1119.47509765625, |
|
"learning_rate": 3.400789733464956e-05, |
|
"loss": 0.3973, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 4.369329929351807, |
|
"learning_rate": 3.390918065153011e-05, |
|
"loss": 0.482, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 28.413909912109375, |
|
"learning_rate": 3.381046396841066e-05, |
|
"loss": 0.3454, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 76.58002471923828, |
|
"learning_rate": 3.371174728529122e-05, |
|
"loss": 0.2663, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 597.3102416992188, |
|
"learning_rate": 3.361303060217177e-05, |
|
"loss": 0.155, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 24.984447479248047, |
|
"learning_rate": 3.351431391905232e-05, |
|
"loss": 0.2535, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 30.53813934326172, |
|
"learning_rate": 3.341559723593287e-05, |
|
"loss": 0.315, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 1.5513701438903809, |
|
"learning_rate": 3.331688055281343e-05, |
|
"loss": 0.3617, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 3.676360845565796, |
|
"learning_rate": 3.321816386969398e-05, |
|
"loss": 0.6472, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 23.96689796447754, |
|
"learning_rate": 3.311944718657453e-05, |
|
"loss": 0.5382, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 18.116992950439453, |
|
"learning_rate": 3.302073050345508e-05, |
|
"loss": 0.345, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 4.786412239074707, |
|
"learning_rate": 3.292201382033564e-05, |
|
"loss": 0.3599, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.5227644443511963, |
|
"learning_rate": 3.282329713721619e-05, |
|
"loss": 0.4313, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 4.462274074554443, |
|
"learning_rate": 3.272458045409674e-05, |
|
"loss": 0.5479, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 49.19129180908203, |
|
"learning_rate": 3.26258637709773e-05, |
|
"loss": 0.5215, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 89.65460968017578, |
|
"learning_rate": 3.252714708785785e-05, |
|
"loss": 0.7555, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.9293081760406494, |
|
"learning_rate": 3.24284304047384e-05, |
|
"loss": 0.3071, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 11.949310302734375, |
|
"learning_rate": 3.232971372161895e-05, |
|
"loss": 0.2182, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 15.446320533752441, |
|
"learning_rate": 3.2230997038499506e-05, |
|
"loss": 0.2696, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 5.7437567710876465, |
|
"learning_rate": 3.213228035538006e-05, |
|
"loss": 0.3771, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 209.53298950195312, |
|
"learning_rate": 3.203356367226061e-05, |
|
"loss": 0.3023, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 1.2472151517868042, |
|
"learning_rate": 3.1934846989141166e-05, |
|
"loss": 0.29, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 370.38800048828125, |
|
"learning_rate": 3.1836130306021715e-05, |
|
"loss": 0.3409, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 145.07717895507812, |
|
"learning_rate": 3.173741362290227e-05, |
|
"loss": 0.3839, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 48.441585540771484, |
|
"learning_rate": 3.1638696939782826e-05, |
|
"loss": 0.2765, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 8.114079475402832, |
|
"learning_rate": 3.1539980256663375e-05, |
|
"loss": 0.4797, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.0335161685943604, |
|
"learning_rate": 3.144126357354393e-05, |
|
"loss": 0.3283, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 571.5001831054688, |
|
"learning_rate": 3.1342546890424486e-05, |
|
"loss": 0.3749, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 26.4891414642334, |
|
"learning_rate": 3.1243830207305035e-05, |
|
"loss": 0.5855, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 373.5781555175781, |
|
"learning_rate": 3.114511352418559e-05, |
|
"loss": 0.5779, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 12.153056144714355, |
|
"learning_rate": 3.1046396841066146e-05, |
|
"loss": 0.5083, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 246.8365936279297, |
|
"learning_rate": 3.0947680157946695e-05, |
|
"loss": 0.2794, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 16.38204002380371, |
|
"learning_rate": 3.084896347482725e-05, |
|
"loss": 0.1689, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 86.90618896484375, |
|
"learning_rate": 3.07502467917078e-05, |
|
"loss": 0.124, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 13.767565727233887, |
|
"learning_rate": 3.0651530108588355e-05, |
|
"loss": 0.4286, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 25.554912567138672, |
|
"learning_rate": 3.0552813425468904e-05, |
|
"loss": 0.342, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.8774542212486267, |
|
"learning_rate": 3.045409674234946e-05, |
|
"loss": 0.2024, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 53.33576583862305, |
|
"learning_rate": 3.0355380059230008e-05, |
|
"loss": 0.3032, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 6.252757549285889, |
|
"learning_rate": 3.0256663376110567e-05, |
|
"loss": 0.3633, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.8560687899589539, |
|
"learning_rate": 3.015794669299112e-05, |
|
"loss": 0.2662, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 5.560061454772949, |
|
"learning_rate": 3.0059230009871668e-05, |
|
"loss": 0.3057, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_balanced accuracy": 0.9113459399332592, |
|
"eval_f1": 0.9119394500117044, |
|
"eval_loss": 0.29854172468185425, |
|
"eval_precision": 0.913535516192521, |
|
"eval_recall": 0.9113459399332592, |
|
"eval_runtime": 5.6113, |
|
"eval_samples_per_second": 160.392, |
|
"eval_steps_per_second": 10.158, |
|
"step": 2026 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5065, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"total_flos": 96787312128000.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|