{ "best_metric": 0.9832942485809326, "best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/google_t5/t5_small_patent/checkpoint-1400", "epoch": 3.0, "eval_steps": 50, "global_step": 2346, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 2.1583292484283447, "learning_rate": 0.0004978687127024723, "loss": 2.0566, "step": 10 }, { "epoch": 0.03, "grad_norm": 1.600330114364624, "learning_rate": 0.0004957374254049446, "loss": 1.9478, "step": 20 }, { "epoch": 0.04, "grad_norm": 1.930787444114685, "learning_rate": 0.0004936061381074169, "loss": 1.8597, "step": 30 }, { "epoch": 0.05, "grad_norm": 1.5029382705688477, "learning_rate": 0.0004914748508098892, "loss": 1.5716, "step": 40 }, { "epoch": 0.06, "grad_norm": 2.0470213890075684, "learning_rate": 0.0004893435635123615, "loss": 1.4814, "step": 50 }, { "epoch": 0.06, "eval_accuracy": 0.492, "eval_f1_macro": 0.34138249695786205, "eval_f1_micro": 0.492, "eval_loss": 1.4703689813613892, "eval_runtime": 4.5651, "eval_samples_per_second": 1095.26, "eval_steps_per_second": 34.391, "step": 50 }, { "epoch": 0.08, "grad_norm": 2.6178905963897705, "learning_rate": 0.00048721227621483377, "loss": 1.3366, "step": 60 }, { "epoch": 0.09, "grad_norm": 2.6837921142578125, "learning_rate": 0.00048508098891730605, "loss": 1.3933, "step": 70 }, { "epoch": 0.1, "grad_norm": 2.290954828262329, "learning_rate": 0.0004829497016197784, "loss": 1.3195, "step": 80 }, { "epoch": 0.12, "grad_norm": 2.3305504322052, "learning_rate": 0.00048081841432225065, "loss": 1.3749, "step": 90 }, { "epoch": 0.13, "grad_norm": 1.9327062368392944, "learning_rate": 0.0004786871270247229, "loss": 1.3003, "step": 100 }, { "epoch": 0.13, "eval_accuracy": 0.5512, "eval_f1_macro": 0.3876411779838728, "eval_f1_micro": 0.5512, "eval_loss": 1.2652499675750732, "eval_runtime": 4.5124, "eval_samples_per_second": 1108.067, "eval_steps_per_second": 34.793, "step": 100 }, { "epoch": 0.14, "grad_norm": 1.8468806743621826, "learning_rate": 0.00047655583972719526, "loss": 1.3399, "step": 110 }, { "epoch": 0.15, "grad_norm": 2.5669147968292236, "learning_rate": 0.00047442455242966753, "loss": 1.2677, "step": 120 }, { "epoch": 0.17, "grad_norm": 2.0333309173583984, "learning_rate": 0.0004722932651321398, "loss": 1.264, "step": 130 }, { "epoch": 0.18, "grad_norm": 2.6710681915283203, "learning_rate": 0.00047016197783461214, "loss": 1.2956, "step": 140 }, { "epoch": 0.19, "grad_norm": 1.8155871629714966, "learning_rate": 0.0004680306905370844, "loss": 1.2291, "step": 150 }, { "epoch": 0.19, "eval_accuracy": 0.563, "eval_f1_macro": 0.3999749460001978, "eval_f1_micro": 0.563, "eval_loss": 1.2304304838180542, "eval_runtime": 4.58, "eval_samples_per_second": 1091.714, "eval_steps_per_second": 34.28, "step": 150 }, { "epoch": 0.2, "grad_norm": 2.5728886127471924, "learning_rate": 0.0004658994032395567, "loss": 1.3538, "step": 160 }, { "epoch": 0.22, "grad_norm": 1.9312357902526855, "learning_rate": 0.000463768115942029, "loss": 1.3509, "step": 170 }, { "epoch": 0.23, "grad_norm": 2.0137836933135986, "learning_rate": 0.0004616368286445013, "loss": 1.2469, "step": 180 }, { "epoch": 0.24, "grad_norm": 2.5608091354370117, "learning_rate": 0.00045950554134697357, "loss": 1.2728, "step": 190 }, { "epoch": 0.26, "grad_norm": 2.511885404586792, "learning_rate": 0.0004573742540494459, "loss": 1.142, "step": 200 }, { "epoch": 0.26, "eval_accuracy": 0.5894, "eval_f1_macro": 0.45857413022807464, "eval_f1_micro": 0.5894, "eval_loss": 1.164367914199829, "eval_runtime": 4.5868, "eval_samples_per_second": 1090.093, "eval_steps_per_second": 34.229, "step": 200 }, { "epoch": 0.27, "grad_norm": 2.0801782608032227, "learning_rate": 0.0004552429667519182, "loss": 1.1444, "step": 210 }, { "epoch": 0.28, "grad_norm": 2.706130266189575, "learning_rate": 0.00045311167945439045, "loss": 1.2788, "step": 220 }, { "epoch": 0.29, "grad_norm": 2.2432401180267334, "learning_rate": 0.0004509803921568628, "loss": 1.2085, "step": 230 }, { "epoch": 0.31, "grad_norm": 2.283546209335327, "learning_rate": 0.00044884910485933505, "loss": 1.1881, "step": 240 }, { "epoch": 0.32, "grad_norm": 2.2058613300323486, "learning_rate": 0.00044671781756180733, "loss": 1.0581, "step": 250 }, { "epoch": 0.32, "eval_accuracy": 0.603, "eval_f1_macro": 0.4563218777873921, "eval_f1_micro": 0.603, "eval_loss": 1.1396372318267822, "eval_runtime": 4.6256, "eval_samples_per_second": 1080.947, "eval_steps_per_second": 33.942, "step": 250 }, { "epoch": 0.33, "grad_norm": 2.8488962650299072, "learning_rate": 0.00044458653026427966, "loss": 1.1765, "step": 260 }, { "epoch": 0.35, "grad_norm": 2.4596290588378906, "learning_rate": 0.00044245524296675193, "loss": 1.1927, "step": 270 }, { "epoch": 0.36, "grad_norm": 2.053030014038086, "learning_rate": 0.0004403239556692242, "loss": 1.0985, "step": 280 }, { "epoch": 0.37, "grad_norm": 2.664118766784668, "learning_rate": 0.00043819266837169654, "loss": 1.1782, "step": 290 }, { "epoch": 0.38, "grad_norm": 2.8193089962005615, "learning_rate": 0.0004360613810741688, "loss": 1.2415, "step": 300 }, { "epoch": 0.38, "eval_accuracy": 0.613, "eval_f1_macro": 0.4936808407307906, "eval_f1_micro": 0.613, "eval_loss": 1.1215277910232544, "eval_runtime": 4.6437, "eval_samples_per_second": 1076.718, "eval_steps_per_second": 33.809, "step": 300 }, { "epoch": 0.4, "grad_norm": 2.5075747966766357, "learning_rate": 0.0004339300937766411, "loss": 1.1684, "step": 310 }, { "epoch": 0.41, "grad_norm": 2.044328212738037, "learning_rate": 0.0004317988064791134, "loss": 1.0177, "step": 320 }, { "epoch": 0.42, "grad_norm": 2.432016611099243, "learning_rate": 0.0004296675191815857, "loss": 1.2053, "step": 330 }, { "epoch": 0.43, "grad_norm": 1.8879696130752563, "learning_rate": 0.00042753623188405797, "loss": 1.1831, "step": 340 }, { "epoch": 0.45, "grad_norm": 2.592384099960327, "learning_rate": 0.0004254049445865303, "loss": 1.1336, "step": 350 }, { "epoch": 0.45, "eval_accuracy": 0.6172, "eval_f1_macro": 0.5291747385702272, "eval_f1_micro": 0.6172, "eval_loss": 1.1041539907455444, "eval_runtime": 4.6545, "eval_samples_per_second": 1074.222, "eval_steps_per_second": 33.731, "step": 350 }, { "epoch": 0.46, "grad_norm": 2.692321538925171, "learning_rate": 0.0004232736572890026, "loss": 1.1718, "step": 360 }, { "epoch": 0.47, "grad_norm": 1.972010612487793, "learning_rate": 0.00042114236999147485, "loss": 1.2379, "step": 370 }, { "epoch": 0.49, "grad_norm": 2.035459041595459, "learning_rate": 0.0004190110826939472, "loss": 1.219, "step": 380 }, { "epoch": 0.5, "grad_norm": 3.327613115310669, "learning_rate": 0.00041687979539641946, "loss": 1.2285, "step": 390 }, { "epoch": 0.51, "grad_norm": 2.2528626918792725, "learning_rate": 0.00041474850809889173, "loss": 1.045, "step": 400 }, { "epoch": 0.51, "eval_accuracy": 0.624, "eval_f1_macro": 0.5271110605500514, "eval_f1_micro": 0.624, "eval_loss": 1.092409610748291, "eval_runtime": 4.6526, "eval_samples_per_second": 1074.679, "eval_steps_per_second": 33.745, "step": 400 }, { "epoch": 0.52, "grad_norm": 3.0680856704711914, "learning_rate": 0.00041261722080136406, "loss": 1.1587, "step": 410 }, { "epoch": 0.54, "grad_norm": 2.2462947368621826, "learning_rate": 0.00041048593350383634, "loss": 1.1927, "step": 420 }, { "epoch": 0.55, "grad_norm": 2.637155771255493, "learning_rate": 0.0004083546462063086, "loss": 1.1396, "step": 430 }, { "epoch": 0.56, "grad_norm": 2.410139799118042, "learning_rate": 0.00040622335890878094, "loss": 1.1035, "step": 440 }, { "epoch": 0.58, "grad_norm": 2.298185110092163, "learning_rate": 0.0004040920716112532, "loss": 1.1204, "step": 450 }, { "epoch": 0.58, "eval_accuracy": 0.6184, "eval_f1_macro": 0.5146174720519366, "eval_f1_micro": 0.6184, "eval_loss": 1.089658260345459, "eval_runtime": 4.6519, "eval_samples_per_second": 1074.824, "eval_steps_per_second": 33.749, "step": 450 }, { "epoch": 0.59, "grad_norm": 3.443023920059204, "learning_rate": 0.0004019607843137255, "loss": 1.1065, "step": 460 }, { "epoch": 0.6, "grad_norm": 2.6598963737487793, "learning_rate": 0.0003998294970161978, "loss": 1.143, "step": 470 }, { "epoch": 0.61, "grad_norm": 3.6134822368621826, "learning_rate": 0.0003976982097186701, "loss": 1.1093, "step": 480 }, { "epoch": 0.63, "grad_norm": 2.171005964279175, "learning_rate": 0.00039556692242114237, "loss": 1.2507, "step": 490 }, { "epoch": 0.64, "grad_norm": 2.519437551498413, "learning_rate": 0.0003934356351236147, "loss": 1.0691, "step": 500 }, { "epoch": 0.64, "eval_accuracy": 0.6236, "eval_f1_macro": 0.51689845984142, "eval_f1_micro": 0.6236, "eval_loss": 1.0826703310012817, "eval_runtime": 4.6344, "eval_samples_per_second": 1078.897, "eval_steps_per_second": 33.877, "step": 500 }, { "epoch": 0.65, "grad_norm": 3.738205671310425, "learning_rate": 0.000391304347826087, "loss": 1.1863, "step": 510 }, { "epoch": 0.66, "grad_norm": 2.0040698051452637, "learning_rate": 0.00038917306052855925, "loss": 1.1366, "step": 520 }, { "epoch": 0.68, "grad_norm": 2.250598907470703, "learning_rate": 0.0003870417732310316, "loss": 1.1372, "step": 530 }, { "epoch": 0.69, "grad_norm": 1.850501537322998, "learning_rate": 0.00038491048593350386, "loss": 1.0724, "step": 540 }, { "epoch": 0.7, "grad_norm": 2.0993905067443848, "learning_rate": 0.00038277919863597613, "loss": 0.9782, "step": 550 }, { "epoch": 0.7, "eval_accuracy": 0.6258, "eval_f1_macro": 0.5302739353593817, "eval_f1_micro": 0.6258, "eval_loss": 1.0663546323776245, "eval_runtime": 4.6409, "eval_samples_per_second": 1077.372, "eval_steps_per_second": 33.829, "step": 550 }, { "epoch": 0.72, "grad_norm": 2.872110605239868, "learning_rate": 0.00038064791133844846, "loss": 1.103, "step": 560 }, { "epoch": 0.73, "grad_norm": 2.1900711059570312, "learning_rate": 0.00037851662404092074, "loss": 1.1209, "step": 570 }, { "epoch": 0.74, "grad_norm": 2.3909690380096436, "learning_rate": 0.000376385336743393, "loss": 1.0616, "step": 580 }, { "epoch": 0.75, "grad_norm": 2.6566083431243896, "learning_rate": 0.00037425404944586534, "loss": 1.0967, "step": 590 }, { "epoch": 0.77, "grad_norm": 2.543503761291504, "learning_rate": 0.0003721227621483376, "loss": 1.081, "step": 600 }, { "epoch": 0.77, "eval_accuracy": 0.638, "eval_f1_macro": 0.5580934642391258, "eval_f1_micro": 0.638, "eval_loss": 1.0547568798065186, "eval_runtime": 4.6359, "eval_samples_per_second": 1078.528, "eval_steps_per_second": 33.866, "step": 600 }, { "epoch": 0.78, "grad_norm": 2.8259775638580322, "learning_rate": 0.0003699914748508099, "loss": 1.1526, "step": 610 }, { "epoch": 0.79, "grad_norm": 1.438499093055725, "learning_rate": 0.0003678601875532822, "loss": 1.0818, "step": 620 }, { "epoch": 0.81, "grad_norm": 2.1894028186798096, "learning_rate": 0.0003657289002557545, "loss": 1.1406, "step": 630 }, { "epoch": 0.82, "grad_norm": 2.273362159729004, "learning_rate": 0.0003635976129582268, "loss": 1.0805, "step": 640 }, { "epoch": 0.83, "grad_norm": 2.018456220626831, "learning_rate": 0.0003614663256606991, "loss": 1.1033, "step": 650 }, { "epoch": 0.83, "eval_accuracy": 0.6398, "eval_f1_macro": 0.5593218396680832, "eval_f1_micro": 0.6398, "eval_loss": 1.0300291776657104, "eval_runtime": 4.6355, "eval_samples_per_second": 1078.633, "eval_steps_per_second": 33.869, "step": 650 }, { "epoch": 0.84, "grad_norm": 2.58931303024292, "learning_rate": 0.0003593350383631714, "loss": 1.0768, "step": 660 }, { "epoch": 0.86, "grad_norm": 2.5274384021759033, "learning_rate": 0.00035720375106564365, "loss": 1.1398, "step": 670 }, { "epoch": 0.87, "grad_norm": 1.7806419134140015, "learning_rate": 0.000355072463768116, "loss": 1.1315, "step": 680 }, { "epoch": 0.88, "grad_norm": 2.8781118392944336, "learning_rate": 0.00035294117647058826, "loss": 1.2041, "step": 690 }, { "epoch": 0.9, "grad_norm": 1.604295253753662, "learning_rate": 0.00035080988917306053, "loss": 1.0946, "step": 700 }, { "epoch": 0.9, "eval_accuracy": 0.632, "eval_f1_macro": 0.5544640143725398, "eval_f1_micro": 0.632, "eval_loss": 1.0619752407073975, "eval_runtime": 4.6258, "eval_samples_per_second": 1080.883, "eval_steps_per_second": 33.94, "step": 700 }, { "epoch": 0.91, "grad_norm": 1.7683440446853638, "learning_rate": 0.00034867860187553286, "loss": 1.1201, "step": 710 }, { "epoch": 0.92, "grad_norm": 1.7039743661880493, "learning_rate": 0.00034654731457800514, "loss": 1.1598, "step": 720 }, { "epoch": 0.93, "grad_norm": 2.3733344078063965, "learning_rate": 0.0003444160272804774, "loss": 1.0286, "step": 730 }, { "epoch": 0.95, "grad_norm": 3.2192821502685547, "learning_rate": 0.00034228473998294974, "loss": 1.1397, "step": 740 }, { "epoch": 0.96, "grad_norm": 1.8821942806243896, "learning_rate": 0.000340153452685422, "loss": 1.0261, "step": 750 }, { "epoch": 0.96, "eval_accuracy": 0.6422, "eval_f1_macro": 0.5648413230502044, "eval_f1_micro": 0.6422, "eval_loss": 1.0328495502471924, "eval_runtime": 4.6303, "eval_samples_per_second": 1079.842, "eval_steps_per_second": 33.907, "step": 750 }, { "epoch": 0.97, "grad_norm": 3.0460636615753174, "learning_rate": 0.0003380221653878943, "loss": 1.0396, "step": 760 }, { "epoch": 0.98, "grad_norm": 1.969018816947937, "learning_rate": 0.0003358908780903666, "loss": 1.1069, "step": 770 }, { "epoch": 1.0, "grad_norm": 2.0290212631225586, "learning_rate": 0.0003337595907928389, "loss": 1.0933, "step": 780 }, { "epoch": 1.01, "grad_norm": 2.099224328994751, "learning_rate": 0.0003316283034953112, "loss": 0.9485, "step": 790 }, { "epoch": 1.02, "grad_norm": 1.5207663774490356, "learning_rate": 0.0003294970161977835, "loss": 0.9153, "step": 800 }, { "epoch": 1.02, "eval_accuracy": 0.6438, "eval_f1_macro": 0.5706285939519296, "eval_f1_micro": 0.6438, "eval_loss": 1.0378421545028687, "eval_runtime": 4.6333, "eval_samples_per_second": 1079.143, "eval_steps_per_second": 33.885, "step": 800 }, { "epoch": 1.04, "grad_norm": 2.9443259239196777, "learning_rate": 0.0003273657289002558, "loss": 1.0276, "step": 810 }, { "epoch": 1.05, "grad_norm": 2.0874528884887695, "learning_rate": 0.00032523444160272806, "loss": 0.8604, "step": 820 }, { "epoch": 1.06, "grad_norm": 2.344442367553711, "learning_rate": 0.0003231031543052004, "loss": 0.9834, "step": 830 }, { "epoch": 1.07, "grad_norm": 2.2925143241882324, "learning_rate": 0.0003209718670076726, "loss": 0.9142, "step": 840 }, { "epoch": 1.09, "grad_norm": 2.234575033187866, "learning_rate": 0.0003188405797101449, "loss": 0.9678, "step": 850 }, { "epoch": 1.09, "eval_accuracy": 0.6402, "eval_f1_macro": 0.5482744671072567, "eval_f1_micro": 0.6402, "eval_loss": 1.0520384311676025, "eval_runtime": 4.6291, "eval_samples_per_second": 1080.114, "eval_steps_per_second": 33.916, "step": 850 }, { "epoch": 1.1, "grad_norm": 2.2082581520080566, "learning_rate": 0.0003167092924126172, "loss": 0.9128, "step": 860 }, { "epoch": 1.11, "grad_norm": 2.396441698074341, "learning_rate": 0.0003145780051150895, "loss": 0.9842, "step": 870 }, { "epoch": 1.13, "grad_norm": 2.544588565826416, "learning_rate": 0.00031244671781756176, "loss": 0.899, "step": 880 }, { "epoch": 1.14, "grad_norm": 3.3907265663146973, "learning_rate": 0.0003103154305200341, "loss": 1.0218, "step": 890 }, { "epoch": 1.15, "grad_norm": 2.5349113941192627, "learning_rate": 0.00030818414322250637, "loss": 0.9619, "step": 900 }, { "epoch": 1.15, "eval_accuracy": 0.6408, "eval_f1_macro": 0.5593284960739974, "eval_f1_micro": 0.6408, "eval_loss": 1.0482654571533203, "eval_runtime": 4.5768, "eval_samples_per_second": 1092.472, "eval_steps_per_second": 34.304, "step": 900 }, { "epoch": 1.16, "grad_norm": 2.689525842666626, "learning_rate": 0.00030605285592497864, "loss": 0.9623, "step": 910 }, { "epoch": 1.18, "grad_norm": 2.4175920486450195, "learning_rate": 0.00030392156862745097, "loss": 0.9547, "step": 920 }, { "epoch": 1.19, "grad_norm": 2.392862319946289, "learning_rate": 0.00030179028132992325, "loss": 0.9661, "step": 930 }, { "epoch": 1.2, "grad_norm": 2.7447049617767334, "learning_rate": 0.0002996589940323955, "loss": 0.9644, "step": 940 }, { "epoch": 1.21, "grad_norm": 2.1388115882873535, "learning_rate": 0.00029752770673486785, "loss": 0.9972, "step": 950 }, { "epoch": 1.21, "eval_accuracy": 0.6496, "eval_f1_macro": 0.5684800364514564, "eval_f1_micro": 0.6496, "eval_loss": 1.0254615545272827, "eval_runtime": 4.6237, "eval_samples_per_second": 1081.385, "eval_steps_per_second": 33.955, "step": 950 }, { "epoch": 1.23, "grad_norm": 2.1175084114074707, "learning_rate": 0.00029539641943734013, "loss": 0.8829, "step": 960 }, { "epoch": 1.24, "grad_norm": 2.6198832988739014, "learning_rate": 0.0002932651321398124, "loss": 1.0612, "step": 970 }, { "epoch": 1.25, "grad_norm": 2.1012232303619385, "learning_rate": 0.00029113384484228473, "loss": 0.9605, "step": 980 }, { "epoch": 1.27, "grad_norm": 2.3467049598693848, "learning_rate": 0.000289002557544757, "loss": 0.9891, "step": 990 }, { "epoch": 1.28, "grad_norm": 3.222963809967041, "learning_rate": 0.0002868712702472293, "loss": 1.027, "step": 1000 }, { "epoch": 1.28, "eval_accuracy": 0.645, "eval_f1_macro": 0.5742246375864342, "eval_f1_micro": 0.645, "eval_loss": 1.0296003818511963, "eval_runtime": 4.632, "eval_samples_per_second": 1079.448, "eval_steps_per_second": 33.895, "step": 1000 }, { "epoch": 1.29, "grad_norm": 2.7652339935302734, "learning_rate": 0.0002847399829497016, "loss": 1.0417, "step": 1010 }, { "epoch": 1.3, "grad_norm": 2.1574389934539795, "learning_rate": 0.0002826086956521739, "loss": 0.9338, "step": 1020 }, { "epoch": 1.32, "grad_norm": 2.7982587814331055, "learning_rate": 0.00028047740835464616, "loss": 0.8798, "step": 1030 }, { "epoch": 1.33, "grad_norm": 3.097017526626587, "learning_rate": 0.0002783461210571185, "loss": 0.9041, "step": 1040 }, { "epoch": 1.34, "grad_norm": 2.7650091648101807, "learning_rate": 0.00027621483375959077, "loss": 0.8248, "step": 1050 }, { "epoch": 1.34, "eval_accuracy": 0.655, "eval_f1_macro": 0.5811778167583621, "eval_f1_micro": 0.655, "eval_loss": 1.0331394672393799, "eval_runtime": 4.5784, "eval_samples_per_second": 1092.075, "eval_steps_per_second": 34.291, "step": 1050 }, { "epoch": 1.36, "grad_norm": 2.5823092460632324, "learning_rate": 0.0002740835464620631, "loss": 1.1103, "step": 1060 }, { "epoch": 1.37, "grad_norm": 2.0402040481567383, "learning_rate": 0.0002719522591645354, "loss": 1.0841, "step": 1070 }, { "epoch": 1.38, "grad_norm": 3.023350238800049, "learning_rate": 0.00026982097186700765, "loss": 0.971, "step": 1080 }, { "epoch": 1.39, "grad_norm": 1.5279247760772705, "learning_rate": 0.00026768968456948, "loss": 0.9861, "step": 1090 }, { "epoch": 1.41, "grad_norm": 2.5153307914733887, "learning_rate": 0.00026555839727195225, "loss": 0.9405, "step": 1100 }, { "epoch": 1.41, "eval_accuracy": 0.6502, "eval_f1_macro": 0.5719450847979869, "eval_f1_micro": 0.6502, "eval_loss": 1.020789623260498, "eval_runtime": 4.5771, "eval_samples_per_second": 1092.404, "eval_steps_per_second": 34.301, "step": 1100 }, { "epoch": 1.42, "grad_norm": 2.871000289916992, "learning_rate": 0.00026342710997442453, "loss": 0.9694, "step": 1110 }, { "epoch": 1.43, "grad_norm": 2.492405414581299, "learning_rate": 0.00026129582267689686, "loss": 0.8878, "step": 1120 }, { "epoch": 1.45, "grad_norm": 2.770477056503296, "learning_rate": 0.00025916453537936913, "loss": 0.9456, "step": 1130 }, { "epoch": 1.46, "grad_norm": 7.3541340827941895, "learning_rate": 0.0002570332480818414, "loss": 0.9401, "step": 1140 }, { "epoch": 1.47, "grad_norm": 2.690429449081421, "learning_rate": 0.00025490196078431374, "loss": 0.9735, "step": 1150 }, { "epoch": 1.47, "eval_accuracy": 0.6388, "eval_f1_macro": 0.5743608505759401, "eval_f1_micro": 0.6388, "eval_loss": 1.0388753414154053, "eval_runtime": 4.6105, "eval_samples_per_second": 1084.485, "eval_steps_per_second": 34.053, "step": 1150 }, { "epoch": 1.48, "grad_norm": 3.4364089965820312, "learning_rate": 0.000252770673486786, "loss": 1.124, "step": 1160 }, { "epoch": 1.5, "grad_norm": 2.5237576961517334, "learning_rate": 0.0002506393861892583, "loss": 1.0429, "step": 1170 }, { "epoch": 1.51, "grad_norm": 2.4886770248413086, "learning_rate": 0.0002485080988917306, "loss": 0.9049, "step": 1180 }, { "epoch": 1.52, "grad_norm": 2.1807761192321777, "learning_rate": 0.0002463768115942029, "loss": 0.9292, "step": 1190 }, { "epoch": 1.53, "grad_norm": 2.830251455307007, "learning_rate": 0.0002442455242966752, "loss": 0.9566, "step": 1200 }, { "epoch": 1.53, "eval_accuracy": 0.658, "eval_f1_macro": 0.5749866859043913, "eval_f1_micro": 0.658, "eval_loss": 0.9963122010231018, "eval_runtime": 4.6173, "eval_samples_per_second": 1082.894, "eval_steps_per_second": 34.003, "step": 1200 }, { "epoch": 1.55, "grad_norm": 2.474280834197998, "learning_rate": 0.0002421142369991475, "loss": 0.8845, "step": 1210 }, { "epoch": 1.56, "grad_norm": 2.4428656101226807, "learning_rate": 0.0002399829497016198, "loss": 1.0546, "step": 1220 }, { "epoch": 1.57, "grad_norm": 2.1537678241729736, "learning_rate": 0.00023785166240409208, "loss": 0.9547, "step": 1230 }, { "epoch": 1.59, "grad_norm": 1.9569634199142456, "learning_rate": 0.00023572037510656438, "loss": 0.8683, "step": 1240 }, { "epoch": 1.6, "grad_norm": 2.2532451152801514, "learning_rate": 0.00023358908780903668, "loss": 0.9423, "step": 1250 }, { "epoch": 1.6, "eval_accuracy": 0.6496, "eval_f1_macro": 0.5831534303823815, "eval_f1_micro": 0.6496, "eval_loss": 0.9966222643852234, "eval_runtime": 4.6211, "eval_samples_per_second": 1081.985, "eval_steps_per_second": 33.974, "step": 1250 }, { "epoch": 1.61, "grad_norm": 2.022592306137085, "learning_rate": 0.00023145780051150893, "loss": 0.8889, "step": 1260 }, { "epoch": 1.62, "grad_norm": 3.371534585952759, "learning_rate": 0.00022932651321398123, "loss": 0.999, "step": 1270 }, { "epoch": 1.64, "grad_norm": 2.3057377338409424, "learning_rate": 0.00022719522591645354, "loss": 0.9226, "step": 1280 }, { "epoch": 1.65, "grad_norm": 2.5777747631073, "learning_rate": 0.0002250639386189258, "loss": 0.8813, "step": 1290 }, { "epoch": 1.66, "grad_norm": 2.6509506702423096, "learning_rate": 0.00022293265132139811, "loss": 0.9248, "step": 1300 }, { "epoch": 1.66, "eval_accuracy": 0.6558, "eval_f1_macro": 0.5856739858943232, "eval_f1_micro": 0.6558, "eval_loss": 0.9953044652938843, "eval_runtime": 4.6144, "eval_samples_per_second": 1083.57, "eval_steps_per_second": 34.024, "step": 1300 }, { "epoch": 1.68, "grad_norm": 2.321070671081543, "learning_rate": 0.00022080136402387042, "loss": 0.9733, "step": 1310 }, { "epoch": 1.69, "grad_norm": 3.0813984870910645, "learning_rate": 0.0002186700767263427, "loss": 0.9548, "step": 1320 }, { "epoch": 1.7, "grad_norm": 2.4469900131225586, "learning_rate": 0.000216538789428815, "loss": 0.9282, "step": 1330 }, { "epoch": 1.71, "grad_norm": 2.527189016342163, "learning_rate": 0.0002144075021312873, "loss": 0.972, "step": 1340 }, { "epoch": 1.73, "grad_norm": 2.339242935180664, "learning_rate": 0.00021227621483375957, "loss": 1.008, "step": 1350 }, { "epoch": 1.73, "eval_accuracy": 0.6588, "eval_f1_macro": 0.5809236785268852, "eval_f1_micro": 0.6588, "eval_loss": 0.9940443634986877, "eval_runtime": 4.6228, "eval_samples_per_second": 1081.603, "eval_steps_per_second": 33.962, "step": 1350 }, { "epoch": 1.74, "grad_norm": 2.1901655197143555, "learning_rate": 0.00021014492753623187, "loss": 0.9911, "step": 1360 }, { "epoch": 1.75, "grad_norm": 1.4824706315994263, "learning_rate": 0.00020801364023870418, "loss": 0.9618, "step": 1370 }, { "epoch": 1.76, "grad_norm": 1.87557852268219, "learning_rate": 0.00020588235294117645, "loss": 0.9933, "step": 1380 }, { "epoch": 1.78, "grad_norm": 2.4000425338745117, "learning_rate": 0.00020375106564364876, "loss": 0.9995, "step": 1390 }, { "epoch": 1.79, "grad_norm": 2.9583659172058105, "learning_rate": 0.00020161977834612106, "loss": 0.9098, "step": 1400 }, { "epoch": 1.79, "eval_accuracy": 0.657, "eval_f1_macro": 0.5821992906460801, "eval_f1_micro": 0.657, "eval_loss": 0.9832942485809326, "eval_runtime": 4.6161, "eval_samples_per_second": 1083.16, "eval_steps_per_second": 34.011, "step": 1400 }, { "epoch": 1.8, "grad_norm": 2.1622211933135986, "learning_rate": 0.00019948849104859333, "loss": 1.0026, "step": 1410 }, { "epoch": 1.82, "grad_norm": 2.2868545055389404, "learning_rate": 0.00019735720375106564, "loss": 0.9563, "step": 1420 }, { "epoch": 1.83, "grad_norm": 1.8215919733047485, "learning_rate": 0.00019522591645353794, "loss": 0.8711, "step": 1430 }, { "epoch": 1.84, "grad_norm": 2.21563458442688, "learning_rate": 0.0001930946291560102, "loss": 0.8976, "step": 1440 }, { "epoch": 1.85, "grad_norm": 2.023301839828491, "learning_rate": 0.00019096334185848252, "loss": 0.8679, "step": 1450 }, { "epoch": 1.85, "eval_accuracy": 0.6644, "eval_f1_macro": 0.589941204224004, "eval_f1_micro": 0.6644, "eval_loss": 0.9842157959938049, "eval_runtime": 4.6173, "eval_samples_per_second": 1082.894, "eval_steps_per_second": 34.003, "step": 1450 }, { "epoch": 1.87, "grad_norm": 3.6565377712249756, "learning_rate": 0.00018883205456095482, "loss": 0.9224, "step": 1460 }, { "epoch": 1.88, "grad_norm": 2.6100504398345947, "learning_rate": 0.0001867007672634271, "loss": 0.9561, "step": 1470 }, { "epoch": 1.89, "grad_norm": 2.7163302898406982, "learning_rate": 0.0001845694799658994, "loss": 0.9514, "step": 1480 }, { "epoch": 1.91, "grad_norm": 2.600562572479248, "learning_rate": 0.0001824381926683717, "loss": 0.9922, "step": 1490 }, { "epoch": 1.92, "grad_norm": 2.355883836746216, "learning_rate": 0.000180306905370844, "loss": 1.1342, "step": 1500 }, { "epoch": 1.92, "eval_accuracy": 0.6526, "eval_f1_macro": 0.5761908672451729, "eval_f1_micro": 0.6526, "eval_loss": 0.9932846426963806, "eval_runtime": 4.6151, "eval_samples_per_second": 1083.409, "eval_steps_per_second": 34.019, "step": 1500 }, { "epoch": 1.93, "grad_norm": 2.483553409576416, "learning_rate": 0.00017817561807331628, "loss": 0.8869, "step": 1510 }, { "epoch": 1.94, "grad_norm": 2.1051151752471924, "learning_rate": 0.00017604433077578858, "loss": 0.8881, "step": 1520 }, { "epoch": 1.96, "grad_norm": 2.524996757507324, "learning_rate": 0.00017391304347826088, "loss": 0.9509, "step": 1530 }, { "epoch": 1.97, "grad_norm": 2.025562286376953, "learning_rate": 0.00017178175618073316, "loss": 0.9632, "step": 1540 }, { "epoch": 1.98, "grad_norm": 2.144996404647827, "learning_rate": 0.00016965046888320546, "loss": 0.9157, "step": 1550 }, { "epoch": 1.98, "eval_accuracy": 0.6626, "eval_f1_macro": 0.5923777456303617, "eval_f1_micro": 0.6626, "eval_loss": 0.986947774887085, "eval_runtime": 4.6155, "eval_samples_per_second": 1083.31, "eval_steps_per_second": 34.016, "step": 1550 }, { "epoch": 1.99, "grad_norm": 2.602703809738159, "learning_rate": 0.00016751918158567776, "loss": 0.9573, "step": 1560 }, { "epoch": 2.01, "grad_norm": 2.3346447944641113, "learning_rate": 0.00016538789428815004, "loss": 0.781, "step": 1570 }, { "epoch": 2.02, "grad_norm": 1.8888440132141113, "learning_rate": 0.00016325660699062234, "loss": 0.9018, "step": 1580 }, { "epoch": 2.03, "grad_norm": 2.854541063308716, "learning_rate": 0.00016112531969309464, "loss": 0.833, "step": 1590 }, { "epoch": 2.05, "grad_norm": 2.8383681774139404, "learning_rate": 0.00015899403239556692, "loss": 0.8084, "step": 1600 }, { "epoch": 2.05, "eval_accuracy": 0.6654, "eval_f1_macro": 0.5892559746460942, "eval_f1_micro": 0.6654, "eval_loss": 0.9908738136291504, "eval_runtime": 4.6175, "eval_samples_per_second": 1082.838, "eval_steps_per_second": 34.001, "step": 1600 }, { "epoch": 2.06, "grad_norm": 2.7356762886047363, "learning_rate": 0.00015686274509803922, "loss": 0.8014, "step": 1610 }, { "epoch": 2.07, "grad_norm": 1.9319524765014648, "learning_rate": 0.00015473145780051152, "loss": 0.9392, "step": 1620 }, { "epoch": 2.08, "grad_norm": 2.2316906452178955, "learning_rate": 0.0001526001705029838, "loss": 0.8349, "step": 1630 }, { "epoch": 2.1, "grad_norm": 2.030834674835205, "learning_rate": 0.0001504688832054561, "loss": 0.804, "step": 1640 }, { "epoch": 2.11, "grad_norm": 2.651904821395874, "learning_rate": 0.0001483375959079284, "loss": 0.7373, "step": 1650 }, { "epoch": 2.11, "eval_accuracy": 0.6622, "eval_f1_macro": 0.5964545001055274, "eval_f1_micro": 0.6622, "eval_loss": 0.9894343614578247, "eval_runtime": 4.6227, "eval_samples_per_second": 1081.625, "eval_steps_per_second": 33.963, "step": 1650 }, { "epoch": 2.12, "grad_norm": 2.3340699672698975, "learning_rate": 0.00014620630861040068, "loss": 0.79, "step": 1660 }, { "epoch": 2.14, "grad_norm": 1.6943384408950806, "learning_rate": 0.00014407502131287298, "loss": 0.8374, "step": 1670 }, { "epoch": 2.15, "grad_norm": 2.2380564212799072, "learning_rate": 0.00014194373401534528, "loss": 0.8359, "step": 1680 }, { "epoch": 2.16, "grad_norm": 2.5444910526275635, "learning_rate": 0.00013981244671781756, "loss": 0.73, "step": 1690 }, { "epoch": 2.17, "grad_norm": 2.747690200805664, "learning_rate": 0.00013768115942028986, "loss": 0.9081, "step": 1700 }, { "epoch": 2.17, "eval_accuracy": 0.6614, "eval_f1_macro": 0.5879935284170169, "eval_f1_micro": 0.6614, "eval_loss": 0.999671459197998, "eval_runtime": 4.6154, "eval_samples_per_second": 1083.329, "eval_steps_per_second": 34.017, "step": 1700 }, { "epoch": 2.19, "grad_norm": 2.335947275161743, "learning_rate": 0.00013554987212276216, "loss": 0.8419, "step": 1710 }, { "epoch": 2.2, "grad_norm": 2.619969129562378, "learning_rate": 0.00013341858482523444, "loss": 0.8277, "step": 1720 }, { "epoch": 2.21, "grad_norm": 2.147700071334839, "learning_rate": 0.00013128729752770674, "loss": 0.7419, "step": 1730 }, { "epoch": 2.23, "grad_norm": 3.1116886138916016, "learning_rate": 0.00012915601023017904, "loss": 0.8837, "step": 1740 }, { "epoch": 2.24, "grad_norm": 2.3517539501190186, "learning_rate": 0.00012702472293265132, "loss": 0.8064, "step": 1750 }, { "epoch": 2.24, "eval_accuracy": 0.659, "eval_f1_macro": 0.5918523615308761, "eval_f1_micro": 0.659, "eval_loss": 0.9998078346252441, "eval_runtime": 4.6138, "eval_samples_per_second": 1083.711, "eval_steps_per_second": 34.029, "step": 1750 }, { "epoch": 2.25, "grad_norm": 2.4132673740386963, "learning_rate": 0.00012489343563512362, "loss": 0.7836, "step": 1760 }, { "epoch": 2.26, "grad_norm": 2.801809787750244, "learning_rate": 0.00012276214833759592, "loss": 0.9516, "step": 1770 }, { "epoch": 2.28, "grad_norm": 2.2415406703948975, "learning_rate": 0.00012063086104006821, "loss": 0.7053, "step": 1780 }, { "epoch": 2.29, "grad_norm": 2.9133055210113525, "learning_rate": 0.0001184995737425405, "loss": 0.8583, "step": 1790 }, { "epoch": 2.3, "grad_norm": 2.091689109802246, "learning_rate": 0.00011636828644501279, "loss": 0.8519, "step": 1800 }, { "epoch": 2.3, "eval_accuracy": 0.6584, "eval_f1_macro": 0.5880330853167925, "eval_f1_micro": 0.6584, "eval_loss": 1.0030735731124878, "eval_runtime": 4.6181, "eval_samples_per_second": 1082.688, "eval_steps_per_second": 33.996, "step": 1800 }, { "epoch": 2.31, "grad_norm": 2.7555630207061768, "learning_rate": 0.0001142369991474851, "loss": 0.8504, "step": 1810 }, { "epoch": 2.33, "grad_norm": 2.736027717590332, "learning_rate": 0.00011210571184995738, "loss": 0.7777, "step": 1820 }, { "epoch": 2.34, "grad_norm": 2.5624210834503174, "learning_rate": 0.00010997442455242967, "loss": 0.8027, "step": 1830 }, { "epoch": 2.35, "grad_norm": 3.113119602203369, "learning_rate": 0.00010784313725490197, "loss": 0.7215, "step": 1840 }, { "epoch": 2.37, "grad_norm": 2.8675148487091064, "learning_rate": 0.00010571184995737426, "loss": 0.8711, "step": 1850 }, { "epoch": 2.37, "eval_accuracy": 0.6666, "eval_f1_macro": 0.59808390789386, "eval_f1_micro": 0.6666, "eval_loss": 0.9974775314331055, "eval_runtime": 4.6223, "eval_samples_per_second": 1081.724, "eval_steps_per_second": 33.966, "step": 1850 }, { "epoch": 2.38, "grad_norm": 3.3633134365081787, "learning_rate": 0.00010358056265984655, "loss": 0.872, "step": 1860 }, { "epoch": 2.39, "grad_norm": 2.622450828552246, "learning_rate": 0.00010144927536231885, "loss": 0.8464, "step": 1870 }, { "epoch": 2.4, "grad_norm": 2.2080349922180176, "learning_rate": 9.931798806479114e-05, "loss": 0.7183, "step": 1880 }, { "epoch": 2.42, "grad_norm": 2.735830068588257, "learning_rate": 9.718670076726342e-05, "loss": 0.7964, "step": 1890 }, { "epoch": 2.43, "grad_norm": 2.720543384552002, "learning_rate": 9.505541346973572e-05, "loss": 0.7617, "step": 1900 }, { "epoch": 2.43, "eval_accuracy": 0.6584, "eval_f1_macro": 0.5848770420558893, "eval_f1_micro": 0.6584, "eval_loss": 1.014381766319275, "eval_runtime": 4.6129, "eval_samples_per_second": 1083.915, "eval_steps_per_second": 34.035, "step": 1900 }, { "epoch": 2.44, "grad_norm": 2.777467966079712, "learning_rate": 9.292412617220801e-05, "loss": 0.7938, "step": 1910 }, { "epoch": 2.46, "grad_norm": 2.8683433532714844, "learning_rate": 9.07928388746803e-05, "loss": 0.7492, "step": 1920 }, { "epoch": 2.47, "grad_norm": 2.9691097736358643, "learning_rate": 8.86615515771526e-05, "loss": 0.8453, "step": 1930 }, { "epoch": 2.48, "grad_norm": 3.0169878005981445, "learning_rate": 8.653026427962489e-05, "loss": 0.7834, "step": 1940 }, { "epoch": 2.49, "grad_norm": 2.8592581748962402, "learning_rate": 8.439897698209718e-05, "loss": 0.717, "step": 1950 }, { "epoch": 2.49, "eval_accuracy": 0.6622, "eval_f1_macro": 0.5902745243208923, "eval_f1_micro": 0.6622, "eval_loss": 1.010201334953308, "eval_runtime": 4.606, "eval_samples_per_second": 1085.541, "eval_steps_per_second": 34.086, "step": 1950 }, { "epoch": 2.51, "grad_norm": 2.6895742416381836, "learning_rate": 8.226768968456948e-05, "loss": 0.7258, "step": 1960 }, { "epoch": 2.52, "grad_norm": 2.743197441101074, "learning_rate": 8.013640238704177e-05, "loss": 0.8436, "step": 1970 }, { "epoch": 2.53, "grad_norm": 3.289165735244751, "learning_rate": 7.800511508951406e-05, "loss": 0.8923, "step": 1980 }, { "epoch": 2.54, "grad_norm": 2.217371940612793, "learning_rate": 7.587382779198636e-05, "loss": 0.7365, "step": 1990 }, { "epoch": 2.56, "grad_norm": 3.4667258262634277, "learning_rate": 7.374254049445865e-05, "loss": 0.857, "step": 2000 }, { "epoch": 2.56, "eval_accuracy": 0.6622, "eval_f1_macro": 0.5923250147204171, "eval_f1_micro": 0.6622, "eval_loss": 1.0059385299682617, "eval_runtime": 4.5385, "eval_samples_per_second": 1101.676, "eval_steps_per_second": 34.593, "step": 2000 }, { "epoch": 2.57, "grad_norm": 2.3217034339904785, "learning_rate": 7.161125319693094e-05, "loss": 0.8445, "step": 2010 }, { "epoch": 2.58, "grad_norm": 2.4780850410461426, "learning_rate": 6.947996589940324e-05, "loss": 0.7369, "step": 2020 }, { "epoch": 2.6, "grad_norm": 2.254239559173584, "learning_rate": 6.734867860187553e-05, "loss": 0.7862, "step": 2030 }, { "epoch": 2.61, "grad_norm": 1.7889450788497925, "learning_rate": 6.521739130434782e-05, "loss": 0.7451, "step": 2040 }, { "epoch": 2.62, "grad_norm": 3.1137242317199707, "learning_rate": 6.308610400682012e-05, "loss": 0.8623, "step": 2050 }, { "epoch": 2.62, "eval_accuracy": 0.664, "eval_f1_macro": 0.5971048543279723, "eval_f1_micro": 0.664, "eval_loss": 1.0025016069412231, "eval_runtime": 4.6042, "eval_samples_per_second": 1085.954, "eval_steps_per_second": 34.099, "step": 2050 }, { "epoch": 2.63, "grad_norm": 2.7454991340637207, "learning_rate": 6.095481670929241e-05, "loss": 0.8, "step": 2060 }, { "epoch": 2.65, "grad_norm": 2.9460010528564453, "learning_rate": 5.882352941176471e-05, "loss": 0.7284, "step": 2070 }, { "epoch": 2.66, "grad_norm": 3.019240379333496, "learning_rate": 5.6692242114237e-05, "loss": 0.7608, "step": 2080 }, { "epoch": 2.67, "grad_norm": 2.111807346343994, "learning_rate": 5.456095481670929e-05, "loss": 0.829, "step": 2090 }, { "epoch": 2.69, "grad_norm": 3.3570430278778076, "learning_rate": 5.242966751918159e-05, "loss": 0.782, "step": 2100 }, { "epoch": 2.69, "eval_accuracy": 0.6644, "eval_f1_macro": 0.5985316673414206, "eval_f1_micro": 0.6644, "eval_loss": 1.0012755393981934, "eval_runtime": 4.5602, "eval_samples_per_second": 1096.451, "eval_steps_per_second": 34.429, "step": 2100 }, { "epoch": 2.7, "grad_norm": 2.958252191543579, "learning_rate": 5.0298380221653884e-05, "loss": 0.7804, "step": 2110 }, { "epoch": 2.71, "grad_norm": 2.9975624084472656, "learning_rate": 4.816709292412617e-05, "loss": 0.7807, "step": 2120 }, { "epoch": 2.72, "grad_norm": 2.5771234035491943, "learning_rate": 4.603580562659847e-05, "loss": 0.7609, "step": 2130 }, { "epoch": 2.74, "grad_norm": 2.9194953441619873, "learning_rate": 4.3904518329070764e-05, "loss": 0.7783, "step": 2140 }, { "epoch": 2.75, "grad_norm": 3.0004827976226807, "learning_rate": 4.177323103154305e-05, "loss": 0.8018, "step": 2150 }, { "epoch": 2.75, "eval_accuracy": 0.6652, "eval_f1_macro": 0.5984558375770885, "eval_f1_micro": 0.6652, "eval_loss": 1.0043922662734985, "eval_runtime": 4.5535, "eval_samples_per_second": 1098.053, "eval_steps_per_second": 34.479, "step": 2150 }, { "epoch": 2.76, "grad_norm": 1.9689658880233765, "learning_rate": 3.964194373401535e-05, "loss": 0.8252, "step": 2160 }, { "epoch": 2.77, "grad_norm": 2.3247179985046387, "learning_rate": 3.7510656436487644e-05, "loss": 0.8301, "step": 2170 }, { "epoch": 2.79, "grad_norm": 3.333657741546631, "learning_rate": 3.537936913895993e-05, "loss": 0.7161, "step": 2180 }, { "epoch": 2.8, "grad_norm": 3.242521047592163, "learning_rate": 3.324808184143223e-05, "loss": 0.796, "step": 2190 }, { "epoch": 2.81, "grad_norm": 2.6882381439208984, "learning_rate": 3.111679454390452e-05, "loss": 0.7901, "step": 2200 }, { "epoch": 2.81, "eval_accuracy": 0.6678, "eval_f1_macro": 0.6030435999289231, "eval_f1_micro": 0.6678, "eval_loss": 0.9987305402755737, "eval_runtime": 4.6038, "eval_samples_per_second": 1086.055, "eval_steps_per_second": 34.102, "step": 2200 }, { "epoch": 2.83, "grad_norm": 2.786713123321533, "learning_rate": 2.8985507246376814e-05, "loss": 0.7368, "step": 2210 }, { "epoch": 2.84, "grad_norm": 2.7163522243499756, "learning_rate": 2.6854219948849106e-05, "loss": 0.7976, "step": 2220 }, { "epoch": 2.85, "grad_norm": 2.269670248031616, "learning_rate": 2.4722932651321398e-05, "loss": 0.7612, "step": 2230 }, { "epoch": 2.86, "grad_norm": 2.7400472164154053, "learning_rate": 2.2591645353793694e-05, "loss": 0.8234, "step": 2240 }, { "epoch": 2.88, "grad_norm": 4.061866760253906, "learning_rate": 2.0460358056265986e-05, "loss": 0.8835, "step": 2250 }, { "epoch": 2.88, "eval_accuracy": 0.6644, "eval_f1_macro": 0.5985551818802296, "eval_f1_micro": 0.6644, "eval_loss": 1.0015017986297607, "eval_runtime": 4.6024, "eval_samples_per_second": 1086.396, "eval_steps_per_second": 34.113, "step": 2250 }, { "epoch": 2.89, "grad_norm": 3.1923465728759766, "learning_rate": 1.8329070758738275e-05, "loss": 0.8002, "step": 2260 }, { "epoch": 2.9, "grad_norm": 2.8874006271362305, "learning_rate": 1.619778346121057e-05, "loss": 0.9052, "step": 2270 }, { "epoch": 2.92, "grad_norm": 2.9610722064971924, "learning_rate": 1.4066496163682863e-05, "loss": 0.7826, "step": 2280 }, { "epoch": 2.93, "grad_norm": 2.6525473594665527, "learning_rate": 1.1935208866155157e-05, "loss": 0.8008, "step": 2290 }, { "epoch": 2.94, "grad_norm": 2.8157360553741455, "learning_rate": 9.803921568627451e-06, "loss": 0.8679, "step": 2300 }, { "epoch": 2.94, "eval_accuracy": 0.6636, "eval_f1_macro": 0.5960754763007041, "eval_f1_micro": 0.6636, "eval_loss": 0.9994349479675293, "eval_runtime": 4.5976, "eval_samples_per_second": 1087.518, "eval_steps_per_second": 34.148, "step": 2300 }, { "epoch": 2.95, "grad_norm": 2.7772059440612793, "learning_rate": 7.672634271099745e-06, "loss": 0.7572, "step": 2310 }, { "epoch": 2.97, "grad_norm": 2.3767988681793213, "learning_rate": 5.541346973572038e-06, "loss": 0.7519, "step": 2320 }, { "epoch": 2.98, "grad_norm": 3.354130506515503, "learning_rate": 3.4100596760443308e-06, "loss": 0.798, "step": 2330 }, { "epoch": 2.99, "grad_norm": 2.4790210723876953, "learning_rate": 1.2787723785166241e-06, "loss": 0.732, "step": 2340 }, { "epoch": 3.0, "step": 2346, "total_flos": 2555504607363072.0, "train_loss": 0.9882089091688776, "train_runtime": 435.725, "train_samples_per_second": 172.127, "train_steps_per_second": 5.384 } ], "logging_steps": 10, "max_steps": 2346, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "total_flos": 2555504607363072.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }