{ "best_metric": 0.18961240351200104, "best_model_checkpoint": "mobilevitv2-1.0-imagenet1k-256-finetuned_v2024-7-25-frost/checkpoint-1000", "epoch": 30.0, "eval_steps": 100, "global_step": 1920, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15625, "grad_norm": 0.2519198954105377, "learning_rate": 1.0416666666666668e-05, "loss": 0.6958, "step": 10 }, { "epoch": 0.3125, "grad_norm": 0.21561957895755768, "learning_rate": 2.0833333333333336e-05, "loss": 0.6958, "step": 20 }, { "epoch": 0.46875, "grad_norm": 0.2186066061258316, "learning_rate": 3.125e-05, "loss": 0.6948, "step": 30 }, { "epoch": 0.625, "grad_norm": 0.22916783392429352, "learning_rate": 4.166666666666667e-05, "loss": 0.6934, "step": 40 }, { "epoch": 0.78125, "grad_norm": 0.23298271000385284, "learning_rate": 5.208333333333334e-05, "loss": 0.691, "step": 50 }, { "epoch": 0.9375, "grad_norm": 0.24950967729091644, "learning_rate": 6.25e-05, "loss": 0.6895, "step": 60 }, { "epoch": 1.09375, "grad_norm": 0.23937764763832092, "learning_rate": 7.291666666666667e-05, "loss": 0.684, "step": 70 }, { "epoch": 1.25, "grad_norm": 0.23057785630226135, "learning_rate": 8.333333333333334e-05, "loss": 0.6794, "step": 80 }, { "epoch": 1.40625, "grad_norm": 0.2983661890029907, "learning_rate": 9.375e-05, "loss": 0.6751, "step": 90 }, { "epoch": 1.5625, "grad_norm": 0.2652019262313843, "learning_rate": 0.00010416666666666667, "loss": 0.6687, "step": 100 }, { "epoch": 1.5625, "eval_accuracy": 0.7230088495575221, "eval_f1": 0.5335320417287631, "eval_loss": 0.6623277068138123, "eval_precision": 0.40224719101123596, "eval_recall": 0.7920353982300885, "eval_runtime": 1.2295, "eval_samples_per_second": 91.909, "eval_steps_per_second": 12.2, "step": 100 }, { "epoch": 1.71875, "grad_norm": 0.34594935178756714, "learning_rate": 0.00011458333333333333, "loss": 0.6586, "step": 110 }, { "epoch": 1.875, "grad_norm": 0.26900964975357056, "learning_rate": 0.000125, "loss": 0.6476, "step": 120 }, { "epoch": 2.03125, "grad_norm": 0.30181896686553955, "learning_rate": 0.0001354166666666667, "loss": 0.6336, "step": 130 }, { "epoch": 2.1875, "grad_norm": 0.33757150173187256, "learning_rate": 0.00014583333333333335, "loss": 0.6079, "step": 140 }, { "epoch": 2.34375, "grad_norm": 0.54989093542099, "learning_rate": 0.00015625, "loss": 0.5823, "step": 150 }, { "epoch": 2.5, "grad_norm": 0.6273936629295349, "learning_rate": 0.0001666666666666667, "loss": 0.5598, "step": 160 }, { "epoch": 2.65625, "grad_norm": 0.46115073561668396, "learning_rate": 0.00017708333333333335, "loss": 0.5239, "step": 170 }, { "epoch": 2.8125, "grad_norm": 0.47255411744117737, "learning_rate": 0.0001875, "loss": 0.4972, "step": 180 }, { "epoch": 2.96875, "grad_norm": 0.522071361541748, "learning_rate": 0.0001979166666666667, "loss": 0.4617, "step": 190 }, { "epoch": 3.125, "grad_norm": 1.0379081964492798, "learning_rate": 0.0001990740740740741, "loss": 0.4454, "step": 200 }, { "epoch": 3.125, "eval_accuracy": 0.8831858407079646, "eval_f1": 0.7490494296577946, "eval_loss": 0.41519004106521606, "eval_precision": 0.6566666666666666, "eval_recall": 0.8716814159292036, "eval_runtime": 0.9401, "eval_samples_per_second": 120.203, "eval_steps_per_second": 15.956, "step": 200 }, { "epoch": 3.28125, "grad_norm": 0.545850932598114, "learning_rate": 0.0001979166666666667, "loss": 0.4247, "step": 210 }, { "epoch": 3.4375, "grad_norm": 0.7891977429389954, "learning_rate": 0.00019675925925925926, "loss": 0.4041, "step": 220 }, { "epoch": 3.59375, "grad_norm": 1.542927861213684, "learning_rate": 0.00019560185185185186, "loss": 0.3845, "step": 230 }, { "epoch": 3.75, "grad_norm": 0.44223108887672424, "learning_rate": 0.00019444444444444446, "loss": 0.3278, "step": 240 }, { "epoch": 3.90625, "grad_norm": 0.8207859396934509, "learning_rate": 0.00019328703703703706, "loss": 0.3507, "step": 250 }, { "epoch": 4.0625, "grad_norm": 0.42037203907966614, "learning_rate": 0.00019212962962962963, "loss": 0.3481, "step": 260 }, { "epoch": 4.21875, "grad_norm": 1.829210877418518, "learning_rate": 0.00019097222222222223, "loss": 0.3133, "step": 270 }, { "epoch": 4.375, "grad_norm": 1.213773250579834, "learning_rate": 0.00018981481481481483, "loss": 0.2934, "step": 280 }, { "epoch": 4.53125, "grad_norm": 0.6154431104660034, "learning_rate": 0.00018865740740740743, "loss": 0.2923, "step": 290 }, { "epoch": 4.6875, "grad_norm": 0.3814193606376648, "learning_rate": 0.0001875, "loss": 0.2835, "step": 300 }, { "epoch": 4.6875, "eval_accuracy": 0.9097345132743363, "eval_f1": 0.7660550458715596, "eval_loss": 0.26609960198402405, "eval_precision": 0.7952380952380952, "eval_recall": 0.7389380530973452, "eval_runtime": 1.4414, "eval_samples_per_second": 78.394, "eval_steps_per_second": 10.406, "step": 300 }, { "epoch": 4.84375, "grad_norm": 0.4515719711780548, "learning_rate": 0.0001863425925925926, "loss": 0.2672, "step": 310 }, { "epoch": 5.0, "grad_norm": 2.077721357345581, "learning_rate": 0.0001851851851851852, "loss": 0.2914, "step": 320 }, { "epoch": 5.15625, "grad_norm": 0.9644371867179871, "learning_rate": 0.00018402777777777778, "loss": 0.2527, "step": 330 }, { "epoch": 5.3125, "grad_norm": 0.8245725631713867, "learning_rate": 0.00018287037037037038, "loss": 0.2477, "step": 340 }, { "epoch": 5.46875, "grad_norm": 0.5262947082519531, "learning_rate": 0.00018171296296296297, "loss": 0.2452, "step": 350 }, { "epoch": 5.625, "grad_norm": 0.6464282870292664, "learning_rate": 0.00018055555555555557, "loss": 0.2308, "step": 360 }, { "epoch": 5.78125, "grad_norm": 0.6029626131057739, "learning_rate": 0.00017939814814814815, "loss": 0.233, "step": 370 }, { "epoch": 5.9375, "grad_norm": 0.683201789855957, "learning_rate": 0.00017824074074074075, "loss": 0.2258, "step": 380 }, { "epoch": 6.09375, "grad_norm": 0.5622811317443848, "learning_rate": 0.00017708333333333335, "loss": 0.2342, "step": 390 }, { "epoch": 6.25, "grad_norm": 0.5229126214981079, "learning_rate": 0.00017592592592592595, "loss": 0.2197, "step": 400 }, { "epoch": 6.25, "eval_accuracy": 0.9194690265486726, "eval_f1": 0.7868852459016394, "eval_loss": 0.21510393917560577, "eval_precision": 0.835820895522388, "eval_recall": 0.7433628318584071, "eval_runtime": 0.9737, "eval_samples_per_second": 116.048, "eval_steps_per_second": 15.405, "step": 400 }, { "epoch": 6.40625, "grad_norm": 0.5103662610054016, "learning_rate": 0.00017476851851851852, "loss": 0.2084, "step": 410 }, { "epoch": 6.5625, "grad_norm": 1.2655210494995117, "learning_rate": 0.00017361111111111112, "loss": 0.2005, "step": 420 }, { "epoch": 6.71875, "grad_norm": 0.5232699513435364, "learning_rate": 0.00017245370370370372, "loss": 0.2296, "step": 430 }, { "epoch": 6.875, "grad_norm": 0.8142613172531128, "learning_rate": 0.00017129629629629632, "loss": 0.187, "step": 440 }, { "epoch": 7.03125, "grad_norm": 0.9919219017028809, "learning_rate": 0.0001701388888888889, "loss": 0.2027, "step": 450 }, { "epoch": 7.1875, "grad_norm": 1.2590153217315674, "learning_rate": 0.0001689814814814815, "loss": 0.1873, "step": 460 }, { "epoch": 7.34375, "grad_norm": 0.6513100266456604, "learning_rate": 0.0001678240740740741, "loss": 0.1939, "step": 470 }, { "epoch": 7.5, "grad_norm": 1.0872722864151, "learning_rate": 0.0001666666666666667, "loss": 0.2156, "step": 480 }, { "epoch": 7.65625, "grad_norm": 0.3712750971317291, "learning_rate": 0.00016550925925925926, "loss": 0.1968, "step": 490 }, { "epoch": 7.8125, "grad_norm": 0.8672028183937073, "learning_rate": 0.00016435185185185186, "loss": 0.1613, "step": 500 }, { "epoch": 7.8125, "eval_accuracy": 0.9292035398230089, "eval_f1": 0.813953488372093, "eval_loss": 0.20068036019802094, "eval_precision": 0.8578431372549019, "eval_recall": 0.7743362831858407, "eval_runtime": 0.9681, "eval_samples_per_second": 116.722, "eval_steps_per_second": 15.494, "step": 500 }, { "epoch": 7.96875, "grad_norm": 1.2281358242034912, "learning_rate": 0.00016319444444444446, "loss": 0.1864, "step": 510 }, { "epoch": 8.125, "grad_norm": 1.1462125778198242, "learning_rate": 0.00016203703703703706, "loss": 0.1743, "step": 520 }, { "epoch": 8.28125, "grad_norm": 0.5552182197570801, "learning_rate": 0.00016087962962962963, "loss": 0.1963, "step": 530 }, { "epoch": 8.4375, "grad_norm": 0.8015744686126709, "learning_rate": 0.00015972222222222223, "loss": 0.1987, "step": 540 }, { "epoch": 8.59375, "grad_norm": 0.8516111969947815, "learning_rate": 0.00015856481481481483, "loss": 0.1753, "step": 550 }, { "epoch": 8.75, "grad_norm": 1.3942711353302002, "learning_rate": 0.00015740740740740743, "loss": 0.1577, "step": 560 }, { "epoch": 8.90625, "grad_norm": 0.812676191329956, "learning_rate": 0.00015625, "loss": 0.1726, "step": 570 }, { "epoch": 9.0625, "grad_norm": 0.567040205001831, "learning_rate": 0.0001550925925925926, "loss": 0.1665, "step": 580 }, { "epoch": 9.21875, "grad_norm": 0.7389497756958008, "learning_rate": 0.0001539351851851852, "loss": 0.1445, "step": 590 }, { "epoch": 9.375, "grad_norm": 0.6939622163772583, "learning_rate": 0.00015277777777777777, "loss": 0.1655, "step": 600 }, { "epoch": 9.375, "eval_accuracy": 0.9309734513274336, "eval_f1": 0.8227272727272726, "eval_loss": 0.1935483068227768, "eval_precision": 0.8457943925233645, "eval_recall": 0.8008849557522124, "eval_runtime": 1.422, "eval_samples_per_second": 79.467, "eval_steps_per_second": 10.549, "step": 600 }, { "epoch": 9.53125, "grad_norm": 0.6073923110961914, "learning_rate": 0.00015162037037037037, "loss": 0.159, "step": 610 }, { "epoch": 9.6875, "grad_norm": 0.8762220740318298, "learning_rate": 0.00015046296296296297, "loss": 0.1959, "step": 620 }, { "epoch": 9.84375, "grad_norm": 0.7490831017494202, "learning_rate": 0.00014930555555555557, "loss": 0.1465, "step": 630 }, { "epoch": 10.0, "grad_norm": 1.0123506784439087, "learning_rate": 0.00014814814814814815, "loss": 0.1683, "step": 640 }, { "epoch": 10.15625, "grad_norm": 0.5325204133987427, "learning_rate": 0.00014699074074074075, "loss": 0.1636, "step": 650 }, { "epoch": 10.3125, "grad_norm": 0.5814504623413086, "learning_rate": 0.00014583333333333335, "loss": 0.1729, "step": 660 }, { "epoch": 10.46875, "grad_norm": 1.0156935453414917, "learning_rate": 0.00014467592592592594, "loss": 0.1569, "step": 670 }, { "epoch": 10.625, "grad_norm": 1.257921576499939, "learning_rate": 0.00014351851851851852, "loss": 0.1429, "step": 680 }, { "epoch": 10.78125, "grad_norm": 0.929108202457428, "learning_rate": 0.00014236111111111112, "loss": 0.1554, "step": 690 }, { "epoch": 10.9375, "grad_norm": 0.5256894826889038, "learning_rate": 0.00014120370370370372, "loss": 0.1815, "step": 700 }, { "epoch": 10.9375, "eval_accuracy": 0.9265486725663716, "eval_f1": 0.8074245939675174, "eval_loss": 0.18833249807357788, "eval_precision": 0.848780487804878, "eval_recall": 0.7699115044247787, "eval_runtime": 0.9484, "eval_samples_per_second": 119.151, "eval_steps_per_second": 15.817, "step": 700 }, { "epoch": 11.09375, "grad_norm": 1.1333953142166138, "learning_rate": 0.00014004629629629632, "loss": 0.1703, "step": 710 }, { "epoch": 11.25, "grad_norm": 0.6658828854560852, "learning_rate": 0.0001388888888888889, "loss": 0.1475, "step": 720 }, { "epoch": 11.40625, "grad_norm": 1.04364812374115, "learning_rate": 0.0001377314814814815, "loss": 0.1598, "step": 730 }, { "epoch": 11.5625, "grad_norm": 0.8811527490615845, "learning_rate": 0.0001365740740740741, "loss": 0.1678, "step": 740 }, { "epoch": 11.71875, "grad_norm": 0.8651083111763, "learning_rate": 0.0001354166666666667, "loss": 0.1681, "step": 750 }, { "epoch": 11.875, "grad_norm": 0.833223283290863, "learning_rate": 0.00013425925925925926, "loss": 0.1616, "step": 760 }, { "epoch": 12.03125, "grad_norm": 0.5667290687561035, "learning_rate": 0.00013310185185185186, "loss": 0.1185, "step": 770 }, { "epoch": 12.1875, "grad_norm": 1.3427128791809082, "learning_rate": 0.00013194444444444446, "loss": 0.1442, "step": 780 }, { "epoch": 12.34375, "grad_norm": 0.859018087387085, "learning_rate": 0.00013078703703703706, "loss": 0.1552, "step": 790 }, { "epoch": 12.5, "grad_norm": 0.6311579942703247, "learning_rate": 0.00012962962962962963, "loss": 0.1316, "step": 800 }, { "epoch": 12.5, "eval_accuracy": 0.9327433628318584, "eval_f1": 0.8272727272727272, "eval_loss": 0.18246687948703766, "eval_precision": 0.8504672897196262, "eval_recall": 0.8053097345132744, "eval_runtime": 0.9594, "eval_samples_per_second": 117.786, "eval_steps_per_second": 15.635, "step": 800 }, { "epoch": 12.65625, "grad_norm": 0.8464061617851257, "learning_rate": 0.00012847222222222223, "loss": 0.1344, "step": 810 }, { "epoch": 12.8125, "grad_norm": 0.6711329221725464, "learning_rate": 0.00012731481481481483, "loss": 0.1602, "step": 820 }, { "epoch": 12.96875, "grad_norm": 1.0340158939361572, "learning_rate": 0.00012615740740740743, "loss": 0.1483, "step": 830 }, { "epoch": 13.125, "grad_norm": 0.711726725101471, "learning_rate": 0.000125, "loss": 0.1507, "step": 840 }, { "epoch": 13.28125, "grad_norm": 0.8784794211387634, "learning_rate": 0.00012384259259259258, "loss": 0.1515, "step": 850 }, { "epoch": 13.4375, "grad_norm": 0.9908888339996338, "learning_rate": 0.0001226851851851852, "loss": 0.1583, "step": 860 }, { "epoch": 13.59375, "grad_norm": 0.5473937392234802, "learning_rate": 0.00012152777777777777, "loss": 0.1433, "step": 870 }, { "epoch": 13.75, "grad_norm": 1.6888905763626099, "learning_rate": 0.00012037037037037037, "loss": 0.1371, "step": 880 }, { "epoch": 13.90625, "grad_norm": 1.0640438795089722, "learning_rate": 0.00011921296296296296, "loss": 0.1376, "step": 890 }, { "epoch": 14.0625, "grad_norm": 1.9941257238388062, "learning_rate": 0.00011805555555555556, "loss": 0.1612, "step": 900 }, { "epoch": 14.0625, "eval_accuracy": 0.9256637168141593, "eval_f1": 0.8099547511312217, "eval_loss": 0.18371373414993286, "eval_precision": 0.8287037037037037, "eval_recall": 0.7920353982300885, "eval_runtime": 1.4255, "eval_samples_per_second": 79.27, "eval_steps_per_second": 10.523, "step": 900 }, { "epoch": 14.21875, "grad_norm": 0.650867760181427, "learning_rate": 0.00011689814814814815, "loss": 0.1468, "step": 910 }, { "epoch": 14.375, "grad_norm": 0.7459059357643127, "learning_rate": 0.00011574074074074075, "loss": 0.1468, "step": 920 }, { "epoch": 14.53125, "grad_norm": 0.7468872666358948, "learning_rate": 0.00011458333333333333, "loss": 0.1169, "step": 930 }, { "epoch": 14.6875, "grad_norm": 0.6512945890426636, "learning_rate": 0.00011342592592592593, "loss": 0.1373, "step": 940 }, { "epoch": 14.84375, "grad_norm": 0.710382878780365, "learning_rate": 0.00011226851851851852, "loss": 0.1223, "step": 950 }, { "epoch": 15.0, "grad_norm": 1.2112369537353516, "learning_rate": 0.00011111111111111112, "loss": 0.1257, "step": 960 }, { "epoch": 15.15625, "grad_norm": 4.069777965545654, "learning_rate": 0.0001099537037037037, "loss": 0.1369, "step": 970 }, { "epoch": 15.3125, "grad_norm": 0.9751072525978088, "learning_rate": 0.0001087962962962963, "loss": 0.1316, "step": 980 }, { "epoch": 15.46875, "grad_norm": 0.49943211674690247, "learning_rate": 0.00010763888888888889, "loss": 0.1088, "step": 990 }, { "epoch": 15.625, "grad_norm": 0.7845533490180969, "learning_rate": 0.00010648148148148149, "loss": 0.118, "step": 1000 }, { "epoch": 15.625, "eval_accuracy": 0.9309734513274336, "eval_f1": 0.8227272727272726, "eval_loss": 0.18961240351200104, "eval_precision": 0.8457943925233645, "eval_recall": 0.8008849557522124, "eval_runtime": 0.9541, "eval_samples_per_second": 118.435, "eval_steps_per_second": 15.721, "step": 1000 }, { "epoch": 15.78125, "grad_norm": 0.5193383693695068, "learning_rate": 0.00010532407407407407, "loss": 0.1233, "step": 1010 }, { "epoch": 15.9375, "grad_norm": 0.5976629257202148, "learning_rate": 0.00010416666666666667, "loss": 0.1351, "step": 1020 }, { "epoch": 16.09375, "grad_norm": 1.0629384517669678, "learning_rate": 0.00010300925925925926, "loss": 0.1597, "step": 1030 }, { "epoch": 16.25, "grad_norm": 0.8576996326446533, "learning_rate": 0.00010185185185185186, "loss": 0.1268, "step": 1040 }, { "epoch": 16.40625, "grad_norm": 0.7236841917037964, "learning_rate": 0.00010069444444444445, "loss": 0.1411, "step": 1050 }, { "epoch": 16.5625, "grad_norm": 1.1142785549163818, "learning_rate": 9.953703703703704e-05, "loss": 0.1297, "step": 1060 }, { "epoch": 16.71875, "grad_norm": 0.8304411768913269, "learning_rate": 9.837962962962963e-05, "loss": 0.1231, "step": 1070 }, { "epoch": 16.875, "grad_norm": 0.8226402997970581, "learning_rate": 9.722222222222223e-05, "loss": 0.1399, "step": 1080 }, { "epoch": 17.03125, "grad_norm": 0.6692397594451904, "learning_rate": 9.606481481481482e-05, "loss": 0.1833, "step": 1090 }, { "epoch": 17.1875, "grad_norm": 0.6689762473106384, "learning_rate": 9.490740740740742e-05, "loss": 0.1178, "step": 1100 }, { "epoch": 17.1875, "eval_accuracy": 0.9238938053097345, "eval_f1": 0.8027522935779817, "eval_loss": 0.19371576607227325, "eval_precision": 0.8333333333333334, "eval_recall": 0.7743362831858407, "eval_runtime": 0.9499, "eval_samples_per_second": 118.958, "eval_steps_per_second": 15.791, "step": 1100 }, { "epoch": 17.34375, "grad_norm": 0.6079881191253662, "learning_rate": 9.375e-05, "loss": 0.1259, "step": 1110 }, { "epoch": 17.5, "grad_norm": 0.37670084834098816, "learning_rate": 9.25925925925926e-05, "loss": 0.1101, "step": 1120 }, { "epoch": 17.65625, "grad_norm": 0.7734571695327759, "learning_rate": 9.143518518518519e-05, "loss": 0.1279, "step": 1130 }, { "epoch": 17.8125, "grad_norm": 1.0208630561828613, "learning_rate": 9.027777777777779e-05, "loss": 0.1258, "step": 1140 }, { "epoch": 17.96875, "grad_norm": 0.5698951482772827, "learning_rate": 8.912037037037037e-05, "loss": 0.1111, "step": 1150 }, { "epoch": 18.125, "grad_norm": 1.77188241481781, "learning_rate": 8.796296296296297e-05, "loss": 0.1254, "step": 1160 }, { "epoch": 18.28125, "grad_norm": 0.8389852643013, "learning_rate": 8.680555555555556e-05, "loss": 0.1042, "step": 1170 }, { "epoch": 18.4375, "grad_norm": 0.6655524969100952, "learning_rate": 8.564814814814816e-05, "loss": 0.1272, "step": 1180 }, { "epoch": 18.59375, "grad_norm": 0.4668845236301422, "learning_rate": 8.449074074074074e-05, "loss": 0.1096, "step": 1190 }, { "epoch": 18.75, "grad_norm": 0.8379706740379333, "learning_rate": 8.333333333333334e-05, "loss": 0.1248, "step": 1200 }, { "epoch": 18.75, "eval_accuracy": 0.9300884955752212, "eval_f1": 0.8192219679633868, "eval_loss": 0.19132623076438904, "eval_precision": 0.8483412322274881, "eval_recall": 0.7920353982300885, "eval_runtime": 1.2549, "eval_samples_per_second": 90.047, "eval_steps_per_second": 11.953, "step": 1200 }, { "epoch": 18.90625, "grad_norm": 0.9271652698516846, "learning_rate": 8.217592592592593e-05, "loss": 0.1126, "step": 1210 }, { "epoch": 19.0625, "grad_norm": 1.1356163024902344, "learning_rate": 8.113425925925926e-05, "loss": 0.129, "step": 1220 }, { "epoch": 19.21875, "grad_norm": 0.4993898570537567, "learning_rate": 7.997685185185186e-05, "loss": 0.1385, "step": 1230 }, { "epoch": 19.375, "grad_norm": 1.2999491691589355, "learning_rate": 7.881944444444444e-05, "loss": 0.1242, "step": 1240 }, { "epoch": 19.53125, "grad_norm": 1.1871651411056519, "learning_rate": 7.766203703703704e-05, "loss": 0.113, "step": 1250 }, { "epoch": 19.6875, "grad_norm": 1.5129660367965698, "learning_rate": 7.650462962962963e-05, "loss": 0.1024, "step": 1260 }, { "epoch": 19.84375, "grad_norm": 0.7286781072616577, "learning_rate": 7.534722222222223e-05, "loss": 0.0994, "step": 1270 }, { "epoch": 20.0, "grad_norm": 1.0448476076126099, "learning_rate": 7.418981481481481e-05, "loss": 0.1115, "step": 1280 }, { "epoch": 20.15625, "grad_norm": 0.6149379014968872, "learning_rate": 7.303240740740741e-05, "loss": 0.1243, "step": 1290 }, { "epoch": 20.3125, "grad_norm": 0.7020682692527771, "learning_rate": 7.1875e-05, "loss": 0.1169, "step": 1300 }, { "epoch": 20.3125, "eval_accuracy": 0.9300884955752212, "eval_f1": 0.8167053364269141, "eval_loss": 0.19162432849407196, "eval_precision": 0.8585365853658536, "eval_recall": 0.7787610619469026, "eval_runtime": 0.958, "eval_samples_per_second": 117.951, "eval_steps_per_second": 15.657, "step": 1300 }, { "epoch": 20.46875, "grad_norm": 1.8554919958114624, "learning_rate": 7.07175925925926e-05, "loss": 0.1175, "step": 1310 }, { "epoch": 20.625, "grad_norm": 1.1227444410324097, "learning_rate": 6.956018518518518e-05, "loss": 0.1004, "step": 1320 }, { "epoch": 20.78125, "grad_norm": 1.376546025276184, "learning_rate": 6.840277777777778e-05, "loss": 0.1099, "step": 1330 }, { "epoch": 20.9375, "grad_norm": 0.86075758934021, "learning_rate": 6.724537037037037e-05, "loss": 0.133, "step": 1340 }, { "epoch": 21.09375, "grad_norm": 1.093257188796997, "learning_rate": 6.608796296296297e-05, "loss": 0.1143, "step": 1350 }, { "epoch": 21.25, "grad_norm": 0.5665271282196045, "learning_rate": 6.493055555555556e-05, "loss": 0.1065, "step": 1360 }, { "epoch": 21.40625, "grad_norm": 0.607912003993988, "learning_rate": 6.377314814814816e-05, "loss": 0.1221, "step": 1370 }, { "epoch": 21.5625, "grad_norm": 0.4708748161792755, "learning_rate": 6.261574074074074e-05, "loss": 0.0941, "step": 1380 }, { "epoch": 21.71875, "grad_norm": 0.8719390630722046, "learning_rate": 6.145833333333334e-05, "loss": 0.112, "step": 1390 }, { "epoch": 21.875, "grad_norm": 0.45299583673477173, "learning_rate": 6.0300925925925934e-05, "loss": 0.1094, "step": 1400 }, { "epoch": 21.875, "eval_accuracy": 0.9292035398230089, "eval_f1": 0.8181818181818182, "eval_loss": 0.19246041774749756, "eval_precision": 0.8411214953271028, "eval_recall": 0.7964601769911505, "eval_runtime": 1.2007, "eval_samples_per_second": 94.111, "eval_steps_per_second": 12.493, "step": 1400 }, { "epoch": 22.03125, "grad_norm": 0.7893074154853821, "learning_rate": 5.9143518518518527e-05, "loss": 0.1142, "step": 1410 }, { "epoch": 22.1875, "grad_norm": 1.0832182168960571, "learning_rate": 5.798611111111112e-05, "loss": 0.096, "step": 1420 }, { "epoch": 22.34375, "grad_norm": 0.5880953669548035, "learning_rate": 5.682870370370371e-05, "loss": 0.1142, "step": 1430 }, { "epoch": 22.5, "grad_norm": 0.6071570515632629, "learning_rate": 5.567129629629629e-05, "loss": 0.1146, "step": 1440 }, { "epoch": 22.65625, "grad_norm": 1.7968199253082275, "learning_rate": 5.4513888888888884e-05, "loss": 0.1141, "step": 1450 }, { "epoch": 22.8125, "grad_norm": 0.6409327983856201, "learning_rate": 5.335648148148148e-05, "loss": 0.0907, "step": 1460 }, { "epoch": 22.96875, "grad_norm": 1.28959321975708, "learning_rate": 5.219907407407407e-05, "loss": 0.0945, "step": 1470 }, { "epoch": 23.125, "grad_norm": 1.0384379625320435, "learning_rate": 5.115740740740741e-05, "loss": 0.1062, "step": 1480 }, { "epoch": 23.28125, "grad_norm": 0.8010191917419434, "learning_rate": 5e-05, "loss": 0.1029, "step": 1490 }, { "epoch": 23.4375, "grad_norm": 1.078620195388794, "learning_rate": 4.8842592592592595e-05, "loss": 0.1108, "step": 1500 }, { "epoch": 23.4375, "eval_accuracy": 0.9345132743362832, "eval_f1": 0.8333333333333334, "eval_loss": 0.19605357944965363, "eval_precision": 0.8486238532110092, "eval_recall": 0.8185840707964602, "eval_runtime": 0.9739, "eval_samples_per_second": 116.033, "eval_steps_per_second": 15.403, "step": 1500 }, { "epoch": 23.59375, "grad_norm": 0.9104486703872681, "learning_rate": 4.768518518518519e-05, "loss": 0.1039, "step": 1510 }, { "epoch": 23.75, "grad_norm": 1.1187772750854492, "learning_rate": 4.652777777777778e-05, "loss": 0.1179, "step": 1520 }, { "epoch": 23.90625, "grad_norm": 0.6038283109664917, "learning_rate": 4.5370370370370374e-05, "loss": 0.0981, "step": 1530 }, { "epoch": 24.0625, "grad_norm": 1.483780860900879, "learning_rate": 4.4212962962962966e-05, "loss": 0.1287, "step": 1540 }, { "epoch": 24.21875, "grad_norm": 0.9008955359458923, "learning_rate": 4.305555555555556e-05, "loss": 0.1038, "step": 1550 }, { "epoch": 24.375, "grad_norm": 1.3843752145767212, "learning_rate": 4.1898148148148145e-05, "loss": 0.1177, "step": 1560 }, { "epoch": 24.53125, "grad_norm": 1.3227291107177734, "learning_rate": 4.074074074074074e-05, "loss": 0.0992, "step": 1570 }, { "epoch": 24.6875, "grad_norm": 0.4530428349971771, "learning_rate": 3.958333333333333e-05, "loss": 0.1064, "step": 1580 }, { "epoch": 24.84375, "grad_norm": 1.476251244544983, "learning_rate": 3.8425925925925924e-05, "loss": 0.1008, "step": 1590 }, { "epoch": 25.0, "grad_norm": 1.4163908958435059, "learning_rate": 3.726851851851852e-05, "loss": 0.1089, "step": 1600 }, { "epoch": 25.0, "eval_accuracy": 0.9283185840707965, "eval_f1": 0.8171557562076749, "eval_loss": 0.1992984265089035, "eval_precision": 0.8341013824884793, "eval_recall": 0.8008849557522124, "eval_runtime": 0.9786, "eval_samples_per_second": 115.465, "eval_steps_per_second": 15.327, "step": 1600 }, { "epoch": 25.15625, "grad_norm": 0.805210292339325, "learning_rate": 3.611111111111111e-05, "loss": 0.0922, "step": 1610 }, { "epoch": 25.3125, "grad_norm": 0.7946519255638123, "learning_rate": 3.49537037037037e-05, "loss": 0.1033, "step": 1620 }, { "epoch": 25.46875, "grad_norm": 0.7051573991775513, "learning_rate": 3.3796296296296295e-05, "loss": 0.1031, "step": 1630 }, { "epoch": 25.625, "grad_norm": 0.6867948174476624, "learning_rate": 3.263888888888889e-05, "loss": 0.1203, "step": 1640 }, { "epoch": 25.78125, "grad_norm": 0.9575832486152649, "learning_rate": 3.148148148148148e-05, "loss": 0.101, "step": 1650 }, { "epoch": 25.9375, "grad_norm": 1.125503420829773, "learning_rate": 3.0324074074074077e-05, "loss": 0.0868, "step": 1660 }, { "epoch": 26.09375, "grad_norm": 0.694492757320404, "learning_rate": 2.916666666666667e-05, "loss": 0.0916, "step": 1670 }, { "epoch": 26.25, "grad_norm": 0.606955885887146, "learning_rate": 2.8009259259259263e-05, "loss": 0.0978, "step": 1680 }, { "epoch": 26.40625, "grad_norm": 0.855603814125061, "learning_rate": 2.6851851851851855e-05, "loss": 0.0988, "step": 1690 }, { "epoch": 26.5625, "grad_norm": 0.6119447946548462, "learning_rate": 2.5694444444444445e-05, "loss": 0.0919, "step": 1700 }, { "epoch": 26.5625, "eval_accuracy": 0.9318584070796461, "eval_f1": 0.8261851015801355, "eval_loss": 0.19360247254371643, "eval_precision": 0.8433179723502304, "eval_recall": 0.8097345132743363, "eval_runtime": 1.5343, "eval_samples_per_second": 73.651, "eval_steps_per_second": 9.777, "step": 1700 }, { "epoch": 26.71875, "grad_norm": 0.9873837828636169, "learning_rate": 2.4537037037037038e-05, "loss": 0.0829, "step": 1710 }, { "epoch": 26.875, "grad_norm": 0.9287075996398926, "learning_rate": 2.337962962962963e-05, "loss": 0.1128, "step": 1720 }, { "epoch": 27.03125, "grad_norm": 0.8201906681060791, "learning_rate": 2.2222222222222223e-05, "loss": 0.1084, "step": 1730 }, { "epoch": 27.1875, "grad_norm": 1.1874263286590576, "learning_rate": 2.1064814814814816e-05, "loss": 0.1065, "step": 1740 }, { "epoch": 27.34375, "grad_norm": 1.0616997480392456, "learning_rate": 1.990740740740741e-05, "loss": 0.1014, "step": 1750 }, { "epoch": 27.5, "grad_norm": 0.8941544890403748, "learning_rate": 1.8750000000000002e-05, "loss": 0.1008, "step": 1760 }, { "epoch": 27.65625, "grad_norm": 0.9521628022193909, "learning_rate": 1.7592592592592595e-05, "loss": 0.0908, "step": 1770 }, { "epoch": 27.8125, "grad_norm": 0.79527348279953, "learning_rate": 1.6435185185185187e-05, "loss": 0.1137, "step": 1780 }, { "epoch": 27.96875, "grad_norm": 0.8606336116790771, "learning_rate": 1.527777777777778e-05, "loss": 0.0929, "step": 1790 }, { "epoch": 28.125, "grad_norm": 1.2438716888427734, "learning_rate": 1.412037037037037e-05, "loss": 0.0969, "step": 1800 }, { "epoch": 28.125, "eval_accuracy": 0.9309734513274336, "eval_f1": 0.8227272727272726, "eval_loss": 0.19780634343624115, "eval_precision": 0.8457943925233645, "eval_recall": 0.8008849557522124, "eval_runtime": 0.9685, "eval_samples_per_second": 116.67, "eval_steps_per_second": 15.487, "step": 1800 }, { "epoch": 28.28125, "grad_norm": 0.9101031422615051, "learning_rate": 1.2962962962962962e-05, "loss": 0.0977, "step": 1810 }, { "epoch": 28.4375, "grad_norm": 0.6852589249610901, "learning_rate": 1.1805555555555555e-05, "loss": 0.1018, "step": 1820 }, { "epoch": 28.59375, "grad_norm": 0.8925357460975647, "learning_rate": 1.0648148148148148e-05, "loss": 0.0985, "step": 1830 }, { "epoch": 28.75, "grad_norm": 0.8219801783561707, "learning_rate": 9.490740740740741e-06, "loss": 0.1014, "step": 1840 }, { "epoch": 28.90625, "grad_norm": 0.6065762639045715, "learning_rate": 8.333333333333334e-06, "loss": 0.1161, "step": 1850 }, { "epoch": 29.0625, "grad_norm": 0.7718455791473389, "learning_rate": 7.1759259259259266e-06, "loss": 0.1101, "step": 1860 }, { "epoch": 29.21875, "grad_norm": 1.0950359106063843, "learning_rate": 6.0185185185185185e-06, "loss": 0.1042, "step": 1870 }, { "epoch": 29.375, "grad_norm": 0.7298617362976074, "learning_rate": 4.861111111111111e-06, "loss": 0.0823, "step": 1880 }, { "epoch": 29.53125, "grad_norm": 0.782146692276001, "learning_rate": 3.7037037037037037e-06, "loss": 0.1051, "step": 1890 }, { "epoch": 29.6875, "grad_norm": 0.7693170309066772, "learning_rate": 2.546296296296296e-06, "loss": 0.1093, "step": 1900 }, { "epoch": 29.6875, "eval_accuracy": 0.9283185840707965, "eval_f1": 0.8171557562076749, "eval_loss": 0.19546246528625488, "eval_precision": 0.8341013824884793, "eval_recall": 0.8008849557522124, "eval_runtime": 1.3969, "eval_samples_per_second": 80.891, "eval_steps_per_second": 10.738, "step": 1900 }, { "epoch": 29.84375, "grad_norm": 1.2105658054351807, "learning_rate": 1.388888888888889e-06, "loss": 0.0836, "step": 1910 }, { "epoch": 30.0, "grad_norm": 2.1491291522979736, "learning_rate": 2.3148148148148148e-07, "loss": 0.1122, "step": 1920 }, { "epoch": 30.0, "step": 1920, "total_flos": 1.9916656541540352e+17, "train_loss": 0.1991663834390541, "train_runtime": 388.9258, "train_samples_per_second": 78.061, "train_steps_per_second": 4.937 } ], "logging_steps": 10, "max_steps": 1920, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9916656541540352e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }