{ "best_metric": 0.8732394366197183, "best_model_checkpoint": "deit-base-distilled-patch16-224-65-fold4/checkpoint-188", "epoch": 92.3076923076923, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.9230769230769231, "eval_accuracy": 0.5492957746478874, "eval_loss": 0.7218557596206665, "eval_runtime": 0.9745, "eval_samples_per_second": 72.855, "eval_steps_per_second": 3.078, "step": 3 }, { "epoch": 1.8461538461538463, "eval_accuracy": 0.5492957746478874, "eval_loss": 0.686266303062439, "eval_runtime": 0.9669, "eval_samples_per_second": 73.432, "eval_steps_per_second": 3.103, "step": 6 }, { "epoch": 2.769230769230769, "eval_accuracy": 0.5492957746478874, "eval_loss": 0.6578378081321716, "eval_runtime": 0.9657, "eval_samples_per_second": 73.525, "eval_steps_per_second": 3.107, "step": 9 }, { "epoch": 3.076923076923077, "grad_norm": 3.424938678741455, "learning_rate": 1.6666666666666667e-05, "loss": 0.6825, "step": 10 }, { "epoch": 4.0, "eval_accuracy": 0.6338028169014085, "eval_loss": 0.6292961239814758, "eval_runtime": 0.9785, "eval_samples_per_second": 72.558, "eval_steps_per_second": 3.066, "step": 13 }, { "epoch": 4.923076923076923, "eval_accuracy": 0.676056338028169, "eval_loss": 0.6186050176620483, "eval_runtime": 0.987, "eval_samples_per_second": 71.936, "eval_steps_per_second": 3.04, "step": 16 }, { "epoch": 5.846153846153846, "eval_accuracy": 0.704225352112676, "eval_loss": 0.6135374903678894, "eval_runtime": 1.0113, "eval_samples_per_second": 70.204, "eval_steps_per_second": 2.966, "step": 19 }, { "epoch": 6.153846153846154, "grad_norm": 4.245552062988281, "learning_rate": 3.3333333333333335e-05, "loss": 0.6206, "step": 20 }, { "epoch": 6.769230769230769, "eval_accuracy": 0.647887323943662, "eval_loss": 0.6162843704223633, "eval_runtime": 0.9805, "eval_samples_per_second": 72.413, "eval_steps_per_second": 3.06, "step": 22 }, { "epoch": 8.0, "eval_accuracy": 0.647887323943662, "eval_loss": 0.6350468397140503, "eval_runtime": 0.9932, "eval_samples_per_second": 71.487, "eval_steps_per_second": 3.021, "step": 26 }, { "epoch": 8.923076923076923, "eval_accuracy": 0.6901408450704225, "eval_loss": 0.6078002452850342, "eval_runtime": 0.9899, "eval_samples_per_second": 71.723, "eval_steps_per_second": 3.031, "step": 29 }, { "epoch": 9.23076923076923, "grad_norm": 3.0329267978668213, "learning_rate": 5e-05, "loss": 0.5728, "step": 30 }, { "epoch": 9.846153846153847, "eval_accuracy": 0.676056338028169, "eval_loss": 0.6872609257698059, "eval_runtime": 0.9874, "eval_samples_per_second": 71.907, "eval_steps_per_second": 3.038, "step": 32 }, { "epoch": 10.76923076923077, "eval_accuracy": 0.676056338028169, "eval_loss": 0.6771360635757446, "eval_runtime": 0.984, "eval_samples_per_second": 72.152, "eval_steps_per_second": 3.049, "step": 35 }, { "epoch": 12.0, "eval_accuracy": 0.6619718309859155, "eval_loss": 0.5912012457847595, "eval_runtime": 0.9919, "eval_samples_per_second": 71.581, "eval_steps_per_second": 3.025, "step": 39 }, { "epoch": 12.307692307692308, "grad_norm": 5.794826030731201, "learning_rate": 4.814814814814815e-05, "loss": 0.5329, "step": 40 }, { "epoch": 12.923076923076923, "eval_accuracy": 0.7464788732394366, "eval_loss": 0.5523556470870972, "eval_runtime": 0.9889, "eval_samples_per_second": 71.799, "eval_steps_per_second": 3.034, "step": 42 }, { "epoch": 13.846153846153847, "eval_accuracy": 0.7183098591549296, "eval_loss": 0.5922580361366272, "eval_runtime": 0.9948, "eval_samples_per_second": 71.37, "eval_steps_per_second": 3.016, "step": 45 }, { "epoch": 14.76923076923077, "eval_accuracy": 0.676056338028169, "eval_loss": 0.6649972200393677, "eval_runtime": 0.9954, "eval_samples_per_second": 71.326, "eval_steps_per_second": 3.014, "step": 48 }, { "epoch": 15.384615384615385, "grad_norm": 4.871856689453125, "learning_rate": 4.62962962962963e-05, "loss": 0.4279, "step": 50 }, { "epoch": 16.0, "eval_accuracy": 0.7746478873239436, "eval_loss": 0.5182512402534485, "eval_runtime": 0.9953, "eval_samples_per_second": 71.338, "eval_steps_per_second": 3.014, "step": 52 }, { "epoch": 16.923076923076923, "eval_accuracy": 0.7887323943661971, "eval_loss": 0.4760647714138031, "eval_runtime": 0.9935, "eval_samples_per_second": 71.467, "eval_steps_per_second": 3.02, "step": 55 }, { "epoch": 17.846153846153847, "eval_accuracy": 0.7183098591549296, "eval_loss": 0.558963418006897, "eval_runtime": 0.9911, "eval_samples_per_second": 71.637, "eval_steps_per_second": 3.027, "step": 58 }, { "epoch": 18.46153846153846, "grad_norm": 4.168473243713379, "learning_rate": 4.4444444444444447e-05, "loss": 0.4055, "step": 60 }, { "epoch": 18.76923076923077, "eval_accuracy": 0.7464788732394366, "eval_loss": 0.5320470333099365, "eval_runtime": 1.0156, "eval_samples_per_second": 69.91, "eval_steps_per_second": 2.954, "step": 61 }, { "epoch": 20.0, "eval_accuracy": 0.7183098591549296, "eval_loss": 0.6604605913162231, "eval_runtime": 0.9924, "eval_samples_per_second": 71.546, "eval_steps_per_second": 3.023, "step": 65 }, { "epoch": 20.923076923076923, "eval_accuracy": 0.7464788732394366, "eval_loss": 0.5820997357368469, "eval_runtime": 0.9824, "eval_samples_per_second": 72.271, "eval_steps_per_second": 3.054, "step": 68 }, { "epoch": 21.53846153846154, "grad_norm": 5.980050086975098, "learning_rate": 4.259259259259259e-05, "loss": 0.3478, "step": 70 }, { "epoch": 21.846153846153847, "eval_accuracy": 0.7464788732394366, "eval_loss": 0.5588834881782532, "eval_runtime": 0.9895, "eval_samples_per_second": 71.75, "eval_steps_per_second": 3.032, "step": 71 }, { "epoch": 22.76923076923077, "eval_accuracy": 0.7464788732394366, "eval_loss": 0.6247106194496155, "eval_runtime": 0.9919, "eval_samples_per_second": 71.578, "eval_steps_per_second": 3.024, "step": 74 }, { "epoch": 24.0, "eval_accuracy": 0.6619718309859155, "eval_loss": 0.7006160020828247, "eval_runtime": 1.0036, "eval_samples_per_second": 70.747, "eval_steps_per_second": 2.989, "step": 78 }, { "epoch": 24.615384615384617, "grad_norm": 3.0665063858032227, "learning_rate": 4.074074074074074e-05, "loss": 0.3769, "step": 80 }, { "epoch": 24.923076923076923, "eval_accuracy": 0.7183098591549296, "eval_loss": 0.7575426697731018, "eval_runtime": 0.9915, "eval_samples_per_second": 71.611, "eval_steps_per_second": 3.026, "step": 81 }, { "epoch": 25.846153846153847, "eval_accuracy": 0.7746478873239436, "eval_loss": 0.5367358922958374, "eval_runtime": 0.991, "eval_samples_per_second": 71.646, "eval_steps_per_second": 3.027, "step": 84 }, { "epoch": 26.76923076923077, "eval_accuracy": 0.7746478873239436, "eval_loss": 0.533458948135376, "eval_runtime": 0.9946, "eval_samples_per_second": 71.388, "eval_steps_per_second": 3.016, "step": 87 }, { "epoch": 27.692307692307693, "grad_norm": 5.900879859924316, "learning_rate": 3.888888888888889e-05, "loss": 0.2957, "step": 90 }, { "epoch": 28.0, "eval_accuracy": 0.7605633802816901, "eval_loss": 0.5913785696029663, "eval_runtime": 1.0059, "eval_samples_per_second": 70.582, "eval_steps_per_second": 2.982, "step": 91 }, { "epoch": 28.923076923076923, "eval_accuracy": 0.7464788732394366, "eval_loss": 0.6780489683151245, "eval_runtime": 0.9943, "eval_samples_per_second": 71.404, "eval_steps_per_second": 3.017, "step": 94 }, { "epoch": 29.846153846153847, "eval_accuracy": 0.7746478873239436, "eval_loss": 0.534493625164032, "eval_runtime": 0.9982, "eval_samples_per_second": 71.129, "eval_steps_per_second": 3.005, "step": 97 }, { "epoch": 30.76923076923077, "grad_norm": 3.5530407428741455, "learning_rate": 3.7037037037037037e-05, "loss": 0.2463, "step": 100 }, { "epoch": 30.76923076923077, "eval_accuracy": 0.7605633802816901, "eval_loss": 0.613211989402771, "eval_runtime": 0.9924, "eval_samples_per_second": 71.54, "eval_steps_per_second": 3.023, "step": 100 }, { "epoch": 32.0, "eval_accuracy": 0.7887323943661971, "eval_loss": 0.5757858157157898, "eval_runtime": 0.9882, "eval_samples_per_second": 71.845, "eval_steps_per_second": 3.036, "step": 104 }, { "epoch": 32.92307692307692, "eval_accuracy": 0.7323943661971831, "eval_loss": 0.7235603332519531, "eval_runtime": 0.9977, "eval_samples_per_second": 71.165, "eval_steps_per_second": 3.007, "step": 107 }, { "epoch": 33.84615384615385, "grad_norm": 4.502363204956055, "learning_rate": 3.518518518518519e-05, "loss": 0.2323, "step": 110 }, { "epoch": 33.84615384615385, "eval_accuracy": 0.8169014084507042, "eval_loss": 0.5247332453727722, "eval_runtime": 1.0185, "eval_samples_per_second": 69.714, "eval_steps_per_second": 2.946, "step": 110 }, { "epoch": 34.76923076923077, "eval_accuracy": 0.7183098591549296, "eval_loss": 0.6018014550209045, "eval_runtime": 0.994, "eval_samples_per_second": 71.429, "eval_steps_per_second": 3.018, "step": 113 }, { "epoch": 36.0, "eval_accuracy": 0.8028169014084507, "eval_loss": 0.5366302728652954, "eval_runtime": 0.9895, "eval_samples_per_second": 71.751, "eval_steps_per_second": 3.032, "step": 117 }, { "epoch": 36.92307692307692, "grad_norm": 1.8001881837844849, "learning_rate": 3.3333333333333335e-05, "loss": 0.1921, "step": 120 }, { "epoch": 36.92307692307692, "eval_accuracy": 0.7464788732394366, "eval_loss": 0.6314247846603394, "eval_runtime": 0.9859, "eval_samples_per_second": 72.013, "eval_steps_per_second": 3.043, "step": 120 }, { "epoch": 37.84615384615385, "eval_accuracy": 0.7746478873239436, "eval_loss": 0.5763274431228638, "eval_runtime": 0.9986, "eval_samples_per_second": 71.096, "eval_steps_per_second": 3.004, "step": 123 }, { "epoch": 38.76923076923077, "eval_accuracy": 0.8028169014084507, "eval_loss": 0.5573564171791077, "eval_runtime": 0.9999, "eval_samples_per_second": 71.009, "eval_steps_per_second": 3.0, "step": 126 }, { "epoch": 40.0, "grad_norm": 2.9866695404052734, "learning_rate": 3.148148148148148e-05, "loss": 0.1686, "step": 130 }, { "epoch": 40.0, "eval_accuracy": 0.7887323943661971, "eval_loss": 0.6260572075843811, "eval_runtime": 1.0023, "eval_samples_per_second": 70.835, "eval_steps_per_second": 2.993, "step": 130 }, { "epoch": 40.92307692307692, "eval_accuracy": 0.7887323943661971, "eval_loss": 0.6524580121040344, "eval_runtime": 0.9907, "eval_samples_per_second": 71.669, "eval_steps_per_second": 3.028, "step": 133 }, { "epoch": 41.84615384615385, "eval_accuracy": 0.8169014084507042, "eval_loss": 0.5726364254951477, "eval_runtime": 0.9944, "eval_samples_per_second": 71.399, "eval_steps_per_second": 3.017, "step": 136 }, { "epoch": 42.76923076923077, "eval_accuracy": 0.704225352112676, "eval_loss": 0.8199837803840637, "eval_runtime": 0.9956, "eval_samples_per_second": 71.317, "eval_steps_per_second": 3.013, "step": 139 }, { "epoch": 43.07692307692308, "grad_norm": 8.961894035339355, "learning_rate": 2.962962962962963e-05, "loss": 0.2073, "step": 140 }, { "epoch": 44.0, "eval_accuracy": 0.7887323943661971, "eval_loss": 0.4798416793346405, "eval_runtime": 0.9976, "eval_samples_per_second": 71.172, "eval_steps_per_second": 3.007, "step": 143 }, { "epoch": 44.92307692307692, "eval_accuracy": 0.7887323943661971, "eval_loss": 0.5341726541519165, "eval_runtime": 0.9976, "eval_samples_per_second": 71.169, "eval_steps_per_second": 3.007, "step": 146 }, { "epoch": 45.84615384615385, "eval_accuracy": 0.7887323943661971, "eval_loss": 0.48342809081077576, "eval_runtime": 0.9955, "eval_samples_per_second": 71.324, "eval_steps_per_second": 3.014, "step": 149 }, { "epoch": 46.15384615384615, "grad_norm": 6.036099433898926, "learning_rate": 2.777777777777778e-05, "loss": 0.1702, "step": 150 }, { "epoch": 46.76923076923077, "eval_accuracy": 0.7464788732394366, "eval_loss": 0.6100845336914062, "eval_runtime": 1.0014, "eval_samples_per_second": 70.903, "eval_steps_per_second": 2.996, "step": 152 }, { "epoch": 48.0, "eval_accuracy": 0.8169014084507042, "eval_loss": 0.47786733508110046, "eval_runtime": 0.9939, "eval_samples_per_second": 71.433, "eval_steps_per_second": 3.018, "step": 156 }, { "epoch": 48.92307692307692, "eval_accuracy": 0.7887323943661971, "eval_loss": 0.5047819018363953, "eval_runtime": 0.9956, "eval_samples_per_second": 71.313, "eval_steps_per_second": 3.013, "step": 159 }, { "epoch": 49.23076923076923, "grad_norm": 3.7059566974639893, "learning_rate": 2.5925925925925925e-05, "loss": 0.153, "step": 160 }, { "epoch": 49.84615384615385, "eval_accuracy": 0.7464788732394366, "eval_loss": 0.6297626495361328, "eval_runtime": 0.9916, "eval_samples_per_second": 71.6, "eval_steps_per_second": 3.025, "step": 162 }, { "epoch": 50.76923076923077, "eval_accuracy": 0.7605633802816901, "eval_loss": 0.5995147824287415, "eval_runtime": 0.9924, "eval_samples_per_second": 71.541, "eval_steps_per_second": 3.023, "step": 165 }, { "epoch": 52.0, "eval_accuracy": 0.704225352112676, "eval_loss": 0.6475109457969666, "eval_runtime": 0.9899, "eval_samples_per_second": 71.724, "eval_steps_per_second": 3.031, "step": 169 }, { "epoch": 52.30769230769231, "grad_norm": 2.797576427459717, "learning_rate": 2.4074074074074074e-05, "loss": 0.1508, "step": 170 }, { "epoch": 52.92307692307692, "eval_accuracy": 0.8028169014084507, "eval_loss": 0.48875874280929565, "eval_runtime": 1.0078, "eval_samples_per_second": 70.449, "eval_steps_per_second": 2.977, "step": 172 }, { "epoch": 53.84615384615385, "eval_accuracy": 0.8309859154929577, "eval_loss": 0.4953955411911011, "eval_runtime": 0.9916, "eval_samples_per_second": 71.604, "eval_steps_per_second": 3.026, "step": 175 }, { "epoch": 54.76923076923077, "eval_accuracy": 0.8028169014084507, "eval_loss": 0.43896353244781494, "eval_runtime": 0.9963, "eval_samples_per_second": 71.26, "eval_steps_per_second": 3.011, "step": 178 }, { "epoch": 55.38461538461539, "grad_norm": 2.899003744125366, "learning_rate": 2.2222222222222223e-05, "loss": 0.1293, "step": 180 }, { "epoch": 56.0, "eval_accuracy": 0.8591549295774648, "eval_loss": 0.477837473154068, "eval_runtime": 0.9883, "eval_samples_per_second": 71.838, "eval_steps_per_second": 3.035, "step": 182 }, { "epoch": 56.92307692307692, "eval_accuracy": 0.8309859154929577, "eval_loss": 0.4887799024581909, "eval_runtime": 0.9913, "eval_samples_per_second": 71.622, "eval_steps_per_second": 3.026, "step": 185 }, { "epoch": 57.84615384615385, "eval_accuracy": 0.8732394366197183, "eval_loss": 0.48321232199668884, "eval_runtime": 0.9931, "eval_samples_per_second": 71.494, "eval_steps_per_second": 3.021, "step": 188 }, { "epoch": 58.46153846153846, "grad_norm": 2.5362701416015625, "learning_rate": 2.037037037037037e-05, "loss": 0.1489, "step": 190 }, { "epoch": 58.76923076923077, "eval_accuracy": 0.8309859154929577, "eval_loss": 0.527686595916748, "eval_runtime": 0.981, "eval_samples_per_second": 72.378, "eval_steps_per_second": 3.058, "step": 191 }, { "epoch": 60.0, "eval_accuracy": 0.7323943661971831, "eval_loss": 0.6216662526130676, "eval_runtime": 0.9828, "eval_samples_per_second": 72.243, "eval_steps_per_second": 3.053, "step": 195 }, { "epoch": 60.92307692307692, "eval_accuracy": 0.7464788732394366, "eval_loss": 0.6089593768119812, "eval_runtime": 0.9925, "eval_samples_per_second": 71.539, "eval_steps_per_second": 3.023, "step": 198 }, { "epoch": 61.53846153846154, "grad_norm": 2.5179524421691895, "learning_rate": 1.8518518518518518e-05, "loss": 0.1487, "step": 200 }, { "epoch": 61.84615384615385, "eval_accuracy": 0.8450704225352113, "eval_loss": 0.5424482226371765, "eval_runtime": 0.9877, "eval_samples_per_second": 71.886, "eval_steps_per_second": 3.037, "step": 201 }, { "epoch": 62.76923076923077, "eval_accuracy": 0.8450704225352113, "eval_loss": 0.5569549202919006, "eval_runtime": 0.9873, "eval_samples_per_second": 71.913, "eval_steps_per_second": 3.039, "step": 204 }, { "epoch": 64.0, "eval_accuracy": 0.7183098591549296, "eval_loss": 0.7247959971427917, "eval_runtime": 0.9969, "eval_samples_per_second": 71.22, "eval_steps_per_second": 3.009, "step": 208 }, { "epoch": 64.61538461538461, "grad_norm": 3.1306471824645996, "learning_rate": 1.6666666666666667e-05, "loss": 0.1456, "step": 210 }, { "epoch": 64.92307692307692, "eval_accuracy": 0.7887323943661971, "eval_loss": 0.5840876698493958, "eval_runtime": 1.0357, "eval_samples_per_second": 68.554, "eval_steps_per_second": 2.897, "step": 211 }, { "epoch": 65.84615384615384, "eval_accuracy": 0.8309859154929577, "eval_loss": 0.5905009508132935, "eval_runtime": 1.1058, "eval_samples_per_second": 64.208, "eval_steps_per_second": 2.713, "step": 214 }, { "epoch": 66.76923076923077, "eval_accuracy": 0.8309859154929577, "eval_loss": 0.5608553290367126, "eval_runtime": 1.0367, "eval_samples_per_second": 68.484, "eval_steps_per_second": 2.894, "step": 217 }, { "epoch": 67.6923076923077, "grad_norm": 2.4782862663269043, "learning_rate": 1.4814814814814815e-05, "loss": 0.1284, "step": 220 }, { "epoch": 68.0, "eval_accuracy": 0.8028169014084507, "eval_loss": 0.546994149684906, "eval_runtime": 0.9751, "eval_samples_per_second": 72.81, "eval_steps_per_second": 3.076, "step": 221 }, { "epoch": 68.92307692307692, "eval_accuracy": 0.8309859154929577, "eval_loss": 0.5472754240036011, "eval_runtime": 1.0047, "eval_samples_per_second": 70.665, "eval_steps_per_second": 2.986, "step": 224 }, { "epoch": 69.84615384615384, "eval_accuracy": 0.8309859154929577, "eval_loss": 0.5813232660293579, "eval_runtime": 0.9977, "eval_samples_per_second": 71.162, "eval_steps_per_second": 3.007, "step": 227 }, { "epoch": 70.76923076923077, "grad_norm": 1.8887909650802612, "learning_rate": 1.2962962962962962e-05, "loss": 0.1225, "step": 230 }, { "epoch": 70.76923076923077, "eval_accuracy": 0.8450704225352113, "eval_loss": 0.5682988166809082, "eval_runtime": 0.9949, "eval_samples_per_second": 71.367, "eval_steps_per_second": 3.016, "step": 230 }, { "epoch": 72.0, "eval_accuracy": 0.8309859154929577, "eval_loss": 0.5581419467926025, "eval_runtime": 0.9914, "eval_samples_per_second": 71.615, "eval_steps_per_second": 3.026, "step": 234 }, { "epoch": 72.92307692307692, "eval_accuracy": 0.7887323943661971, "eval_loss": 0.5717195272445679, "eval_runtime": 0.976, "eval_samples_per_second": 72.745, "eval_steps_per_second": 3.074, "step": 237 }, { "epoch": 73.84615384615384, "grad_norm": 2.5398831367492676, "learning_rate": 1.1111111111111112e-05, "loss": 0.1233, "step": 240 }, { "epoch": 73.84615384615384, "eval_accuracy": 0.7605633802816901, "eval_loss": 0.6054385304450989, "eval_runtime": 0.9717, "eval_samples_per_second": 73.067, "eval_steps_per_second": 3.087, "step": 240 }, { "epoch": 74.76923076923077, "eval_accuracy": 0.7887323943661971, "eval_loss": 0.5910421013832092, "eval_runtime": 0.9862, "eval_samples_per_second": 71.995, "eval_steps_per_second": 3.042, "step": 243 }, { "epoch": 76.0, "eval_accuracy": 0.8169014084507042, "eval_loss": 0.570685863494873, "eval_runtime": 0.9929, "eval_samples_per_second": 71.507, "eval_steps_per_second": 3.021, "step": 247 }, { "epoch": 76.92307692307692, "grad_norm": 2.284491539001465, "learning_rate": 9.259259259259259e-06, "loss": 0.1234, "step": 250 }, { "epoch": 76.92307692307692, "eval_accuracy": 0.8028169014084507, "eval_loss": 0.5732807517051697, "eval_runtime": 0.9877, "eval_samples_per_second": 71.884, "eval_steps_per_second": 3.037, "step": 250 }, { "epoch": 77.84615384615384, "eval_accuracy": 0.8028169014084507, "eval_loss": 0.5748334527015686, "eval_runtime": 0.9897, "eval_samples_per_second": 71.74, "eval_steps_per_second": 3.031, "step": 253 }, { "epoch": 78.76923076923077, "eval_accuracy": 0.7887323943661971, "eval_loss": 0.5722648501396179, "eval_runtime": 0.9949, "eval_samples_per_second": 71.363, "eval_steps_per_second": 3.015, "step": 256 }, { "epoch": 80.0, "grad_norm": 4.123226165771484, "learning_rate": 7.4074074074074075e-06, "loss": 0.1219, "step": 260 }, { "epoch": 80.0, "eval_accuracy": 0.8169014084507042, "eval_loss": 0.5503106713294983, "eval_runtime": 1.0099, "eval_samples_per_second": 70.307, "eval_steps_per_second": 2.971, "step": 260 }, { "epoch": 80.92307692307692, "eval_accuracy": 0.8028169014084507, "eval_loss": 0.5531720519065857, "eval_runtime": 0.978, "eval_samples_per_second": 72.6, "eval_steps_per_second": 3.068, "step": 263 }, { "epoch": 81.84615384615384, "eval_accuracy": 0.7746478873239436, "eval_loss": 0.5827966332435608, "eval_runtime": 0.9983, "eval_samples_per_second": 71.119, "eval_steps_per_second": 3.005, "step": 266 }, { "epoch": 82.76923076923077, "eval_accuracy": 0.7746478873239436, "eval_loss": 0.6061961650848389, "eval_runtime": 0.98, "eval_samples_per_second": 72.449, "eval_steps_per_second": 3.061, "step": 269 }, { "epoch": 83.07692307692308, "grad_norm": 2.491278886795044, "learning_rate": 5.555555555555556e-06, "loss": 0.1075, "step": 270 }, { "epoch": 84.0, "eval_accuracy": 0.8028169014084507, "eval_loss": 0.5751623511314392, "eval_runtime": 1.0114, "eval_samples_per_second": 70.2, "eval_steps_per_second": 2.966, "step": 273 }, { "epoch": 84.92307692307692, "eval_accuracy": 0.8028169014084507, "eval_loss": 0.5748041868209839, "eval_runtime": 0.989, "eval_samples_per_second": 71.791, "eval_steps_per_second": 3.033, "step": 276 }, { "epoch": 85.84615384615384, "eval_accuracy": 0.8169014084507042, "eval_loss": 0.5776336789131165, "eval_runtime": 0.9954, "eval_samples_per_second": 71.325, "eval_steps_per_second": 3.014, "step": 279 }, { "epoch": 86.15384615384616, "grad_norm": 2.633176565170288, "learning_rate": 3.7037037037037037e-06, "loss": 0.1013, "step": 280 }, { "epoch": 86.76923076923077, "eval_accuracy": 0.8028169014084507, "eval_loss": 0.5844343304634094, "eval_runtime": 0.9859, "eval_samples_per_second": 72.012, "eval_steps_per_second": 3.043, "step": 282 }, { "epoch": 88.0, "eval_accuracy": 0.8028169014084507, "eval_loss": 0.5929856896400452, "eval_runtime": 1.0276, "eval_samples_per_second": 69.094, "eval_steps_per_second": 2.919, "step": 286 }, { "epoch": 88.92307692307692, "eval_accuracy": 0.7887323943661971, "eval_loss": 0.6019977331161499, "eval_runtime": 0.9928, "eval_samples_per_second": 71.514, "eval_steps_per_second": 3.022, "step": 289 }, { "epoch": 89.23076923076923, "grad_norm": 3.9098527431488037, "learning_rate": 1.8518518518518519e-06, "loss": 0.1092, "step": 290 }, { "epoch": 89.84615384615384, "eval_accuracy": 0.7746478873239436, "eval_loss": 0.6055042147636414, "eval_runtime": 1.004, "eval_samples_per_second": 70.718, "eval_steps_per_second": 2.988, "step": 292 }, { "epoch": 90.76923076923077, "eval_accuracy": 0.7746478873239436, "eval_loss": 0.6074590086936951, "eval_runtime": 0.9928, "eval_samples_per_second": 71.517, "eval_steps_per_second": 3.022, "step": 295 }, { "epoch": 92.0, "eval_accuracy": 0.7746478873239436, "eval_loss": 0.607978880405426, "eval_runtime": 0.9917, "eval_samples_per_second": 71.596, "eval_steps_per_second": 3.025, "step": 299 }, { "epoch": 92.3076923076923, "grad_norm": 2.8199360370635986, "learning_rate": 0.0, "loss": 0.1096, "step": 300 }, { "epoch": 92.3076923076923, "eval_accuracy": 0.7746478873239436, "eval_loss": 0.6081295013427734, "eval_runtime": 1.0121, "eval_samples_per_second": 70.154, "eval_steps_per_second": 2.964, "step": 300 }, { "epoch": 92.3076923076923, "step": 300, "total_flos": 2.8402872494292173e+18, "train_loss": 0.24676756938298544, "train_runtime": 1640.323, "train_samples_per_second": 24.203, "train_steps_per_second": 0.183 }, { "epoch": 92.3076923076923, "eval_accuracy": 0.8732394366197183, "eval_loss": 0.48321232199668884, "eval_runtime": 1.0328, "eval_samples_per_second": 68.746, "eval_steps_per_second": 2.905, "step": 300 } ], "logging_steps": 10, "max_steps": 300, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8402872494292173e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }