|
{ |
|
"best_metric": 0.8481012658227848, |
|
"best_model_checkpoint": "beit-base-patch16-224-fold1/checkpoint-248", |
|
"epoch": 85.71428571428571, |
|
"eval_steps": 500, |
|
"global_step": 300, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.8571428571428571, |
|
"eval_accuracy": 0.45569620253164556, |
|
"eval_loss": 0.8050068020820618, |
|
"eval_runtime": 20.5223, |
|
"eval_samples_per_second": 3.849, |
|
"eval_steps_per_second": 0.146, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.569620253164557, |
|
"eval_loss": 0.7151382565498352, |
|
"eval_runtime": 1.3733, |
|
"eval_samples_per_second": 57.526, |
|
"eval_steps_per_second": 2.185, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 4.684664249420166, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.8103, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"eval_accuracy": 0.5569620253164557, |
|
"eval_loss": 0.6821601390838623, |
|
"eval_runtime": 1.4197, |
|
"eval_samples_per_second": 55.644, |
|
"eval_steps_per_second": 2.113, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.569620253164557, |
|
"eval_loss": 0.640774667263031, |
|
"eval_runtime": 1.4465, |
|
"eval_samples_per_second": 54.616, |
|
"eval_steps_per_second": 2.074, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 4.857142857142857, |
|
"eval_accuracy": 0.6708860759493671, |
|
"eval_loss": 0.6244170069694519, |
|
"eval_runtime": 1.5149, |
|
"eval_samples_per_second": 52.147, |
|
"eval_steps_per_second": 1.98, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 5.714285714285714, |
|
"grad_norm": 2.6553750038146973, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.6583, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.6708860759493671, |
|
"eval_loss": 0.5892533659934998, |
|
"eval_runtime": 1.448, |
|
"eval_samples_per_second": 54.56, |
|
"eval_steps_per_second": 2.072, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 6.857142857142857, |
|
"eval_accuracy": 0.6329113924050633, |
|
"eval_loss": 0.5876858234405518, |
|
"eval_runtime": 1.4555, |
|
"eval_samples_per_second": 54.277, |
|
"eval_steps_per_second": 2.061, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.6835443037974683, |
|
"eval_loss": 0.5752159953117371, |
|
"eval_runtime": 1.4455, |
|
"eval_samples_per_second": 54.653, |
|
"eval_steps_per_second": 2.075, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 8.571428571428571, |
|
"grad_norm": 8.61998176574707, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5912, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 8.857142857142858, |
|
"eval_accuracy": 0.6455696202531646, |
|
"eval_loss": 0.5825986862182617, |
|
"eval_runtime": 1.4574, |
|
"eval_samples_per_second": 54.204, |
|
"eval_steps_per_second": 2.058, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.6835443037974683, |
|
"eval_loss": 0.5469183325767517, |
|
"eval_runtime": 1.4759, |
|
"eval_samples_per_second": 53.528, |
|
"eval_steps_per_second": 2.033, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 10.857142857142858, |
|
"eval_accuracy": 0.6582278481012658, |
|
"eval_loss": 0.6173216700553894, |
|
"eval_runtime": 1.4368, |
|
"eval_samples_per_second": 54.985, |
|
"eval_steps_per_second": 2.088, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 11.428571428571429, |
|
"grad_norm": 4.089001655578613, |
|
"learning_rate": 4.814814814814815e-05, |
|
"loss": 0.5301, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.6962025316455697, |
|
"eval_loss": 0.51507169008255, |
|
"eval_runtime": 1.4391, |
|
"eval_samples_per_second": 54.894, |
|
"eval_steps_per_second": 2.085, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 12.857142857142858, |
|
"eval_accuracy": 0.6962025316455697, |
|
"eval_loss": 0.5105239152908325, |
|
"eval_runtime": 1.5017, |
|
"eval_samples_per_second": 52.608, |
|
"eval_steps_per_second": 1.998, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.7088607594936709, |
|
"eval_loss": 0.5488570928573608, |
|
"eval_runtime": 1.4299, |
|
"eval_samples_per_second": 55.25, |
|
"eval_steps_per_second": 2.098, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 14.285714285714286, |
|
"grad_norm": 5.854975700378418, |
|
"learning_rate": 4.62962962962963e-05, |
|
"loss": 0.4703, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 14.857142857142858, |
|
"eval_accuracy": 0.6835443037974683, |
|
"eval_loss": 0.5724519491195679, |
|
"eval_runtime": 1.4269, |
|
"eval_samples_per_second": 55.364, |
|
"eval_steps_per_second": 2.102, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.6962025316455697, |
|
"eval_loss": 0.5559752583503723, |
|
"eval_runtime": 1.4698, |
|
"eval_samples_per_second": 53.75, |
|
"eval_steps_per_second": 2.041, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 16.857142857142858, |
|
"eval_accuracy": 0.6708860759493671, |
|
"eval_loss": 0.5824136137962341, |
|
"eval_runtime": 1.4546, |
|
"eval_samples_per_second": 54.312, |
|
"eval_steps_per_second": 2.062, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 17.142857142857142, |
|
"grad_norm": 6.086174964904785, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.4189, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.7468354430379747, |
|
"eval_loss": 0.540145754814148, |
|
"eval_runtime": 1.4352, |
|
"eval_samples_per_second": 55.046, |
|
"eval_steps_per_second": 2.09, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 18.857142857142858, |
|
"eval_accuracy": 0.7721518987341772, |
|
"eval_loss": 0.514731764793396, |
|
"eval_runtime": 1.4503, |
|
"eval_samples_per_second": 54.473, |
|
"eval_steps_per_second": 2.069, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 8.862787246704102, |
|
"learning_rate": 4.259259259259259e-05, |
|
"loss": 0.3741, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.759493670886076, |
|
"eval_loss": 0.48641237616539, |
|
"eval_runtime": 1.4392, |
|
"eval_samples_per_second": 54.891, |
|
"eval_steps_per_second": 2.084, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 20.857142857142858, |
|
"eval_accuracy": 0.7341772151898734, |
|
"eval_loss": 0.5272199511528015, |
|
"eval_runtime": 1.461, |
|
"eval_samples_per_second": 54.072, |
|
"eval_steps_per_second": 2.053, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.7468354430379747, |
|
"eval_loss": 0.49136921763420105, |
|
"eval_runtime": 1.4904, |
|
"eval_samples_per_second": 53.005, |
|
"eval_steps_per_second": 2.013, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 22.857142857142858, |
|
"grad_norm": 8.650327682495117, |
|
"learning_rate": 4.074074074074074e-05, |
|
"loss": 0.387, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 22.857142857142858, |
|
"eval_accuracy": 0.7468354430379747, |
|
"eval_loss": 0.5658156275749207, |
|
"eval_runtime": 1.498, |
|
"eval_samples_per_second": 52.738, |
|
"eval_steps_per_second": 2.003, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.7721518987341772, |
|
"eval_loss": 0.4662097096443176, |
|
"eval_runtime": 1.512, |
|
"eval_samples_per_second": 52.249, |
|
"eval_steps_per_second": 1.984, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 24.857142857142858, |
|
"eval_accuracy": 0.7848101265822784, |
|
"eval_loss": 0.4376372694969177, |
|
"eval_runtime": 1.5044, |
|
"eval_samples_per_second": 52.514, |
|
"eval_steps_per_second": 1.994, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 25.714285714285715, |
|
"grad_norm": 6.057330131530762, |
|
"learning_rate": 3.888888888888889e-05, |
|
"loss": 0.3502, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.7721518987341772, |
|
"eval_loss": 0.5366873145103455, |
|
"eval_runtime": 1.5039, |
|
"eval_samples_per_second": 52.529, |
|
"eval_steps_per_second": 1.995, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 26.857142857142858, |
|
"eval_accuracy": 0.7341772151898734, |
|
"eval_loss": 0.5490015745162964, |
|
"eval_runtime": 1.4224, |
|
"eval_samples_per_second": 55.541, |
|
"eval_steps_per_second": 2.109, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.7721518987341772, |
|
"eval_loss": 0.7162956595420837, |
|
"eval_runtime": 1.4548, |
|
"eval_samples_per_second": 54.303, |
|
"eval_steps_per_second": 2.062, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 28.571428571428573, |
|
"grad_norm": 6.062076568603516, |
|
"learning_rate": 3.7037037037037037e-05, |
|
"loss": 0.3148, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 28.857142857142858, |
|
"eval_accuracy": 0.7468354430379747, |
|
"eval_loss": 0.6004660129547119, |
|
"eval_runtime": 1.4277, |
|
"eval_samples_per_second": 55.333, |
|
"eval_steps_per_second": 2.101, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.7721518987341772, |
|
"eval_loss": 0.6500609517097473, |
|
"eval_runtime": 1.4701, |
|
"eval_samples_per_second": 53.739, |
|
"eval_steps_per_second": 2.041, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 30.857142857142858, |
|
"eval_accuracy": 0.7974683544303798, |
|
"eval_loss": 0.5312591791152954, |
|
"eval_runtime": 1.4996, |
|
"eval_samples_per_second": 52.68, |
|
"eval_steps_per_second": 2.001, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 31.428571428571427, |
|
"grad_norm": 5.8153252601623535, |
|
"learning_rate": 3.518518518518519e-05, |
|
"loss": 0.2973, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.7721518987341772, |
|
"eval_loss": 0.5466110706329346, |
|
"eval_runtime": 1.5101, |
|
"eval_samples_per_second": 52.314, |
|
"eval_steps_per_second": 1.987, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 32.857142857142854, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.5730607509613037, |
|
"eval_runtime": 1.4879, |
|
"eval_samples_per_second": 53.094, |
|
"eval_steps_per_second": 2.016, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.6543712615966797, |
|
"eval_runtime": 1.4649, |
|
"eval_samples_per_second": 53.927, |
|
"eval_steps_per_second": 2.048, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 34.285714285714285, |
|
"grad_norm": 5.931222438812256, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.2474, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 34.857142857142854, |
|
"eval_accuracy": 0.7848101265822784, |
|
"eval_loss": 0.6060739159584045, |
|
"eval_runtime": 1.4417, |
|
"eval_samples_per_second": 54.798, |
|
"eval_steps_per_second": 2.081, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.7721518987341772, |
|
"eval_loss": 0.5815550684928894, |
|
"eval_runtime": 1.4713, |
|
"eval_samples_per_second": 53.693, |
|
"eval_steps_per_second": 2.039, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 36.857142857142854, |
|
"eval_accuracy": 0.759493670886076, |
|
"eval_loss": 0.7160954475402832, |
|
"eval_runtime": 1.5016, |
|
"eval_samples_per_second": 52.612, |
|
"eval_steps_per_second": 1.998, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 37.142857142857146, |
|
"grad_norm": 5.137592792510986, |
|
"learning_rate": 3.148148148148148e-05, |
|
"loss": 0.2033, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_accuracy": 0.7848101265822784, |
|
"eval_loss": 0.6234713196754456, |
|
"eval_runtime": 1.4848, |
|
"eval_samples_per_second": 53.205, |
|
"eval_steps_per_second": 2.02, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 38.857142857142854, |
|
"eval_accuracy": 0.759493670886076, |
|
"eval_loss": 0.7888889312744141, |
|
"eval_runtime": 1.4207, |
|
"eval_samples_per_second": 55.607, |
|
"eval_steps_per_second": 2.112, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 5.441008567810059, |
|
"learning_rate": 2.962962962962963e-05, |
|
"loss": 0.2338, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 0.759493670886076, |
|
"eval_loss": 0.5943406224250793, |
|
"eval_runtime": 1.4387, |
|
"eval_samples_per_second": 54.911, |
|
"eval_steps_per_second": 2.085, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 40.857142857142854, |
|
"eval_accuracy": 0.7341772151898734, |
|
"eval_loss": 0.6169915795326233, |
|
"eval_runtime": 1.4387, |
|
"eval_samples_per_second": 54.909, |
|
"eval_steps_per_second": 2.085, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_accuracy": 0.6962025316455697, |
|
"eval_loss": 0.6963752508163452, |
|
"eval_runtime": 1.4482, |
|
"eval_samples_per_second": 54.55, |
|
"eval_steps_per_second": 2.072, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 42.857142857142854, |
|
"grad_norm": 6.6485161781311035, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 0.2067, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 42.857142857142854, |
|
"eval_accuracy": 0.7468354430379747, |
|
"eval_loss": 0.7153680324554443, |
|
"eval_runtime": 1.574, |
|
"eval_samples_per_second": 50.19, |
|
"eval_steps_per_second": 1.906, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_accuracy": 0.7721518987341772, |
|
"eval_loss": 0.767503559589386, |
|
"eval_runtime": 1.4899, |
|
"eval_samples_per_second": 53.025, |
|
"eval_steps_per_second": 2.014, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 44.857142857142854, |
|
"eval_accuracy": 0.7468354430379747, |
|
"eval_loss": 0.7765600681304932, |
|
"eval_runtime": 1.4794, |
|
"eval_samples_per_second": 53.4, |
|
"eval_steps_per_second": 2.028, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 45.714285714285715, |
|
"grad_norm": 6.1349005699157715, |
|
"learning_rate": 2.5925925925925925e-05, |
|
"loss": 0.2133, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_accuracy": 0.7848101265822784, |
|
"eval_loss": 0.932968258857727, |
|
"eval_runtime": 1.4465, |
|
"eval_samples_per_second": 54.616, |
|
"eval_steps_per_second": 2.074, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 46.857142857142854, |
|
"eval_accuracy": 0.7974683544303798, |
|
"eval_loss": 0.64939284324646, |
|
"eval_runtime": 1.4334, |
|
"eval_samples_per_second": 55.113, |
|
"eval_steps_per_second": 2.093, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_accuracy": 0.7721518987341772, |
|
"eval_loss": 0.5709493160247803, |
|
"eval_runtime": 1.4722, |
|
"eval_samples_per_second": 53.662, |
|
"eval_steps_per_second": 2.038, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 48.57142857142857, |
|
"grad_norm": 3.4344000816345215, |
|
"learning_rate": 2.4074074074074074e-05, |
|
"loss": 0.2004, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 48.857142857142854, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.6462149620056152, |
|
"eval_runtime": 1.5036, |
|
"eval_samples_per_second": 52.54, |
|
"eval_steps_per_second": 1.995, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_accuracy": 0.7721518987341772, |
|
"eval_loss": 0.6667977571487427, |
|
"eval_runtime": 1.5326, |
|
"eval_samples_per_second": 51.547, |
|
"eval_steps_per_second": 1.957, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 50.857142857142854, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.6305052638053894, |
|
"eval_runtime": 1.4376, |
|
"eval_samples_per_second": 54.953, |
|
"eval_steps_per_second": 2.087, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 51.42857142857143, |
|
"grad_norm": 5.206828594207764, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 0.188, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"eval_accuracy": 0.8227848101265823, |
|
"eval_loss": 0.7189355492591858, |
|
"eval_runtime": 1.4518, |
|
"eval_samples_per_second": 54.415, |
|
"eval_steps_per_second": 2.066, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 52.857142857142854, |
|
"eval_accuracy": 0.7848101265822784, |
|
"eval_loss": 0.6853471398353577, |
|
"eval_runtime": 1.49, |
|
"eval_samples_per_second": 53.02, |
|
"eval_steps_per_second": 2.013, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"eval_accuracy": 0.8227848101265823, |
|
"eval_loss": 0.8039994835853577, |
|
"eval_runtime": 1.4908, |
|
"eval_samples_per_second": 52.991, |
|
"eval_steps_per_second": 2.012, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 54.285714285714285, |
|
"grad_norm": 5.863402843475342, |
|
"learning_rate": 2.037037037037037e-05, |
|
"loss": 0.1623, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 54.857142857142854, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.69575035572052, |
|
"eval_runtime": 1.5387, |
|
"eval_samples_per_second": 51.343, |
|
"eval_steps_per_second": 1.95, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.6906704902648926, |
|
"eval_runtime": 1.576, |
|
"eval_samples_per_second": 50.126, |
|
"eval_steps_per_second": 1.904, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 56.857142857142854, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.6821295619010925, |
|
"eval_runtime": 1.4415, |
|
"eval_samples_per_second": 54.804, |
|
"eval_steps_per_second": 2.081, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 57.142857142857146, |
|
"grad_norm": 4.665853500366211, |
|
"learning_rate": 1.8518518518518518e-05, |
|
"loss": 0.1588, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.6534023880958557, |
|
"eval_runtime": 1.429, |
|
"eval_samples_per_second": 55.283, |
|
"eval_steps_per_second": 2.099, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 58.857142857142854, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.7192457318305969, |
|
"eval_runtime": 1.4185, |
|
"eval_samples_per_second": 55.694, |
|
"eval_steps_per_second": 2.115, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"grad_norm": 6.225094318389893, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.1607, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_accuracy": 0.8227848101265823, |
|
"eval_loss": 0.7752671837806702, |
|
"eval_runtime": 1.4284, |
|
"eval_samples_per_second": 55.308, |
|
"eval_steps_per_second": 2.1, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 60.857142857142854, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.8949642181396484, |
|
"eval_runtime": 1.5505, |
|
"eval_samples_per_second": 50.951, |
|
"eval_steps_per_second": 1.935, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.7903599739074707, |
|
"eval_runtime": 1.5102, |
|
"eval_samples_per_second": 52.311, |
|
"eval_steps_per_second": 1.986, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 62.857142857142854, |
|
"grad_norm": 4.583127498626709, |
|
"learning_rate": 1.4814814814814815e-05, |
|
"loss": 0.1767, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 62.857142857142854, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.6972522735595703, |
|
"eval_runtime": 1.4628, |
|
"eval_samples_per_second": 54.005, |
|
"eval_steps_per_second": 2.051, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"eval_accuracy": 0.7974683544303798, |
|
"eval_loss": 0.6694443225860596, |
|
"eval_runtime": 1.4975, |
|
"eval_samples_per_second": 52.754, |
|
"eval_steps_per_second": 2.003, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 64.85714285714286, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.6338869333267212, |
|
"eval_runtime": 1.4504, |
|
"eval_samples_per_second": 54.468, |
|
"eval_steps_per_second": 2.068, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 65.71428571428571, |
|
"grad_norm": 3.7681446075439453, |
|
"learning_rate": 1.2962962962962962e-05, |
|
"loss": 0.1463, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 66.0, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.6530351042747498, |
|
"eval_runtime": 1.4628, |
|
"eval_samples_per_second": 54.005, |
|
"eval_steps_per_second": 2.051, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 66.85714285714286, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.6141919493675232, |
|
"eval_runtime": 1.5198, |
|
"eval_samples_per_second": 51.98, |
|
"eval_steps_per_second": 1.974, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"eval_accuracy": 0.8227848101265823, |
|
"eval_loss": 0.628998339176178, |
|
"eval_runtime": 1.4603, |
|
"eval_samples_per_second": 54.1, |
|
"eval_steps_per_second": 2.054, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 68.57142857142857, |
|
"grad_norm": 5.3702874183654785, |
|
"learning_rate": 1.1111111111111112e-05, |
|
"loss": 0.1287, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 68.85714285714286, |
|
"eval_accuracy": 0.8354430379746836, |
|
"eval_loss": 0.6333932280540466, |
|
"eval_runtime": 1.437, |
|
"eval_samples_per_second": 54.977, |
|
"eval_steps_per_second": 2.088, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"eval_accuracy": 0.810126582278481, |
|
"eval_loss": 0.8058773279190063, |
|
"eval_runtime": 1.437, |
|
"eval_samples_per_second": 54.974, |
|
"eval_steps_per_second": 2.088, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 70.85714285714286, |
|
"eval_accuracy": 0.8481012658227848, |
|
"eval_loss": 0.7241003513336182, |
|
"eval_runtime": 1.4148, |
|
"eval_samples_per_second": 55.837, |
|
"eval_steps_per_second": 2.12, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 71.42857142857143, |
|
"grad_norm": 4.752800941467285, |
|
"learning_rate": 9.259259259259259e-06, |
|
"loss": 0.1323, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"eval_accuracy": 0.8481012658227848, |
|
"eval_loss": 0.6835869550704956, |
|
"eval_runtime": 1.4344, |
|
"eval_samples_per_second": 55.075, |
|
"eval_steps_per_second": 2.091, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 72.85714285714286, |
|
"eval_accuracy": 0.8227848101265823, |
|
"eval_loss": 0.6587881445884705, |
|
"eval_runtime": 1.559, |
|
"eval_samples_per_second": 50.672, |
|
"eval_steps_per_second": 1.924, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 74.0, |
|
"eval_accuracy": 0.8481012658227848, |
|
"eval_loss": 0.6597732901573181, |
|
"eval_runtime": 1.5278, |
|
"eval_samples_per_second": 51.709, |
|
"eval_steps_per_second": 1.964, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 74.28571428571429, |
|
"grad_norm": 3.2891921997070312, |
|
"learning_rate": 7.4074074074074075e-06, |
|
"loss": 0.1042, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 74.85714285714286, |
|
"eval_accuracy": 0.8354430379746836, |
|
"eval_loss": 0.713896632194519, |
|
"eval_runtime": 1.4654, |
|
"eval_samples_per_second": 53.909, |
|
"eval_steps_per_second": 2.047, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"eval_accuracy": 0.8354430379746836, |
|
"eval_loss": 0.72358638048172, |
|
"eval_runtime": 1.4884, |
|
"eval_samples_per_second": 53.077, |
|
"eval_steps_per_second": 2.016, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 76.85714285714286, |
|
"eval_accuracy": 0.8354430379746836, |
|
"eval_loss": 0.6918818950653076, |
|
"eval_runtime": 1.4316, |
|
"eval_samples_per_second": 55.184, |
|
"eval_steps_per_second": 2.096, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 77.14285714285714, |
|
"grad_norm": 4.013108730316162, |
|
"learning_rate": 5.555555555555556e-06, |
|
"loss": 0.1106, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 78.0, |
|
"eval_accuracy": 0.8354430379746836, |
|
"eval_loss": 0.6568043828010559, |
|
"eval_runtime": 1.5155, |
|
"eval_samples_per_second": 52.128, |
|
"eval_steps_per_second": 1.98, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 78.85714285714286, |
|
"eval_accuracy": 0.8481012658227848, |
|
"eval_loss": 0.6556110382080078, |
|
"eval_runtime": 1.5408, |
|
"eval_samples_per_second": 51.272, |
|
"eval_steps_per_second": 1.947, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"grad_norm": 6.208752632141113, |
|
"learning_rate": 3.7037037037037037e-06, |
|
"loss": 0.1348, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_accuracy": 0.8354430379746836, |
|
"eval_loss": 0.6612224578857422, |
|
"eval_runtime": 1.4365, |
|
"eval_samples_per_second": 54.993, |
|
"eval_steps_per_second": 2.088, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 80.85714285714286, |
|
"eval_accuracy": 0.8227848101265823, |
|
"eval_loss": 0.6686135530471802, |
|
"eval_runtime": 1.4579, |
|
"eval_samples_per_second": 54.186, |
|
"eval_steps_per_second": 2.058, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 82.0, |
|
"eval_accuracy": 0.8481012658227848, |
|
"eval_loss": 0.6705390214920044, |
|
"eval_runtime": 1.4513, |
|
"eval_samples_per_second": 54.434, |
|
"eval_steps_per_second": 2.067, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 82.85714285714286, |
|
"grad_norm": 4.1432647705078125, |
|
"learning_rate": 1.8518518518518519e-06, |
|
"loss": 0.1352, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 82.85714285714286, |
|
"eval_accuracy": 0.8354430379746836, |
|
"eval_loss": 0.677626371383667, |
|
"eval_runtime": 1.4762, |
|
"eval_samples_per_second": 53.516, |
|
"eval_steps_per_second": 2.032, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"eval_accuracy": 0.8354430379746836, |
|
"eval_loss": 0.6872657537460327, |
|
"eval_runtime": 1.5716, |
|
"eval_samples_per_second": 50.268, |
|
"eval_steps_per_second": 1.909, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 84.85714285714286, |
|
"eval_accuracy": 0.8354430379746836, |
|
"eval_loss": 0.6887751817703247, |
|
"eval_runtime": 1.5031, |
|
"eval_samples_per_second": 52.557, |
|
"eval_steps_per_second": 1.996, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 85.71428571428571, |
|
"grad_norm": 3.4207639694213867, |
|
"learning_rate": 0.0, |
|
"loss": 0.1226, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 85.71428571428571, |
|
"eval_accuracy": 0.8354430379746836, |
|
"eval_loss": 0.688024640083313, |
|
"eval_runtime": 1.4114, |
|
"eval_samples_per_second": 55.972, |
|
"eval_steps_per_second": 2.126, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 85.71428571428571, |
|
"step": 300, |
|
"total_flos": 2.9349165326823014e+18, |
|
"train_loss": 0.2789517060915629, |
|
"train_runtime": 2373.8808, |
|
"train_samples_per_second": 18.619, |
|
"train_steps_per_second": 0.126 |
|
}, |
|
{ |
|
"epoch": 85.71428571428571, |
|
"eval_accuracy": 0.8481012658227848, |
|
"eval_loss": 0.7241003513336182, |
|
"eval_runtime": 1.4361, |
|
"eval_samples_per_second": 55.011, |
|
"eval_steps_per_second": 2.089, |
|
"step": 300 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 300, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 500, |
|
"total_flos": 2.9349165326823014e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|