adapters-llama2-bnb4-QLORA-super_glue-axg
/
trainer_state-llama2-bnb4-QLORA-super_glue-axg-sequence_classification.json
{ | |
"best_metric": null, | |
"best_model_checkpoint": null, | |
"epoch": 10.0, | |
"eval_steps": 1, | |
"global_step": 90, | |
"is_hyper_param_search": false, | |
"is_local_process_zero": true, | |
"is_world_process_zero": true, | |
"log_history": [ | |
{ | |
"epoch": 0.1111111111111111, | |
"grad_norm": 71.10752868652344, | |
"learning_rate": 2.5e-05, | |
"loss": 1.4086, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.1111111111111111, | |
"eval_accuracy": 0.5555555555555556, | |
"eval_loss": 1.0401158332824707, | |
"eval_runtime": 1.5415, | |
"eval_samples_per_second": 46.709, | |
"eval_steps_per_second": 3.244, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.2222222222222222, | |
"grad_norm": 70.05866241455078, | |
"learning_rate": 5e-05, | |
"loss": 1.2969, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.2222222222222222, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.957859456539154, | |
"eval_runtime": 1.6886, | |
"eval_samples_per_second": 42.639, | |
"eval_steps_per_second": 2.961, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.3333333333333333, | |
"grad_norm": 51.52368927001953, | |
"learning_rate": 4.943181818181818e-05, | |
"loss": 1.0192, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.3333333333333333, | |
"eval_accuracy": 0.4166666666666667, | |
"eval_loss": 1.2530348300933838, | |
"eval_runtime": 1.7908, | |
"eval_samples_per_second": 40.206, | |
"eval_steps_per_second": 2.792, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.4444444444444444, | |
"grad_norm": 28.69840431213379, | |
"learning_rate": 4.886363636363637e-05, | |
"loss": 0.8252, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.4444444444444444, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 1.3205498456954956, | |
"eval_runtime": 1.7073, | |
"eval_samples_per_second": 42.172, | |
"eval_steps_per_second": 2.929, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.5555555555555556, | |
"grad_norm": 26.98361587524414, | |
"learning_rate": 4.829545454545455e-05, | |
"loss": 0.8993, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.5555555555555556, | |
"eval_accuracy": 0.4166666666666667, | |
"eval_loss": 1.1885122060775757, | |
"eval_runtime": 1.7972, | |
"eval_samples_per_second": 40.062, | |
"eval_steps_per_second": 2.782, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.6666666666666666, | |
"grad_norm": 32.78493881225586, | |
"learning_rate": 4.772727272727273e-05, | |
"loss": 0.8969, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.6666666666666666, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 1.0518155097961426, | |
"eval_runtime": 1.7895, | |
"eval_samples_per_second": 40.235, | |
"eval_steps_per_second": 2.794, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.7777777777777778, | |
"grad_norm": 35.246402740478516, | |
"learning_rate": 4.715909090909091e-05, | |
"loss": 0.7789, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.7777777777777778, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.95684814453125, | |
"eval_runtime": 1.782, | |
"eval_samples_per_second": 40.405, | |
"eval_steps_per_second": 2.806, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.8888888888888888, | |
"grad_norm": 10.608202934265137, | |
"learning_rate": 4.659090909090909e-05, | |
"loss": 0.8172, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.8888888888888888, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.9124009609222412, | |
"eval_runtime": 1.7, | |
"eval_samples_per_second": 42.354, | |
"eval_steps_per_second": 2.941, | |
"step": 8 | |
}, | |
{ | |
"epoch": 1.0, | |
"grad_norm": 20.640520095825195, | |
"learning_rate": 4.602272727272727e-05, | |
"loss": 0.9466, | |
"step": 9 | |
}, | |
{ | |
"epoch": 1.0, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.8895958662033081, | |
"eval_runtime": 1.6914, | |
"eval_samples_per_second": 42.569, | |
"eval_steps_per_second": 2.956, | |
"step": 9 | |
}, | |
{ | |
"epoch": 1.1111111111111112, | |
"grad_norm": 16.80632972717285, | |
"learning_rate": 4.545454545454546e-05, | |
"loss": 0.7398, | |
"step": 10 | |
}, | |
{ | |
"epoch": 1.1111111111111112, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.8696831464767456, | |
"eval_runtime": 1.7281, | |
"eval_samples_per_second": 41.664, | |
"eval_steps_per_second": 2.893, | |
"step": 10 | |
}, | |
{ | |
"epoch": 1.2222222222222223, | |
"grad_norm": 33.25798797607422, | |
"learning_rate": 4.488636363636364e-05, | |
"loss": 0.8393, | |
"step": 11 | |
}, | |
{ | |
"epoch": 1.2222222222222223, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.8458387851715088, | |
"eval_runtime": 1.6403, | |
"eval_samples_per_second": 43.894, | |
"eval_steps_per_second": 3.048, | |
"step": 11 | |
}, | |
{ | |
"epoch": 1.3333333333333333, | |
"grad_norm": 18.833730697631836, | |
"learning_rate": 4.431818181818182e-05, | |
"loss": 0.627, | |
"step": 12 | |
}, | |
{ | |
"epoch": 1.3333333333333333, | |
"eval_accuracy": 0.4305555555555556, | |
"eval_loss": 0.8363681435585022, | |
"eval_runtime": 1.7646, | |
"eval_samples_per_second": 40.803, | |
"eval_steps_per_second": 2.834, | |
"step": 12 | |
}, | |
{ | |
"epoch": 1.4444444444444444, | |
"grad_norm": 11.44498062133789, | |
"learning_rate": 4.375e-05, | |
"loss": 0.5691, | |
"step": 13 | |
}, | |
{ | |
"epoch": 1.4444444444444444, | |
"eval_accuracy": 0.4166666666666667, | |
"eval_loss": 0.8569539189338684, | |
"eval_runtime": 1.6972, | |
"eval_samples_per_second": 42.423, | |
"eval_steps_per_second": 2.946, | |
"step": 13 | |
}, | |
{ | |
"epoch": 1.5555555555555556, | |
"grad_norm": 6.994156360626221, | |
"learning_rate": 4.318181818181819e-05, | |
"loss": 0.6796, | |
"step": 14 | |
}, | |
{ | |
"epoch": 1.5555555555555556, | |
"eval_accuracy": 0.4444444444444444, | |
"eval_loss": 0.9024861454963684, | |
"eval_runtime": 1.6366, | |
"eval_samples_per_second": 43.995, | |
"eval_steps_per_second": 3.055, | |
"step": 14 | |
}, | |
{ | |
"epoch": 1.6666666666666665, | |
"grad_norm": 23.839618682861328, | |
"learning_rate": 4.261363636363637e-05, | |
"loss": 0.6225, | |
"step": 15 | |
}, | |
{ | |
"epoch": 1.6666666666666665, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.9257575273513794, | |
"eval_runtime": 1.6451, | |
"eval_samples_per_second": 43.767, | |
"eval_steps_per_second": 3.039, | |
"step": 15 | |
}, | |
{ | |
"epoch": 1.7777777777777777, | |
"grad_norm": 23.61461639404297, | |
"learning_rate": 4.204545454545455e-05, | |
"loss": 0.6299, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.7777777777777777, | |
"eval_accuracy": 0.4583333333333333, | |
"eval_loss": 0.9147135615348816, | |
"eval_runtime": 1.7411, | |
"eval_samples_per_second": 41.353, | |
"eval_steps_per_second": 2.872, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.8888888888888888, | |
"grad_norm": 15.60508918762207, | |
"learning_rate": 4.1477272727272734e-05, | |
"loss": 0.5009, | |
"step": 17 | |
}, | |
{ | |
"epoch": 1.8888888888888888, | |
"eval_accuracy": 0.4583333333333333, | |
"eval_loss": 0.8712971806526184, | |
"eval_runtime": 1.688, | |
"eval_samples_per_second": 42.653, | |
"eval_steps_per_second": 2.962, | |
"step": 17 | |
}, | |
{ | |
"epoch": 2.0, | |
"grad_norm": 14.364876747131348, | |
"learning_rate": 4.0909090909090915e-05, | |
"loss": 0.4875, | |
"step": 18 | |
}, | |
{ | |
"epoch": 2.0, | |
"eval_accuracy": 0.4305555555555556, | |
"eval_loss": 0.8122490644454956, | |
"eval_runtime": 1.6995, | |
"eval_samples_per_second": 42.366, | |
"eval_steps_per_second": 2.942, | |
"step": 18 | |
}, | |
{ | |
"epoch": 2.111111111111111, | |
"grad_norm": 9.64327621459961, | |
"learning_rate": 4.034090909090909e-05, | |
"loss": 0.4547, | |
"step": 19 | |
}, | |
{ | |
"epoch": 2.111111111111111, | |
"eval_accuracy": 0.4444444444444444, | |
"eval_loss": 0.7631157636642456, | |
"eval_runtime": 1.7098, | |
"eval_samples_per_second": 42.11, | |
"eval_steps_per_second": 2.924, | |
"step": 19 | |
}, | |
{ | |
"epoch": 2.2222222222222223, | |
"grad_norm": 8.928176879882812, | |
"learning_rate": 3.9772727272727275e-05, | |
"loss": 0.3933, | |
"step": 20 | |
}, | |
{ | |
"epoch": 2.2222222222222223, | |
"eval_accuracy": 0.5138888888888888, | |
"eval_loss": 0.7445746660232544, | |
"eval_runtime": 1.7154, | |
"eval_samples_per_second": 41.973, | |
"eval_steps_per_second": 2.915, | |
"step": 20 | |
}, | |
{ | |
"epoch": 2.3333333333333335, | |
"grad_norm": 12.450410842895508, | |
"learning_rate": 3.9204545454545456e-05, | |
"loss": 0.3994, | |
"step": 21 | |
}, | |
{ | |
"epoch": 2.3333333333333335, | |
"eval_accuracy": 0.5416666666666666, | |
"eval_loss": 0.7361382246017456, | |
"eval_runtime": 1.6893, | |
"eval_samples_per_second": 42.621, | |
"eval_steps_per_second": 2.96, | |
"step": 21 | |
}, | |
{ | |
"epoch": 2.4444444444444446, | |
"grad_norm": 9.766682624816895, | |
"learning_rate": 3.8636363636363636e-05, | |
"loss": 0.4413, | |
"step": 22 | |
}, | |
{ | |
"epoch": 2.4444444444444446, | |
"eval_accuracy": 0.5972222222222222, | |
"eval_loss": 0.7318115234375, | |
"eval_runtime": 1.6931, | |
"eval_samples_per_second": 42.526, | |
"eval_steps_per_second": 2.953, | |
"step": 22 | |
}, | |
{ | |
"epoch": 2.5555555555555554, | |
"grad_norm": 9.650227546691895, | |
"learning_rate": 3.8068181818181816e-05, | |
"loss": 0.4032, | |
"step": 23 | |
}, | |
{ | |
"epoch": 2.5555555555555554, | |
"eval_accuracy": 0.5972222222222222, | |
"eval_loss": 0.7219780683517456, | |
"eval_runtime": 1.6928, | |
"eval_samples_per_second": 42.532, | |
"eval_steps_per_second": 2.954, | |
"step": 23 | |
}, | |
{ | |
"epoch": 2.6666666666666665, | |
"grad_norm": 8.477367401123047, | |
"learning_rate": 3.7500000000000003e-05, | |
"loss": 0.4308, | |
"step": 24 | |
}, | |
{ | |
"epoch": 2.6666666666666665, | |
"eval_accuracy": 0.6527777777777778, | |
"eval_loss": 0.7055935263633728, | |
"eval_runtime": 1.7353, | |
"eval_samples_per_second": 41.492, | |
"eval_steps_per_second": 2.881, | |
"step": 24 | |
}, | |
{ | |
"epoch": 2.7777777777777777, | |
"grad_norm": 8.381991386413574, | |
"learning_rate": 3.6931818181818184e-05, | |
"loss": 0.3827, | |
"step": 25 | |
}, | |
{ | |
"epoch": 2.7777777777777777, | |
"eval_accuracy": 0.6666666666666666, | |
"eval_loss": 0.6947495937347412, | |
"eval_runtime": 1.69, | |
"eval_samples_per_second": 42.604, | |
"eval_steps_per_second": 2.959, | |
"step": 25 | |
}, | |
{ | |
"epoch": 2.888888888888889, | |
"grad_norm": 18.475757598876953, | |
"learning_rate": 3.6363636363636364e-05, | |
"loss": 0.3719, | |
"step": 26 | |
}, | |
{ | |
"epoch": 2.888888888888889, | |
"eval_accuracy": 0.6388888888888888, | |
"eval_loss": 0.695068359375, | |
"eval_runtime": 1.7654, | |
"eval_samples_per_second": 40.785, | |
"eval_steps_per_second": 2.832, | |
"step": 26 | |
}, | |
{ | |
"epoch": 3.0, | |
"grad_norm": 4.149219036102295, | |
"learning_rate": 3.579545454545455e-05, | |
"loss": 0.2756, | |
"step": 27 | |
}, | |
{ | |
"epoch": 3.0, | |
"eval_accuracy": 0.625, | |
"eval_loss": 0.7118733525276184, | |
"eval_runtime": 1.7076, | |
"eval_samples_per_second": 42.165, | |
"eval_steps_per_second": 2.928, | |
"step": 27 | |
}, | |
{ | |
"epoch": 3.111111111111111, | |
"grad_norm": 6.201833248138428, | |
"learning_rate": 3.522727272727273e-05, | |
"loss": 0.3048, | |
"step": 28 | |
}, | |
{ | |
"epoch": 3.111111111111111, | |
"eval_accuracy": 0.6388888888888888, | |
"eval_loss": 0.7207743525505066, | |
"eval_runtime": 1.6906, | |
"eval_samples_per_second": 42.59, | |
"eval_steps_per_second": 2.958, | |
"step": 28 | |
}, | |
{ | |
"epoch": 3.2222222222222223, | |
"grad_norm": 10.069761276245117, | |
"learning_rate": 3.465909090909091e-05, | |
"loss": 0.2833, | |
"step": 29 | |
}, | |
{ | |
"epoch": 3.2222222222222223, | |
"eval_accuracy": 0.6111111111111112, | |
"eval_loss": 0.758392333984375, | |
"eval_runtime": 1.689, | |
"eval_samples_per_second": 42.629, | |
"eval_steps_per_second": 2.96, | |
"step": 29 | |
}, | |
{ | |
"epoch": 3.3333333333333335, | |
"grad_norm": 14.6091947555542, | |
"learning_rate": 3.409090909090909e-05, | |
"loss": 0.2192, | |
"step": 30 | |
}, | |
{ | |
"epoch": 3.3333333333333335, | |
"eval_accuracy": 0.625, | |
"eval_loss": 0.7625613808631897, | |
"eval_runtime": 1.6873, | |
"eval_samples_per_second": 42.671, | |
"eval_steps_per_second": 2.963, | |
"step": 30 | |
}, | |
{ | |
"epoch": 3.4444444444444446, | |
"grad_norm": 12.838824272155762, | |
"learning_rate": 3.352272727272727e-05, | |
"loss": 0.159, | |
"step": 31 | |
}, | |
{ | |
"epoch": 3.4444444444444446, | |
"eval_accuracy": 0.6666666666666666, | |
"eval_loss": 0.7382490634918213, | |
"eval_runtime": 1.689, | |
"eval_samples_per_second": 42.629, | |
"eval_steps_per_second": 2.96, | |
"step": 31 | |
}, | |
{ | |
"epoch": 3.5555555555555554, | |
"grad_norm": 5.576656341552734, | |
"learning_rate": 3.295454545454545e-05, | |
"loss": 0.1545, | |
"step": 32 | |
}, | |
{ | |
"epoch": 3.5555555555555554, | |
"eval_accuracy": 0.6805555555555556, | |
"eval_loss": 0.7160136103630066, | |
"eval_runtime": 1.688, | |
"eval_samples_per_second": 42.655, | |
"eval_steps_per_second": 2.962, | |
"step": 32 | |
}, | |
{ | |
"epoch": 3.6666666666666665, | |
"grad_norm": 3.911334753036499, | |
"learning_rate": 3.238636363636364e-05, | |
"loss": 0.2048, | |
"step": 33 | |
}, | |
{ | |
"epoch": 3.6666666666666665, | |
"eval_accuracy": 0.6944444444444444, | |
"eval_loss": 0.7091013789176941, | |
"eval_runtime": 1.6904, | |
"eval_samples_per_second": 42.594, | |
"eval_steps_per_second": 2.958, | |
"step": 33 | |
}, | |
{ | |
"epoch": 3.7777777777777777, | |
"grad_norm": 4.566244125366211, | |
"learning_rate": 3.181818181818182e-05, | |
"loss": 0.1285, | |
"step": 34 | |
}, | |
{ | |
"epoch": 3.7777777777777777, | |
"eval_accuracy": 0.7361111111111112, | |
"eval_loss": 0.7089945673942566, | |
"eval_runtime": 1.6878, | |
"eval_samples_per_second": 42.659, | |
"eval_steps_per_second": 2.962, | |
"step": 34 | |
}, | |
{ | |
"epoch": 3.888888888888889, | |
"grad_norm": 4.949621677398682, | |
"learning_rate": 3.125e-05, | |
"loss": 0.0972, | |
"step": 35 | |
}, | |
{ | |
"epoch": 3.888888888888889, | |
"eval_accuracy": 0.7361111111111112, | |
"eval_loss": 0.7177971601486206, | |
"eval_runtime": 1.7101, | |
"eval_samples_per_second": 42.104, | |
"eval_steps_per_second": 2.924, | |
"step": 35 | |
}, | |
{ | |
"epoch": 4.0, | |
"grad_norm": 5.815789699554443, | |
"learning_rate": 3.068181818181818e-05, | |
"loss": 0.1396, | |
"step": 36 | |
}, | |
{ | |
"epoch": 4.0, | |
"eval_accuracy": 0.7361111111111112, | |
"eval_loss": 0.7336485385894775, | |
"eval_runtime": 1.704, | |
"eval_samples_per_second": 42.252, | |
"eval_steps_per_second": 2.934, | |
"step": 36 | |
}, | |
{ | |
"epoch": 4.111111111111111, | |
"grad_norm": 5.0971174240112305, | |
"learning_rate": 3.0113636363636365e-05, | |
"loss": 0.1058, | |
"step": 37 | |
}, | |
{ | |
"epoch": 4.111111111111111, | |
"eval_accuracy": 0.6944444444444444, | |
"eval_loss": 0.7677508592605591, | |
"eval_runtime": 1.6634, | |
"eval_samples_per_second": 43.285, | |
"eval_steps_per_second": 3.006, | |
"step": 37 | |
}, | |
{ | |
"epoch": 4.222222222222222, | |
"grad_norm": 3.026982545852661, | |
"learning_rate": 2.954545454545455e-05, | |
"loss": 0.1036, | |
"step": 38 | |
}, | |
{ | |
"epoch": 4.222222222222222, | |
"eval_accuracy": 0.6666666666666666, | |
"eval_loss": 0.81514573097229, | |
"eval_runtime": 1.7527, | |
"eval_samples_per_second": 41.08, | |
"eval_steps_per_second": 2.853, | |
"step": 38 | |
}, | |
{ | |
"epoch": 4.333333333333333, | |
"grad_norm": 3.815314531326294, | |
"learning_rate": 2.8977272727272732e-05, | |
"loss": 0.0651, | |
"step": 39 | |
}, | |
{ | |
"epoch": 4.333333333333333, | |
"eval_accuracy": 0.6944444444444444, | |
"eval_loss": 0.8406186699867249, | |
"eval_runtime": 1.7692, | |
"eval_samples_per_second": 40.696, | |
"eval_steps_per_second": 2.826, | |
"step": 39 | |
}, | |
{ | |
"epoch": 4.444444444444445, | |
"grad_norm": 2.897125244140625, | |
"learning_rate": 2.8409090909090912e-05, | |
"loss": 0.0643, | |
"step": 40 | |
}, | |
{ | |
"epoch": 4.444444444444445, | |
"eval_accuracy": 0.6944444444444444, | |
"eval_loss": 0.8639495372772217, | |
"eval_runtime": 1.7505, | |
"eval_samples_per_second": 41.132, | |
"eval_steps_per_second": 2.856, | |
"step": 40 | |
}, | |
{ | |
"epoch": 4.555555555555555, | |
"grad_norm": 8.826624870300293, | |
"learning_rate": 2.784090909090909e-05, | |
"loss": 0.0708, | |
"step": 41 | |
}, | |
{ | |
"epoch": 4.555555555555555, | |
"eval_accuracy": 0.7083333333333334, | |
"eval_loss": 0.8449772000312805, | |
"eval_runtime": 1.7389, | |
"eval_samples_per_second": 41.404, | |
"eval_steps_per_second": 2.875, | |
"step": 41 | |
}, | |
{ | |
"epoch": 4.666666666666667, | |
"grad_norm": 4.310912132263184, | |
"learning_rate": 2.7272727272727273e-05, | |
"loss": 0.0571, | |
"step": 42 | |
}, | |
{ | |
"epoch": 4.666666666666667, | |
"eval_accuracy": 0.7222222222222222, | |
"eval_loss": 0.8496870398521423, | |
"eval_runtime": 1.6953, | |
"eval_samples_per_second": 42.471, | |
"eval_steps_per_second": 2.949, | |
"step": 42 | |
}, | |
{ | |
"epoch": 4.777777777777778, | |
"grad_norm": 0.9157137870788574, | |
"learning_rate": 2.6704545454545453e-05, | |
"loss": 0.0101, | |
"step": 43 | |
}, | |
{ | |
"epoch": 4.777777777777778, | |
"eval_accuracy": 0.75, | |
"eval_loss": 0.8314664363861084, | |
"eval_runtime": 1.696, | |
"eval_samples_per_second": 42.452, | |
"eval_steps_per_second": 2.948, | |
"step": 43 | |
}, | |
{ | |
"epoch": 4.888888888888889, | |
"grad_norm": 4.54341983795166, | |
"learning_rate": 2.6136363636363637e-05, | |
"loss": 0.0345, | |
"step": 44 | |
}, | |
{ | |
"epoch": 4.888888888888889, | |
"eval_accuracy": 0.75, | |
"eval_loss": 0.8536232709884644, | |
"eval_runtime": 1.7637, | |
"eval_samples_per_second": 40.822, | |
"eval_steps_per_second": 2.835, | |
"step": 44 | |
}, | |
{ | |
"epoch": 5.0, | |
"grad_norm": 2.6652591228485107, | |
"learning_rate": 2.5568181818181817e-05, | |
"loss": 0.0194, | |
"step": 45 | |
}, | |
{ | |
"epoch": 5.0, | |
"eval_accuracy": 0.75, | |
"eval_loss": 0.8400527834892273, | |
"eval_runtime": 1.7745, | |
"eval_samples_per_second": 40.575, | |
"eval_steps_per_second": 2.818, | |
"step": 45 | |
}, | |
{ | |
"epoch": 5.111111111111111, | |
"grad_norm": 0.518764078617096, | |
"learning_rate": 2.5e-05, | |
"loss": 0.005, | |
"step": 46 | |
}, | |
{ | |
"epoch": 5.111111111111111, | |
"eval_accuracy": 0.75, | |
"eval_loss": 0.8562726378440857, | |
"eval_runtime": 1.7488, | |
"eval_samples_per_second": 41.171, | |
"eval_steps_per_second": 2.859, | |
"step": 46 | |
}, | |
{ | |
"epoch": 5.222222222222222, | |
"grad_norm": 0.5080448389053345, | |
"learning_rate": 2.4431818181818185e-05, | |
"loss": 0.0092, | |
"step": 47 | |
}, | |
{ | |
"epoch": 5.222222222222222, | |
"eval_accuracy": 0.75, | |
"eval_loss": 0.8858435153961182, | |
"eval_runtime": 1.7362, | |
"eval_samples_per_second": 41.469, | |
"eval_steps_per_second": 2.88, | |
"step": 47 | |
}, | |
{ | |
"epoch": 5.333333333333333, | |
"grad_norm": 1.4881385564804077, | |
"learning_rate": 2.3863636363636365e-05, | |
"loss": 0.0117, | |
"step": 48 | |
}, | |
{ | |
"epoch": 5.333333333333333, | |
"eval_accuracy": 0.7638888888888888, | |
"eval_loss": 0.9373347163200378, | |
"eval_runtime": 1.7561, | |
"eval_samples_per_second": 40.999, | |
"eval_steps_per_second": 2.847, | |
"step": 48 | |
}, | |
{ | |
"epoch": 5.444444444444445, | |
"grad_norm": 0.4491148293018341, | |
"learning_rate": 2.3295454545454546e-05, | |
"loss": 0.006, | |
"step": 49 | |
}, | |
{ | |
"epoch": 5.444444444444445, | |
"eval_accuracy": 0.7638888888888888, | |
"eval_loss": 0.9894835352897644, | |
"eval_runtime": 1.7035, | |
"eval_samples_per_second": 42.267, | |
"eval_steps_per_second": 2.935, | |
"step": 49 | |
}, | |
{ | |
"epoch": 5.555555555555555, | |
"grad_norm": 0.5586861371994019, | |
"learning_rate": 2.272727272727273e-05, | |
"loss": 0.0063, | |
"step": 50 | |
}, | |
{ | |
"epoch": 5.555555555555555, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.0309796333312988, | |
"eval_runtime": 1.7142, | |
"eval_samples_per_second": 42.001, | |
"eval_steps_per_second": 2.917, | |
"step": 50 | |
}, | |
{ | |
"epoch": 5.666666666666667, | |
"grad_norm": 1.2758698463439941, | |
"learning_rate": 2.215909090909091e-05, | |
"loss": 0.0057, | |
"step": 51 | |
}, | |
{ | |
"epoch": 5.666666666666667, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.0968831777572632, | |
"eval_runtime": 1.6931, | |
"eval_samples_per_second": 42.526, | |
"eval_steps_per_second": 2.953, | |
"step": 51 | |
}, | |
{ | |
"epoch": 5.777777777777778, | |
"grad_norm": 0.4022962152957916, | |
"learning_rate": 2.1590909090909093e-05, | |
"loss": 0.0024, | |
"step": 52 | |
}, | |
{ | |
"epoch": 5.777777777777778, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.1657804250717163, | |
"eval_runtime": 1.7525, | |
"eval_samples_per_second": 41.083, | |
"eval_steps_per_second": 2.853, | |
"step": 52 | |
}, | |
{ | |
"epoch": 5.888888888888889, | |
"grad_norm": 8.325399398803711, | |
"learning_rate": 2.1022727272727274e-05, | |
"loss": 0.0283, | |
"step": 53 | |
}, | |
{ | |
"epoch": 5.888888888888889, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.1517949104309082, | |
"eval_runtime": 1.7207, | |
"eval_samples_per_second": 41.843, | |
"eval_steps_per_second": 2.906, | |
"step": 53 | |
}, | |
{ | |
"epoch": 6.0, | |
"grad_norm": 0.6889083981513977, | |
"learning_rate": 2.0454545454545457e-05, | |
"loss": 0.0031, | |
"step": 54 | |
}, | |
{ | |
"epoch": 6.0, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.131449580192566, | |
"eval_runtime": 1.7267, | |
"eval_samples_per_second": 41.698, | |
"eval_steps_per_second": 2.896, | |
"step": 54 | |
}, | |
{ | |
"epoch": 6.111111111111111, | |
"grad_norm": 0.12238375097513199, | |
"learning_rate": 1.9886363636363638e-05, | |
"loss": 0.0006, | |
"step": 55 | |
}, | |
{ | |
"epoch": 6.111111111111111, | |
"eval_accuracy": 0.8055555555555556, | |
"eval_loss": 1.120394229888916, | |
"eval_runtime": 1.7518, | |
"eval_samples_per_second": 41.1, | |
"eval_steps_per_second": 2.854, | |
"step": 55 | |
}, | |
{ | |
"epoch": 6.222222222222222, | |
"grad_norm": 0.04647090286016464, | |
"learning_rate": 1.9318181818181818e-05, | |
"loss": 0.0004, | |
"step": 56 | |
}, | |
{ | |
"epoch": 6.222222222222222, | |
"eval_accuracy": 0.8333333333333334, | |
"eval_loss": 1.1473143100738525, | |
"eval_runtime": 1.6957, | |
"eval_samples_per_second": 42.461, | |
"eval_steps_per_second": 2.949, | |
"step": 56 | |
}, | |
{ | |
"epoch": 6.333333333333333, | |
"grad_norm": 0.07002800703048706, | |
"learning_rate": 1.8750000000000002e-05, | |
"loss": 0.0004, | |
"step": 57 | |
}, | |
{ | |
"epoch": 6.333333333333333, | |
"eval_accuracy": 0.8333333333333334, | |
"eval_loss": 1.1827261447906494, | |
"eval_runtime": 1.6914, | |
"eval_samples_per_second": 42.567, | |
"eval_steps_per_second": 2.956, | |
"step": 57 | |
}, | |
{ | |
"epoch": 6.444444444444445, | |
"grad_norm": 0.057534411549568176, | |
"learning_rate": 1.8181818181818182e-05, | |
"loss": 0.0002, | |
"step": 58 | |
}, | |
{ | |
"epoch": 6.444444444444445, | |
"eval_accuracy": 0.8333333333333334, | |
"eval_loss": 1.224190592765808, | |
"eval_runtime": 1.6907, | |
"eval_samples_per_second": 42.587, | |
"eval_steps_per_second": 2.957, | |
"step": 58 | |
}, | |
{ | |
"epoch": 6.555555555555555, | |
"grad_norm": 0.1322898268699646, | |
"learning_rate": 1.7613636363636366e-05, | |
"loss": 0.0006, | |
"step": 59 | |
}, | |
{ | |
"epoch": 6.555555555555555, | |
"eval_accuracy": 0.8333333333333334, | |
"eval_loss": 1.2592931985855103, | |
"eval_runtime": 1.6878, | |
"eval_samples_per_second": 42.659, | |
"eval_steps_per_second": 2.962, | |
"step": 59 | |
}, | |
{ | |
"epoch": 6.666666666666667, | |
"grad_norm": 0.014222492463886738, | |
"learning_rate": 1.7045454545454546e-05, | |
"loss": 0.0001, | |
"step": 60 | |
}, | |
{ | |
"epoch": 6.666666666666667, | |
"eval_accuracy": 0.8333333333333334, | |
"eval_loss": 1.281473159790039, | |
"eval_runtime": 1.6928, | |
"eval_samples_per_second": 42.532, | |
"eval_steps_per_second": 2.954, | |
"step": 60 | |
}, | |
{ | |
"epoch": 6.777777777777778, | |
"grad_norm": 3.633732795715332, | |
"learning_rate": 1.6477272727272726e-05, | |
"loss": 0.0144, | |
"step": 61 | |
}, | |
{ | |
"epoch": 6.777777777777778, | |
"eval_accuracy": 0.8333333333333334, | |
"eval_loss": 1.2948687076568604, | |
"eval_runtime": 1.6886, | |
"eval_samples_per_second": 42.64, | |
"eval_steps_per_second": 2.961, | |
"step": 61 | |
}, | |
{ | |
"epoch": 6.888888888888889, | |
"grad_norm": 0.6753994226455688, | |
"learning_rate": 1.590909090909091e-05, | |
"loss": 0.0021, | |
"step": 62 | |
}, | |
{ | |
"epoch": 6.888888888888889, | |
"eval_accuracy": 0.8333333333333334, | |
"eval_loss": 1.303892731666565, | |
"eval_runtime": 1.691, | |
"eval_samples_per_second": 42.579, | |
"eval_steps_per_second": 2.957, | |
"step": 62 | |
}, | |
{ | |
"epoch": 7.0, | |
"grad_norm": 0.047468412667512894, | |
"learning_rate": 1.534090909090909e-05, | |
"loss": 0.0002, | |
"step": 63 | |
}, | |
{ | |
"epoch": 7.0, | |
"eval_accuracy": 0.8333333333333334, | |
"eval_loss": 1.3037166595458984, | |
"eval_runtime": 1.6895, | |
"eval_samples_per_second": 42.615, | |
"eval_steps_per_second": 2.959, | |
"step": 63 | |
}, | |
{ | |
"epoch": 7.111111111111111, | |
"grad_norm": 0.04554079473018646, | |
"learning_rate": 1.4772727272727274e-05, | |
"loss": 0.0001, | |
"step": 64 | |
}, | |
{ | |
"epoch": 7.111111111111111, | |
"eval_accuracy": 0.8194444444444444, | |
"eval_loss": 1.3078640699386597, | |
"eval_runtime": 1.6898, | |
"eval_samples_per_second": 42.608, | |
"eval_steps_per_second": 2.959, | |
"step": 64 | |
}, | |
{ | |
"epoch": 7.222222222222222, | |
"grad_norm": 0.010344511829316616, | |
"learning_rate": 1.4204545454545456e-05, | |
"loss": 0.0001, | |
"step": 65 | |
}, | |
{ | |
"epoch": 7.222222222222222, | |
"eval_accuracy": 0.8055555555555556, | |
"eval_loss": 1.3234918117523193, | |
"eval_runtime": 1.6967, | |
"eval_samples_per_second": 42.434, | |
"eval_steps_per_second": 2.947, | |
"step": 65 | |
}, | |
{ | |
"epoch": 7.333333333333333, | |
"grad_norm": 0.006933713797479868, | |
"learning_rate": 1.3636363636363637e-05, | |
"loss": 0.0001, | |
"step": 66 | |
}, | |
{ | |
"epoch": 7.333333333333333, | |
"eval_accuracy": 0.8055555555555556, | |
"eval_loss": 1.3226008415222168, | |
"eval_runtime": 1.6947, | |
"eval_samples_per_second": 42.485, | |
"eval_steps_per_second": 2.95, | |
"step": 66 | |
}, | |
{ | |
"epoch": 7.444444444444445, | |
"grad_norm": 0.004614418838173151, | |
"learning_rate": 1.3068181818181819e-05, | |
"loss": 0.0, | |
"step": 67 | |
}, | |
{ | |
"epoch": 7.444444444444445, | |
"eval_accuracy": 0.8055555555555556, | |
"eval_loss": 1.3420900106430054, | |
"eval_runtime": 1.6925, | |
"eval_samples_per_second": 42.54, | |
"eval_steps_per_second": 2.954, | |
"step": 67 | |
}, | |
{ | |
"epoch": 7.555555555555555, | |
"grad_norm": 0.011573988012969494, | |
"learning_rate": 1.25e-05, | |
"loss": 0.0001, | |
"step": 68 | |
}, | |
{ | |
"epoch": 7.555555555555555, | |
"eval_accuracy": 0.8055555555555556, | |
"eval_loss": 1.3453483581542969, | |
"eval_runtime": 1.6925, | |
"eval_samples_per_second": 42.539, | |
"eval_steps_per_second": 2.954, | |
"step": 68 | |
}, | |
{ | |
"epoch": 7.666666666666667, | |
"grad_norm": 0.05624835565686226, | |
"learning_rate": 1.1931818181818183e-05, | |
"loss": 0.0002, | |
"step": 69 | |
}, | |
{ | |
"epoch": 7.666666666666667, | |
"eval_accuracy": 0.8055555555555556, | |
"eval_loss": 1.3481894731521606, | |
"eval_runtime": 1.6884, | |
"eval_samples_per_second": 42.643, | |
"eval_steps_per_second": 2.961, | |
"step": 69 | |
}, | |
{ | |
"epoch": 7.777777777777778, | |
"grad_norm": 0.026880212128162384, | |
"learning_rate": 1.1363636363636365e-05, | |
"loss": 0.0001, | |
"step": 70 | |
}, | |
{ | |
"epoch": 7.777777777777778, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.3691877126693726, | |
"eval_runtime": 1.689, | |
"eval_samples_per_second": 42.628, | |
"eval_steps_per_second": 2.96, | |
"step": 70 | |
}, | |
{ | |
"epoch": 7.888888888888889, | |
"grad_norm": 0.014328360557556152, | |
"learning_rate": 1.0795454545454547e-05, | |
"loss": 0.0001, | |
"step": 71 | |
}, | |
{ | |
"epoch": 7.888888888888889, | |
"eval_accuracy": 0.7916666666666666, | |
"eval_loss": 1.3764379024505615, | |
"eval_runtime": 1.692, | |
"eval_samples_per_second": 42.552, | |
"eval_steps_per_second": 2.955, | |
"step": 71 | |
}, | |
{ | |
"epoch": 8.0, | |
"grad_norm": 0.004069478716701269, | |
"learning_rate": 1.0227272727272729e-05, | |
"loss": 0.0, | |
"step": 72 | |
}, | |
{ | |
"epoch": 8.0, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.3799124956130981, | |
"eval_runtime": 1.6421, | |
"eval_samples_per_second": 43.845, | |
"eval_steps_per_second": 3.045, | |
"step": 72 | |
}, | |
{ | |
"epoch": 8.11111111111111, | |
"grad_norm": 0.029666263610124588, | |
"learning_rate": 9.659090909090909e-06, | |
"loss": 0.0001, | |
"step": 73 | |
}, | |
{ | |
"epoch": 8.11111111111111, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.3902004957199097, | |
"eval_runtime": 1.6942, | |
"eval_samples_per_second": 42.498, | |
"eval_steps_per_second": 2.951, | |
"step": 73 | |
}, | |
{ | |
"epoch": 8.222222222222221, | |
"grad_norm": 0.005981099791824818, | |
"learning_rate": 9.090909090909091e-06, | |
"loss": 0.0, | |
"step": 74 | |
}, | |
{ | |
"epoch": 8.222222222222221, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.3968830108642578, | |
"eval_runtime": 1.6929, | |
"eval_samples_per_second": 42.531, | |
"eval_steps_per_second": 2.954, | |
"step": 74 | |
}, | |
{ | |
"epoch": 8.333333333333334, | |
"grad_norm": 0.008150852285325527, | |
"learning_rate": 8.522727272727273e-06, | |
"loss": 0.0001, | |
"step": 75 | |
}, | |
{ | |
"epoch": 8.333333333333334, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4002034664154053, | |
"eval_runtime": 1.692, | |
"eval_samples_per_second": 42.554, | |
"eval_steps_per_second": 2.955, | |
"step": 75 | |
}, | |
{ | |
"epoch": 8.444444444444445, | |
"grad_norm": 0.10769578814506531, | |
"learning_rate": 7.954545454545455e-06, | |
"loss": 0.0004, | |
"step": 76 | |
}, | |
{ | |
"epoch": 8.444444444444445, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4117597341537476, | |
"eval_runtime": 1.6904, | |
"eval_samples_per_second": 42.595, | |
"eval_steps_per_second": 2.958, | |
"step": 76 | |
}, | |
{ | |
"epoch": 8.555555555555555, | |
"grad_norm": 0.026451965793967247, | |
"learning_rate": 7.386363636363637e-06, | |
"loss": 0.0001, | |
"step": 77 | |
}, | |
{ | |
"epoch": 8.555555555555555, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4056729078292847, | |
"eval_runtime": 1.6873, | |
"eval_samples_per_second": 42.672, | |
"eval_steps_per_second": 2.963, | |
"step": 77 | |
}, | |
{ | |
"epoch": 8.666666666666666, | |
"grad_norm": 0.07714568078517914, | |
"learning_rate": 6.818181818181818e-06, | |
"loss": 0.0003, | |
"step": 78 | |
}, | |
{ | |
"epoch": 8.666666666666666, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4086921215057373, | |
"eval_runtime": 1.6884, | |
"eval_samples_per_second": 42.643, | |
"eval_steps_per_second": 2.961, | |
"step": 78 | |
}, | |
{ | |
"epoch": 8.777777777777779, | |
"grad_norm": 0.03726133331656456, | |
"learning_rate": 6.25e-06, | |
"loss": 0.0001, | |
"step": 79 | |
}, | |
{ | |
"epoch": 8.777777777777779, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4101053476333618, | |
"eval_runtime": 1.6929, | |
"eval_samples_per_second": 42.53, | |
"eval_steps_per_second": 2.954, | |
"step": 79 | |
}, | |
{ | |
"epoch": 8.88888888888889, | |
"grad_norm": 0.01760346069931984, | |
"learning_rate": 5.681818181818182e-06, | |
"loss": 0.0001, | |
"step": 80 | |
}, | |
{ | |
"epoch": 8.88888888888889, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4063137769699097, | |
"eval_runtime": 1.6938, | |
"eval_samples_per_second": 42.508, | |
"eval_steps_per_second": 2.952, | |
"step": 80 | |
}, | |
{ | |
"epoch": 9.0, | |
"grad_norm": 0.05216860771179199, | |
"learning_rate": 5.113636363636364e-06, | |
"loss": 0.0003, | |
"step": 81 | |
}, | |
{ | |
"epoch": 9.0, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4048177003860474, | |
"eval_runtime": 1.6922, | |
"eval_samples_per_second": 42.549, | |
"eval_steps_per_second": 2.955, | |
"step": 81 | |
}, | |
{ | |
"epoch": 9.11111111111111, | |
"grad_norm": 0.019384268671274185, | |
"learning_rate": 4.5454545454545455e-06, | |
"loss": 0.0001, | |
"step": 82 | |
}, | |
{ | |
"epoch": 9.11111111111111, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4030442237854004, | |
"eval_runtime": 1.6903, | |
"eval_samples_per_second": 42.595, | |
"eval_steps_per_second": 2.958, | |
"step": 82 | |
}, | |
{ | |
"epoch": 9.222222222222221, | |
"grad_norm": 0.04530010744929314, | |
"learning_rate": 3.9772727272727275e-06, | |
"loss": 0.0002, | |
"step": 83 | |
}, | |
{ | |
"epoch": 9.222222222222221, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.406543254852295, | |
"eval_runtime": 1.6884, | |
"eval_samples_per_second": 42.643, | |
"eval_steps_per_second": 2.961, | |
"step": 83 | |
}, | |
{ | |
"epoch": 9.333333333333334, | |
"grad_norm": 0.006314022000879049, | |
"learning_rate": 3.409090909090909e-06, | |
"loss": 0.0, | |
"step": 84 | |
}, | |
{ | |
"epoch": 9.333333333333334, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4123640060424805, | |
"eval_runtime": 1.6879, | |
"eval_samples_per_second": 42.656, | |
"eval_steps_per_second": 2.962, | |
"step": 84 | |
}, | |
{ | |
"epoch": 9.444444444444445, | |
"grad_norm": 0.010984798893332481, | |
"learning_rate": 2.840909090909091e-06, | |
"loss": 0.0001, | |
"step": 85 | |
}, | |
{ | |
"epoch": 9.444444444444445, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4055116176605225, | |
"eval_runtime": 1.6923, | |
"eval_samples_per_second": 42.547, | |
"eval_steps_per_second": 2.955, | |
"step": 85 | |
}, | |
{ | |
"epoch": 9.555555555555555, | |
"grad_norm": 0.04161923751235008, | |
"learning_rate": 2.2727272727272728e-06, | |
"loss": 0.0002, | |
"step": 86 | |
}, | |
{ | |
"epoch": 9.555555555555555, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4036388397216797, | |
"eval_runtime": 1.6941, | |
"eval_samples_per_second": 42.5, | |
"eval_steps_per_second": 2.951, | |
"step": 86 | |
}, | |
{ | |
"epoch": 9.666666666666666, | |
"grad_norm": 0.02397008240222931, | |
"learning_rate": 1.7045454545454546e-06, | |
"loss": 0.0001, | |
"step": 87 | |
}, | |
{ | |
"epoch": 9.666666666666666, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4052443504333496, | |
"eval_runtime": 1.6935, | |
"eval_samples_per_second": 42.517, | |
"eval_steps_per_second": 2.953, | |
"step": 87 | |
}, | |
{ | |
"epoch": 9.777777777777779, | |
"grad_norm": 0.020388655364513397, | |
"learning_rate": 1.1363636363636364e-06, | |
"loss": 0.0001, | |
"step": 88 | |
}, | |
{ | |
"epoch": 9.777777777777779, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.403831958770752, | |
"eval_runtime": 1.693, | |
"eval_samples_per_second": 42.529, | |
"eval_steps_per_second": 2.953, | |
"step": 88 | |
}, | |
{ | |
"epoch": 9.88888888888889, | |
"grad_norm": 0.03793564811348915, | |
"learning_rate": 5.681818181818182e-07, | |
"loss": 0.0002, | |
"step": 89 | |
}, | |
{ | |
"epoch": 9.88888888888889, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4065537452697754, | |
"eval_runtime": 1.6926, | |
"eval_samples_per_second": 42.537, | |
"eval_steps_per_second": 2.954, | |
"step": 89 | |
}, | |
{ | |
"epoch": 10.0, | |
"grad_norm": 0.003561729798093438, | |
"learning_rate": 0.0, | |
"loss": 0.0, | |
"step": 90 | |
}, | |
{ | |
"epoch": 10.0, | |
"eval_accuracy": 0.7777777777777778, | |
"eval_loss": 1.4016788005828857, | |
"eval_runtime": 1.693, | |
"eval_samples_per_second": 42.528, | |
"eval_steps_per_second": 2.953, | |
"step": 90 | |
}, | |
{ | |
"epoch": 10.0, | |
"step": 90, | |
"total_flos": 5016736558481408.0, | |
"train_loss": 0.22732360793484582, | |
"train_runtime": 316.2669, | |
"train_samples_per_second": 8.98, | |
"train_steps_per_second": 0.285 | |
} | |
], | |
"logging_steps": 1, | |
"max_steps": 90, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 10, | |
"save_steps": 500, | |
"total_flos": 5016736558481408.0, | |
"train_batch_size": 4, | |
"trial_name": null, | |
"trial_params": null | |
} | |