{
  "best_metric": 4.9971747398376465,
  "best_model_checkpoint": "distilbert_add_pre-training-complete/checkpoint-300000",
  "epoch": 83.9630562552477,
  "global_step": 300000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "learning_rate": 4.942130710236746e-05,
      "loss": 6.295,
      "step": 3573
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.15223537974860726,
      "eval_loss": 6.0701375007629395,
      "eval_runtime": 0.7526,
      "eval_samples_per_second": 636.494,
      "eval_steps_per_second": 10.63,
      "step": 3573
    },
    {
      "epoch": 2.0,
      "learning_rate": 4.882560853617873e-05,
      "loss": 6.0482,
      "step": 7146
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.15652710741946346,
      "eval_loss": 5.953271865844727,
      "eval_runtime": 0.74,
      "eval_samples_per_second": 647.279,
      "eval_steps_per_second": 10.81,
      "step": 7146
    },
    {
      "epoch": 3.0,
      "learning_rate": 4.823007669223075e-05,
      "loss": 5.9799,
      "step": 10719
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.15838858298685246,
      "eval_loss": 5.900791645050049,
      "eval_runtime": 0.7474,
      "eval_samples_per_second": 640.85,
      "eval_steps_per_second": 10.703,
      "step": 10719
    },
    {
      "epoch": 4.0,
      "learning_rate": 4.763454484828276e-05,
      "loss": 5.9378,
      "step": 14292
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.1544888689508215,
      "eval_loss": 5.899669647216797,
      "eval_runtime": 0.741,
      "eval_samples_per_second": 646.414,
      "eval_steps_per_second": 10.796,
      "step": 14292
    },
    {
      "epoch": 5.0,
      "learning_rate": 4.703901300433478e-05,
      "loss": 5.9057,
      "step": 17865
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.15363002922004945,
      "eval_loss": 5.890487194061279,
      "eval_runtime": 0.7428,
      "eval_samples_per_second": 644.873,
      "eval_steps_per_second": 10.77,
      "step": 17865
    },
    {
      "epoch": 6.0,
      "learning_rate": 4.644381460486829e-05,
      "loss": 5.8811,
      "step": 21438
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.15501974055273549,
      "eval_loss": 5.864607334136963,
      "eval_runtime": 0.7414,
      "eval_samples_per_second": 646.074,
      "eval_steps_per_second": 10.79,
      "step": 21438
    },
    {
      "epoch": 7.0,
      "learning_rate": 4.584828276092031e-05,
      "loss": 5.8617,
      "step": 25011
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.15343423240607648,
      "eval_loss": 5.832152366638184,
      "eval_runtime": 0.7408,
      "eval_samples_per_second": 646.572,
      "eval_steps_per_second": 10.799,
      "step": 25011
    },
    {
      "epoch": 8.0,
      "learning_rate": 4.525291763921307e-05,
      "loss": 5.844,
      "step": 28584
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.15228028968486984,
      "eval_loss": 5.856273651123047,
      "eval_runtime": 0.7422,
      "eval_samples_per_second": 645.395,
      "eval_steps_per_second": 10.779,
      "step": 28584
    },
    {
      "epoch": 9.0,
      "learning_rate": 4.4657219073024345e-05,
      "loss": 5.8297,
      "step": 32157
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.15479184129645152,
      "eval_loss": 5.835241794586182,
      "eval_runtime": 0.7411,
      "eval_samples_per_second": 646.294,
      "eval_steps_per_second": 10.794,
      "step": 32157
    },
    {
      "epoch": 10.0,
      "learning_rate": 4.4061853951317107e-05,
      "loss": 5.8175,
      "step": 35730
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.1558288740089093,
      "eval_loss": 5.813608646392822,
      "eval_runtime": 0.7417,
      "eval_samples_per_second": 645.841,
      "eval_steps_per_second": 10.786,
      "step": 35730
    },
    {
      "epoch": 11.0,
      "learning_rate": 4.3466322107369125e-05,
      "loss": 5.8056,
      "step": 39303
    },
    {
      "epoch": 11.0,
      "eval_accuracy": 0.15261607518586057,
      "eval_loss": 5.81471061706543,
      "eval_runtime": 0.7409,
      "eval_samples_per_second": 646.53,
      "eval_steps_per_second": 10.798,
      "step": 39303
    },
    {
      "epoch": 12.0,
      "learning_rate": 4.287095698566189e-05,
      "loss": 5.7921,
      "step": 42876
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.15484534935469516,
      "eval_loss": 5.802037239074707,
      "eval_runtime": 0.759,
      "eval_samples_per_second": 631.084,
      "eval_steps_per_second": 10.54,
      "step": 42876
    },
    {
      "epoch": 13.0,
      "learning_rate": 4.227542514171391e-05,
      "loss": 5.7777,
      "step": 46449
    },
    {
      "epoch": 13.0,
      "eval_accuracy": 0.1545157176914121,
      "eval_loss": 5.7890777587890625,
      "eval_runtime": 0.7548,
      "eval_samples_per_second": 634.635,
      "eval_steps_per_second": 10.599,
      "step": 46449
    },
    {
      "epoch": 14.0,
      "learning_rate": 4.168006002000667e-05,
      "loss": 5.7596,
      "step": 50022
    },
    {
      "epoch": 14.0,
      "eval_accuracy": 0.1587256836763462,
      "eval_loss": 5.736998081207275,
      "eval_runtime": 0.7569,
      "eval_samples_per_second": 632.812,
      "eval_steps_per_second": 10.569,
      "step": 50022
    },
    {
      "epoch": 15.0,
      "learning_rate": 4.1084694898299434e-05,
      "loss": 5.7414,
      "step": 53595
    },
    {
      "epoch": 15.0,
      "eval_accuracy": 0.16035909664749615,
      "eval_loss": 5.739556789398193,
      "eval_runtime": 0.7394,
      "eval_samples_per_second": 647.798,
      "eval_steps_per_second": 10.819,
      "step": 53595
    },
    {
      "epoch": 16.0,
      "learning_rate": 4.048899633211071e-05,
      "loss": 5.7243,
      "step": 57168
    },
    {
      "epoch": 16.0,
      "eval_accuracy": 0.15644779032482922,
      "eval_loss": 5.749042510986328,
      "eval_runtime": 0.745,
      "eval_samples_per_second": 642.952,
      "eval_steps_per_second": 10.738,
      "step": 57168
    },
    {
      "epoch": 17.0,
      "learning_rate": 3.989346448816272e-05,
      "loss": 5.6997,
      "step": 60741
    },
    {
      "epoch": 17.0,
      "eval_accuracy": 0.15605646376164706,
      "eval_loss": 5.713454723358154,
      "eval_runtime": 0.7453,
      "eval_samples_per_second": 642.723,
      "eval_steps_per_second": 10.734,
      "step": 60741
    },
    {
      "epoch": 18.0,
      "learning_rate": 3.929809936645549e-05,
      "loss": 5.6698,
      "step": 64314
    },
    {
      "epoch": 18.0,
      "eval_accuracy": 0.1619928905941432,
      "eval_loss": 5.685813903808594,
      "eval_runtime": 0.7411,
      "eval_samples_per_second": 646.313,
      "eval_steps_per_second": 10.794,
      "step": 64314
    },
    {
      "epoch": 19.0,
      "learning_rate": 3.870273424474825e-05,
      "loss": 5.6398,
      "step": 67887
    },
    {
      "epoch": 19.0,
      "eval_accuracy": 0.16435543303985822,
      "eval_loss": 5.673512935638428,
      "eval_runtime": 0.7428,
      "eval_samples_per_second": 644.852,
      "eval_steps_per_second": 10.77,
      "step": 67887
    },
    {
      "epoch": 20.0,
      "learning_rate": 3.810720240080027e-05,
      "loss": 5.6135,
      "step": 71460
    },
    {
      "epoch": 20.0,
      "eval_accuracy": 0.1681270846288654,
      "eval_loss": 5.617359638214111,
      "eval_runtime": 0.7426,
      "eval_samples_per_second": 645.022,
      "eval_steps_per_second": 10.773,
      "step": 71460
    },
    {
      "epoch": 21.0,
      "learning_rate": 3.751217072357452e-05,
      "loss": 5.5899,
      "step": 75033
    },
    {
      "epoch": 21.0,
      "eval_accuracy": 0.1684192862736324,
      "eval_loss": 5.619091510772705,
      "eval_runtime": 0.7423,
      "eval_samples_per_second": 645.301,
      "eval_steps_per_second": 10.777,
      "step": 75033
    },
    {
      "epoch": 22.0,
      "learning_rate": 3.69164721573858e-05,
      "loss": 5.5699,
      "step": 78606
    },
    {
      "epoch": 22.0,
      "eval_accuracy": 0.16689319288707669,
      "eval_loss": 5.5976715087890625,
      "eval_runtime": 0.7425,
      "eval_samples_per_second": 645.154,
      "eval_steps_per_second": 10.775,
      "step": 78606
    },
    {
      "epoch": 23.0,
      "learning_rate": 3.632094031343781e-05,
      "loss": 5.5487,
      "step": 82179
    },
    {
      "epoch": 23.0,
      "eval_accuracy": 0.16692754919499106,
      "eval_loss": 5.613892078399658,
      "eval_runtime": 0.7425,
      "eval_samples_per_second": 645.088,
      "eval_steps_per_second": 10.774,
      "step": 82179
    },
    {
      "epoch": 24.0,
      "learning_rate": 3.5725741913971326e-05,
      "loss": 5.529,
      "step": 85752
    },
    {
      "epoch": 24.0,
      "eval_accuracy": 0.17411228879080098,
      "eval_loss": 5.527188301086426,
      "eval_runtime": 0.7442,
      "eval_samples_per_second": 643.623,
      "eval_steps_per_second": 10.749,
      "step": 85752
    },
    {
      "epoch": 25.0,
      "learning_rate": 3.5130210070023345e-05,
      "loss": 5.512,
      "step": 89325
    },
    {
      "epoch": 25.0,
      "eval_accuracy": 0.17266307533424513,
      "eval_loss": 5.5271124839782715,
      "eval_runtime": 0.742,
      "eval_samples_per_second": 645.531,
      "eval_steps_per_second": 10.781,
      "step": 89325
    },
    {
      "epoch": 26.0,
      "learning_rate": 3.453467822607536e-05,
      "loss": 5.4939,
      "step": 92898
    },
    {
      "epoch": 26.0,
      "eval_accuracy": 0.1721401127408363,
      "eval_loss": 5.518980979919434,
      "eval_runtime": 0.7466,
      "eval_samples_per_second": 641.587,
      "eval_steps_per_second": 10.715,
      "step": 92898
    },
    {
      "epoch": 27.0,
      "learning_rate": 3.3939313104368124e-05,
      "loss": 5.4765,
      "step": 96471
    },
    {
      "epoch": 27.0,
      "eval_accuracy": 0.17704899557659257,
      "eval_loss": 5.482353687286377,
      "eval_runtime": 0.7454,
      "eval_samples_per_second": 642.613,
      "eval_steps_per_second": 10.733,
      "step": 96471
    },
    {
      "epoch": 28.0,
      "learning_rate": 3.334378126042014e-05,
      "loss": 5.4604,
      "step": 100044
    },
    {
      "epoch": 28.0,
      "eval_accuracy": 0.17465552419016742,
      "eval_loss": 5.515853404998779,
      "eval_runtime": 0.7444,
      "eval_samples_per_second": 643.514,
      "eval_steps_per_second": 10.748,
      "step": 100044
    },
    {
      "epoch": 29.0,
      "learning_rate": 3.274824941647216e-05,
      "loss": 5.4422,
      "step": 103617
    },
    {
      "epoch": 29.0,
      "eval_accuracy": 0.18071012188659247,
      "eval_loss": 5.457724571228027,
      "eval_runtime": 0.7437,
      "eval_samples_per_second": 644.106,
      "eval_steps_per_second": 10.758,
      "step": 103617
    },
    {
      "epoch": 30.0,
      "learning_rate": 3.215288429476492e-05,
      "loss": 5.4243,
      "step": 107190
    },
    {
      "epoch": 30.0,
      "eval_accuracy": 0.17723527446832202,
      "eval_loss": 5.4546217918396,
      "eval_runtime": 0.7437,
      "eval_samples_per_second": 644.065,
      "eval_steps_per_second": 10.757,
      "step": 107190
    },
    {
      "epoch": 31.0,
      "learning_rate": 3.155735245081694e-05,
      "loss": 5.408,
      "step": 110763
    },
    {
      "epoch": 31.0,
      "eval_accuracy": 0.18372376285301534,
      "eval_loss": 5.42967414855957,
      "eval_runtime": 0.7402,
      "eval_samples_per_second": 647.111,
      "eval_steps_per_second": 10.808,
      "step": 110763
    },
    {
      "epoch": 32.0,
      "learning_rate": 3.09619873291097e-05,
      "loss": 5.3915,
      "step": 114336
    },
    {
      "epoch": 32.0,
      "eval_accuracy": 0.18657804700207528,
      "eval_loss": 5.408855438232422,
      "eval_runtime": 0.7424,
      "eval_samples_per_second": 645.224,
      "eval_steps_per_second": 10.776,
      "step": 114336
    },
    {
      "epoch": 33.0,
      "learning_rate": 3.0366288762920974e-05,
      "loss": 5.3766,
      "step": 117909
    },
    {
      "epoch": 33.0,
      "eval_accuracy": 0.18478903369268443,
      "eval_loss": 5.399631023406982,
      "eval_runtime": 0.738,
      "eval_samples_per_second": 649.025,
      "eval_steps_per_second": 10.84,
      "step": 117909
    },
    {
      "epoch": 34.0,
      "learning_rate": 2.977092364121374e-05,
      "loss": 5.3594,
      "step": 121482
    },
    {
      "epoch": 34.0,
      "eval_accuracy": 0.1840884164346579,
      "eval_loss": 5.397375106811523,
      "eval_runtime": 0.7416,
      "eval_samples_per_second": 645.897,
      "eval_steps_per_second": 10.787,
      "step": 121482
    },
    {
      "epoch": 35.0,
      "learning_rate": 2.91755585195065e-05,
      "loss": 5.3451,
      "step": 125055
    },
    {
      "epoch": 35.0,
      "eval_accuracy": 0.19081023213883255,
      "eval_loss": 5.371816158294678,
      "eval_runtime": 0.7407,
      "eval_samples_per_second": 646.688,
      "eval_steps_per_second": 10.801,
      "step": 125055
    },
    {
      "epoch": 36.0,
      "learning_rate": 2.858002667555852e-05,
      "loss": 5.3294,
      "step": 128628
    },
    {
      "epoch": 36.0,
      "eval_accuracy": 0.18781625441696112,
      "eval_loss": 5.370606422424316,
      "eval_runtime": 0.7401,
      "eval_samples_per_second": 647.222,
      "eval_steps_per_second": 10.81,
      "step": 128628
    },
    {
      "epoch": 37.0,
      "learning_rate": 2.7984661553851283e-05,
      "loss": 5.3155,
      "step": 132201
    },
    {
      "epoch": 37.0,
      "eval_accuracy": 0.19025888053184598,
      "eval_loss": 5.367715835571289,
      "eval_runtime": 0.74,
      "eval_samples_per_second": 647.333,
      "eval_steps_per_second": 10.811,
      "step": 132201
    },
    {
      "epoch": 38.0,
      "learning_rate": 2.738929643214405e-05,
      "loss": 5.2996,
      "step": 135774
    },
    {
      "epoch": 38.0,
      "eval_accuracy": 0.1993961082531872,
      "eval_loss": 5.296998023986816,
      "eval_runtime": 0.741,
      "eval_samples_per_second": 646.422,
      "eval_steps_per_second": 10.796,
      "step": 135774
    },
    {
      "epoch": 39.0,
      "learning_rate": 2.679376458819607e-05,
      "loss": 5.287,
      "step": 139347
    },
    {
      "epoch": 39.0,
      "eval_accuracy": 0.1976698373388867,
      "eval_loss": 5.312738418579102,
      "eval_runtime": 0.7407,
      "eval_samples_per_second": 646.699,
      "eval_steps_per_second": 10.801,
      "step": 139347
    },
    {
      "epoch": 40.0,
      "learning_rate": 2.619856618872958e-05,
      "loss": 5.2735,
      "step": 142920
    },
    {
      "epoch": 40.0,
      "eval_accuracy": 0.1954803206793486,
      "eval_loss": 5.314471244812012,
      "eval_runtime": 0.7488,
      "eval_samples_per_second": 639.694,
      "eval_steps_per_second": 10.684,
      "step": 142920
    },
    {
      "epoch": 41.0,
      "learning_rate": 2.560286762254085e-05,
      "loss": 5.26,
      "step": 146493
    },
    {
      "epoch": 41.0,
      "eval_accuracy": 0.201684222384949,
      "eval_loss": 5.298509120941162,
      "eval_runtime": 0.7424,
      "eval_samples_per_second": 645.229,
      "eval_steps_per_second": 10.776,
      "step": 146493
    },
    {
      "epoch": 42.0,
      "learning_rate": 2.5007502500833613e-05,
      "loss": 5.2487,
      "step": 150066
    },
    {
      "epoch": 42.0,
      "eval_accuracy": 0.20246748544851978,
      "eval_loss": 5.266134262084961,
      "eval_runtime": 0.7667,
      "eval_samples_per_second": 624.736,
      "eval_steps_per_second": 10.434,
      "step": 150066
    },
    {
      "epoch": 43.0,
      "learning_rate": 2.441197065688563e-05,
      "loss": 5.2362,
      "step": 153639
    },
    {
      "epoch": 43.0,
      "eval_accuracy": 0.2031214848143982,
      "eval_loss": 5.2712297439575195,
      "eval_runtime": 0.7435,
      "eval_samples_per_second": 644.236,
      "eval_steps_per_second": 10.76,
      "step": 153639
    },
    {
      "epoch": 44.0,
      "learning_rate": 2.3816605535178393e-05,
      "loss": 5.2248,
      "step": 157212
    },
    {
      "epoch": 44.0,
      "eval_accuracy": 0.2048919322568828,
      "eval_loss": 5.2451701164245605,
      "eval_runtime": 0.7423,
      "eval_samples_per_second": 645.278,
      "eval_steps_per_second": 10.777,
      "step": 157212
    },
    {
      "epoch": 45.0,
      "learning_rate": 2.3221240413471157e-05,
      "loss": 5.2115,
      "step": 160785
    },
    {
      "epoch": 45.0,
      "eval_accuracy": 0.20544290288153683,
      "eval_loss": 5.232546806335449,
      "eval_runtime": 0.7431,
      "eval_samples_per_second": 644.607,
      "eval_steps_per_second": 10.766,
      "step": 160785
    },
    {
      "epoch": 46.0,
      "learning_rate": 2.2625708569523176e-05,
      "loss": 5.1998,
      "step": 164358
    },
    {
      "epoch": 46.0,
      "eval_accuracy": 0.20746922947226437,
      "eval_loss": 5.223347187042236,
      "eval_runtime": 0.742,
      "eval_samples_per_second": 645.58,
      "eval_steps_per_second": 10.782,
      "step": 164358
    },
    {
      "epoch": 47.0,
      "learning_rate": 2.2030176725575194e-05,
      "loss": 5.188,
      "step": 167931
    },
    {
      "epoch": 47.0,
      "eval_accuracy": 0.21180000560836768,
      "eval_loss": 5.199436664581299,
      "eval_runtime": 0.748,
      "eval_samples_per_second": 640.38,
      "eval_steps_per_second": 10.695,
      "step": 167931
    },
    {
      "epoch": 48.0,
      "learning_rate": 2.143481160386796e-05,
      "loss": 5.1779,
      "step": 171504
    },
    {
      "epoch": 48.0,
      "eval_accuracy": 0.20685218850510417,
      "eval_loss": 5.243590354919434,
      "eval_runtime": 0.7487,
      "eval_samples_per_second": 639.747,
      "eval_steps_per_second": 10.685,
      "step": 171504
    },
    {
      "epoch": 49.0,
      "learning_rate": 2.083944648216072e-05,
      "loss": 5.1664,
      "step": 175077
    },
    {
      "epoch": 49.0,
      "eval_accuracy": 0.2129035902940595,
      "eval_loss": 5.220259666442871,
      "eval_runtime": 0.7417,
      "eval_samples_per_second": 645.78,
      "eval_steps_per_second": 10.785,
      "step": 175077
    },
    {
      "epoch": 50.0,
      "learning_rate": 2.024391463821274e-05,
      "loss": 5.1546,
      "step": 178650
    },
    {
      "epoch": 50.0,
      "eval_accuracy": 0.21341669482933423,
      "eval_loss": 5.181967258453369,
      "eval_runtime": 0.74,
      "eval_samples_per_second": 647.263,
      "eval_steps_per_second": 10.81,
      "step": 178650
    },
    {
      "epoch": 51.0,
      "learning_rate": 1.9648549516505503e-05,
      "loss": 5.1431,
      "step": 182223
    },
    {
      "epoch": 51.0,
      "eval_accuracy": 0.2122388143648584,
      "eval_loss": 5.202933311462402,
      "eval_runtime": 0.7441,
      "eval_samples_per_second": 643.704,
      "eval_steps_per_second": 10.751,
      "step": 182223
    },
    {
      "epoch": 52.0,
      "learning_rate": 1.9053184394798267e-05,
      "loss": 5.133,
      "step": 185796
    },
    {
      "epoch": 52.0,
      "eval_accuracy": 0.2139890957371677,
      "eval_loss": 5.145828723907471,
      "eval_runtime": 0.7427,
      "eval_samples_per_second": 644.964,
      "eval_steps_per_second": 10.772,
      "step": 185796
    },
    {
      "epoch": 53.0,
      "learning_rate": 1.8457652550850286e-05,
      "loss": 5.1226,
      "step": 189369
    },
    {
      "epoch": 53.0,
      "eval_accuracy": 0.21629691642249196,
      "eval_loss": 5.175747394561768,
      "eval_runtime": 0.7414,
      "eval_samples_per_second": 646.07,
      "eval_steps_per_second": 10.79,
      "step": 189369
    },
    {
      "epoch": 54.0,
      "learning_rate": 1.7862287429143047e-05,
      "loss": 5.1138,
      "step": 192942
    },
    {
      "epoch": 54.0,
      "eval_accuracy": 0.21931465914485995,
      "eval_loss": 5.137957572937012,
      "eval_runtime": 0.7463,
      "eval_samples_per_second": 641.826,
      "eval_steps_per_second": 10.719,
      "step": 192942
    },
    {
      "epoch": 55.0,
      "learning_rate": 1.726692230743581e-05,
      "loss": 5.1046,
      "step": 196515
    },
    {
      "epoch": 55.0,
      "eval_accuracy": 0.2177620690622458,
      "eval_loss": 5.1497979164123535,
      "eval_runtime": 0.743,
      "eval_samples_per_second": 644.669,
      "eval_steps_per_second": 10.767,
      "step": 196515
    },
    {
      "epoch": 56.0,
      "learning_rate": 1.6671557185728576e-05,
      "loss": 5.0984,
      "step": 200088
    },
    {
      "epoch": 56.0,
      "eval_accuracy": 0.21936753338018272,
      "eval_loss": 5.1094207763671875,
      "eval_runtime": 0.7432,
      "eval_samples_per_second": 644.503,
      "eval_steps_per_second": 10.764,
      "step": 200088
    },
    {
      "epoch": 57.0,
      "learning_rate": 1.6076025341780594e-05,
      "loss": 5.0907,
      "step": 203661
    },
    {
      "epoch": 57.0,
      "eval_accuracy": 0.22019682397673898,
      "eval_loss": 5.135354995727539,
      "eval_runtime": 0.7592,
      "eval_samples_per_second": 630.904,
      "eval_steps_per_second": 10.537,
      "step": 203661
    },
    {
      "epoch": 58.0,
      "learning_rate": 1.548066022007336e-05,
      "loss": 5.0812,
      "step": 207234
    },
    {
      "epoch": 58.0,
      "eval_accuracy": 0.22555465405813727,
      "eval_loss": 5.066197395324707,
      "eval_runtime": 0.7435,
      "eval_samples_per_second": 644.224,
      "eval_steps_per_second": 10.759,
      "step": 207234
    },
    {
      "epoch": 59.0,
      "learning_rate": 1.4885295098366123e-05,
      "loss": 5.0748,
      "step": 210807
    },
    {
      "epoch": 59.0,
      "eval_accuracy": 0.21810022855231617,
      "eval_loss": 5.116298198699951,
      "eval_runtime": 0.7413,
      "eval_samples_per_second": 646.139,
      "eval_steps_per_second": 10.791,
      "step": 210807
    },
    {
      "epoch": 60.0,
      "learning_rate": 1.428976325441814e-05,
      "loss": 5.067,
      "step": 214380
    },
    {
      "epoch": 60.0,
      "eval_accuracy": 0.2199191102123357,
      "eval_loss": 5.119345664978027,
      "eval_runtime": 0.7438,
      "eval_samples_per_second": 643.957,
      "eval_steps_per_second": 10.755,
      "step": 214380
    },
    {
      "epoch": 61.0,
      "learning_rate": 1.3694398132710903e-05,
      "loss": 5.0609,
      "step": 217953
    },
    {
      "epoch": 61.0,
      "eval_accuracy": 0.22237028400099942,
      "eval_loss": 5.091867446899414,
      "eval_runtime": 0.7487,
      "eval_samples_per_second": 639.76,
      "eval_steps_per_second": 10.685,
      "step": 217953
    },
    {
      "epoch": 62.0,
      "learning_rate": 1.309903301100367e-05,
      "loss": 5.0536,
      "step": 221526
    },
    {
      "epoch": 62.0,
      "eval_accuracy": 0.2238659606063153,
      "eval_loss": 5.089934349060059,
      "eval_runtime": 0.7427,
      "eval_samples_per_second": 644.961,
      "eval_steps_per_second": 10.772,
      "step": 221526
    },
    {
      "epoch": 63.0,
      "learning_rate": 1.2503667889296434e-05,
      "loss": 5.0491,
      "step": 225099
    },
    {
      "epoch": 63.0,
      "eval_accuracy": 0.22235195996663887,
      "eval_loss": 5.112506866455078,
      "eval_runtime": 0.744,
      "eval_samples_per_second": 643.802,
      "eval_steps_per_second": 10.752,
      "step": 225099
    },
    {
      "epoch": 64.0,
      "learning_rate": 1.1907969323107703e-05,
      "loss": 5.0433,
      "step": 228672
    },
    {
      "epoch": 64.0,
      "eval_accuracy": 0.22262113416633686,
      "eval_loss": 5.0892157554626465,
      "eval_runtime": 0.7431,
      "eval_samples_per_second": 644.558,
      "eval_steps_per_second": 10.765,
      "step": 228672
    },
    {
      "epoch": 65.0,
      "learning_rate": 1.1312604201400467e-05,
      "loss": 5.0373,
      "step": 232245
    },
    {
      "epoch": 65.0,
      "eval_accuracy": 0.22597519475771297,
      "eval_loss": 5.064445972442627,
      "eval_runtime": 0.7398,
      "eval_samples_per_second": 647.432,
      "eval_steps_per_second": 10.813,
      "step": 232245
    },
    {
      "epoch": 66.0,
      "learning_rate": 1.0717239079693232e-05,
      "loss": 5.032,
      "step": 235818
    },
    {
      "epoch": 66.0,
      "eval_accuracy": 0.22527426278723942,
      "eval_loss": 5.062305927276611,
      "eval_runtime": 0.7441,
      "eval_samples_per_second": 643.745,
      "eval_steps_per_second": 10.751,
      "step": 235818
    },
    {
      "epoch": 67.0,
      "learning_rate": 1.0121873957985996e-05,
      "loss": 5.0283,
      "step": 239391
    },
    {
      "epoch": 67.0,
      "eval_accuracy": 0.22131170544158504,
      "eval_loss": 5.100430011749268,
      "eval_runtime": 0.7454,
      "eval_samples_per_second": 642.6,
      "eval_steps_per_second": 10.732,
      "step": 239391
    },
    {
      "epoch": 68.0,
      "learning_rate": 9.526342114038013e-06,
      "loss": 5.0223,
      "step": 242964
    },
    {
      "epoch": 68.0,
      "eval_accuracy": 0.22790462549498694,
      "eval_loss": 5.057290077209473,
      "eval_runtime": 0.7453,
      "eval_samples_per_second": 642.666,
      "eval_steps_per_second": 10.733,
      "step": 242964
    },
    {
      "epoch": 69.0,
      "learning_rate": 8.930976992330777e-06,
      "loss": 5.0184,
      "step": 246537
    },
    {
      "epoch": 69.0,
      "eval_accuracy": 0.22710971646307296,
      "eval_loss": 5.048848628997803,
      "eval_runtime": 0.7418,
      "eval_samples_per_second": 645.766,
      "eval_steps_per_second": 10.785,
      "step": 246537
    },
    {
      "epoch": 70.0,
      "learning_rate": 8.335445148382794e-06,
      "loss": 5.014,
      "step": 250110
    },
    {
      "epoch": 70.0,
      "eval_accuracy": 0.22795434198746642,
      "eval_loss": 5.048243999481201,
      "eval_runtime": 0.7433,
      "eval_samples_per_second": 644.41,
      "eval_steps_per_second": 10.763,
      "step": 250110
    },
    {
      "epoch": 71.0,
      "learning_rate": 7.740246748916306e-06,
      "loss": 5.0102,
      "step": 253683
    },
    {
      "epoch": 71.0,
      "eval_accuracy": 0.22687027178481367,
      "eval_loss": 5.060031414031982,
      "eval_runtime": 0.7439,
      "eval_samples_per_second": 643.923,
      "eval_steps_per_second": 10.754,
      "step": 253683
    },
    {
      "epoch": 72.0,
      "learning_rate": 7.144714904968323e-06,
      "loss": 5.0079,
      "step": 257256
    },
    {
      "epoch": 72.0,
      "eval_accuracy": 0.2278910610607803,
      "eval_loss": 5.027137279510498,
      "eval_runtime": 0.7414,
      "eval_samples_per_second": 646.086,
      "eval_steps_per_second": 10.791,
      "step": 257256
    },
    {
      "epoch": 73.0,
      "learning_rate": 6.549349783261087e-06,
      "loss": 5.0029,
      "step": 260829
    },
    {
      "epoch": 73.0,
      "eval_accuracy": 0.22669124629908943,
      "eval_loss": 5.062880039215088,
      "eval_runtime": 0.7483,
      "eval_samples_per_second": 640.112,
      "eval_steps_per_second": 10.691,
      "step": 260829
    },
    {
      "epoch": 74.0,
      "learning_rate": 5.9538179393131045e-06,
      "loss": 4.9994,
      "step": 264402
    },
    {
      "epoch": 74.0,
      "eval_accuracy": 0.22970634211701652,
      "eval_loss": 5.030394554138184,
      "eval_runtime": 0.7474,
      "eval_samples_per_second": 640.862,
      "eval_steps_per_second": 10.703,
      "step": 264402
    },
    {
      "epoch": 75.0,
      "learning_rate": 5.358286095365122e-06,
      "loss": 4.9978,
      "step": 267975
    },
    {
      "epoch": 75.0,
      "eval_accuracy": 0.22689803559951938,
      "eval_loss": 5.048463821411133,
      "eval_runtime": 0.7472,
      "eval_samples_per_second": 641.032,
      "eval_steps_per_second": 10.706,
      "step": 267975
    },
    {
      "epoch": 76.0,
      "learning_rate": 4.7630876958986335e-06,
      "loss": 4.9945,
      "step": 271548
    },
    {
      "epoch": 76.0,
      "eval_accuracy": 0.23055979997221837,
      "eval_loss": 5.037987232208252,
      "eval_runtime": 0.7423,
      "eval_samples_per_second": 645.335,
      "eval_steps_per_second": 10.778,
      "step": 271548
    },
    {
      "epoch": 77.0,
      "learning_rate": 4.167389129709903e-06,
      "loss": 4.9917,
      "step": 275121
    },
    {
      "epoch": 77.0,
      "eval_accuracy": 0.2264717436444245,
      "eval_loss": 5.058966636657715,
      "eval_runtime": 0.7422,
      "eval_samples_per_second": 645.356,
      "eval_steps_per_second": 10.778,
      "step": 275121
    },
    {
      "epoch": 78.0,
      "learning_rate": 3.5720240080026674e-06,
      "loss": 4.9913,
      "step": 278694
    },
    {
      "epoch": 78.0,
      "eval_accuracy": 0.226176645603695,
      "eval_loss": 5.058542728424072,
      "eval_runtime": 0.7385,
      "eval_samples_per_second": 648.623,
      "eval_steps_per_second": 10.833,
      "step": 278694
    },
    {
      "epoch": 79.0,
      "learning_rate": 2.976658886295432e-06,
      "loss": 4.987,
      "step": 282267
    },
    {
      "epoch": 79.0,
      "eval_accuracy": 0.2278051787916153,
      "eval_loss": 5.033863544464111,
      "eval_runtime": 0.7435,
      "eval_samples_per_second": 644.245,
      "eval_steps_per_second": 10.76,
      "step": 282267
    },
    {
      "epoch": 80.0,
      "learning_rate": 2.3812937645881964e-06,
      "loss": 4.9862,
      "step": 285840
    },
    {
      "epoch": 80.0,
      "eval_accuracy": 0.2305035650623886,
      "eval_loss": 5.021360397338867,
      "eval_runtime": 0.7486,
      "eval_samples_per_second": 639.881,
      "eval_steps_per_second": 10.687,
      "step": 285840
    },
    {
      "epoch": 81.0,
      "learning_rate": 1.7857619206402134e-06,
      "loss": 4.9841,
      "step": 289413
    },
    {
      "epoch": 81.0,
      "eval_accuracy": 0.227092194586068,
      "eval_loss": 5.039330005645752,
      "eval_runtime": 0.7471,
      "eval_samples_per_second": 641.165,
      "eval_steps_per_second": 10.708,
      "step": 289413
    },
    {
      "epoch": 82.0,
      "learning_rate": 1.1903967989329777e-06,
      "loss": 4.983,
      "step": 292986
    },
    {
      "epoch": 82.0,
      "eval_accuracy": 0.22981801867481705,
      "eval_loss": 5.02002477645874,
      "eval_runtime": 0.747,
      "eval_samples_per_second": 641.199,
      "eval_steps_per_second": 10.709,
      "step": 292986
    },
    {
      "epoch": 83.0,
      "learning_rate": 5.95031677225742e-07,
      "loss": 4.9816,
      "step": 296559
    },
    {
      "epoch": 83.0,
      "eval_accuracy": 0.22995899106791753,
      "eval_loss": 5.028861045837402,
      "eval_runtime": 0.7441,
      "eval_samples_per_second": 643.773,
      "eval_steps_per_second": 10.752,
      "step": 296559
    },
    {
      "epoch": 83.96,
      "learning_rate": 2.1340446815605204e-08,
      "loss": 4.9801,
      "step": 300000
    },
    {
      "epoch": 83.96,
      "eval_accuracy": 0.23321614400225005,
      "eval_loss": 4.9971747398376465,
      "eval_runtime": 0.7444,
      "eval_samples_per_second": 643.479,
      "eval_steps_per_second": 10.747,
      "step": 300000
    },
    {
      "epoch": 83.96,
      "step": 300000,
      "total_flos": 2.3423641855784387e+18,
      "train_loss": 5.338385032552083,
      "train_runtime": 55456.4482,
      "train_samples_per_second": 346.218,
      "train_steps_per_second": 5.41
    }
  ],
  "max_steps": 300000,
  "num_train_epochs": 84,
  "total_flos": 2.3423641855784387e+18,
  "trial_name": null,
  "trial_params": null
}