esunn's picture
Upload folder using huggingface_hub
2f1ccda verified
raw
history blame
15.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.846153846153846,
"eval_steps": 1,
"global_step": 40,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.20512820512820512,
"grad_norm": 32.65580368041992,
"learning_rate": 2e-07,
"loss": 2.9998,
"step": 1
},
{
"epoch": 0.20512820512820512,
"eval_loss": 3.0381886959075928,
"eval_runtime": 0.1881,
"eval_samples_per_second": 175.434,
"eval_steps_per_second": 26.581,
"step": 1
},
{
"epoch": 0.41025641025641024,
"grad_norm": 29.269834518432617,
"learning_rate": 4e-07,
"loss": 3.0081,
"step": 2
},
{
"epoch": 0.41025641025641024,
"eval_loss": 3.0378756523132324,
"eval_runtime": 0.1873,
"eval_samples_per_second": 176.167,
"eval_steps_per_second": 26.692,
"step": 2
},
{
"epoch": 0.6153846153846154,
"grad_norm": 31.252574920654297,
"learning_rate": 6e-07,
"loss": 2.9024,
"step": 3
},
{
"epoch": 0.6153846153846154,
"eval_loss": 3.0356383323669434,
"eval_runtime": 0.1862,
"eval_samples_per_second": 177.272,
"eval_steps_per_second": 26.859,
"step": 3
},
{
"epoch": 0.8205128205128205,
"grad_norm": 26.509944915771484,
"learning_rate": 8e-07,
"loss": 2.9814,
"step": 4
},
{
"epoch": 0.8205128205128205,
"eval_loss": 3.0280070304870605,
"eval_runtime": 0.1851,
"eval_samples_per_second": 178.263,
"eval_steps_per_second": 27.01,
"step": 4
},
{
"epoch": 1.0256410256410255,
"grad_norm": 26.16226577758789,
"learning_rate": 1e-06,
"loss": 2.9813,
"step": 5
},
{
"epoch": 1.0256410256410255,
"eval_loss": 3.0136334896087646,
"eval_runtime": 0.1863,
"eval_samples_per_second": 177.152,
"eval_steps_per_second": 26.841,
"step": 5
},
{
"epoch": 1.1794871794871795,
"grad_norm": 27.74286651611328,
"learning_rate": 1.2e-06,
"loss": 2.9137,
"step": 6
},
{
"epoch": 1.1794871794871795,
"eval_loss": 2.991811990737915,
"eval_runtime": 0.1876,
"eval_samples_per_second": 175.952,
"eval_steps_per_second": 26.659,
"step": 6
},
{
"epoch": 1.3846153846153846,
"grad_norm": 24.41265106201172,
"learning_rate": 1.4e-06,
"loss": 2.9909,
"step": 7
},
{
"epoch": 1.3846153846153846,
"eval_loss": 2.942638635635376,
"eval_runtime": 0.1874,
"eval_samples_per_second": 176.073,
"eval_steps_per_second": 26.678,
"step": 7
},
{
"epoch": 1.5897435897435899,
"grad_norm": 24.35742950439453,
"learning_rate": 1.6e-06,
"loss": 2.8925,
"step": 8
},
{
"epoch": 1.5897435897435899,
"eval_loss": 2.9046568870544434,
"eval_runtime": 0.1871,
"eval_samples_per_second": 176.344,
"eval_steps_per_second": 26.719,
"step": 8
},
{
"epoch": 1.7948717948717947,
"grad_norm": 18.749122619628906,
"learning_rate": 1.8e-06,
"loss": 2.825,
"step": 9
},
{
"epoch": 1.7948717948717947,
"eval_loss": 2.8789775371551514,
"eval_runtime": 0.1878,
"eval_samples_per_second": 175.683,
"eval_steps_per_second": 26.619,
"step": 9
},
{
"epoch": 2.0,
"grad_norm": 18.693262100219727,
"learning_rate": 2e-06,
"loss": 2.8329,
"step": 10
},
{
"epoch": 2.0,
"eval_loss": 2.794933319091797,
"eval_runtime": 0.1863,
"eval_samples_per_second": 177.152,
"eval_steps_per_second": 26.841,
"step": 10
},
{
"epoch": 2.1538461538461537,
"grad_norm": 14.351330757141113,
"learning_rate": 1.994521895368273e-06,
"loss": 2.6496,
"step": 11
},
{
"epoch": 2.1538461538461537,
"eval_loss": 2.763171911239624,
"eval_runtime": 0.1853,
"eval_samples_per_second": 178.074,
"eval_steps_per_second": 26.981,
"step": 11
},
{
"epoch": 2.358974358974359,
"grad_norm": 14.6412992477417,
"learning_rate": 1.9781476007338054e-06,
"loss": 2.6857,
"step": 12
},
{
"epoch": 2.358974358974359,
"eval_loss": 2.7388267517089844,
"eval_runtime": 0.187,
"eval_samples_per_second": 176.511,
"eval_steps_per_second": 26.744,
"step": 12
},
{
"epoch": 2.564102564102564,
"grad_norm": 16.191911697387695,
"learning_rate": 1.9510565162951534e-06,
"loss": 2.679,
"step": 13
},
{
"epoch": 2.564102564102564,
"eval_loss": 2.719318389892578,
"eval_runtime": 0.1879,
"eval_samples_per_second": 175.654,
"eval_steps_per_second": 26.614,
"step": 13
},
{
"epoch": 2.769230769230769,
"grad_norm": 14.66492748260498,
"learning_rate": 1.9135454576426007e-06,
"loss": 2.6802,
"step": 14
},
{
"epoch": 2.769230769230769,
"eval_loss": 2.674811840057373,
"eval_runtime": 0.1872,
"eval_samples_per_second": 176.31,
"eval_steps_per_second": 26.714,
"step": 14
},
{
"epoch": 2.9743589743589745,
"grad_norm": 12.284189224243164,
"learning_rate": 1.8660254037844386e-06,
"loss": 2.6269,
"step": 15
},
{
"epoch": 2.9743589743589745,
"eval_loss": 2.6451773643493652,
"eval_runtime": 0.1889,
"eval_samples_per_second": 174.656,
"eval_steps_per_second": 26.463,
"step": 15
},
{
"epoch": 3.128205128205128,
"grad_norm": 12.785527229309082,
"learning_rate": 1.8090169943749474e-06,
"loss": 2.5546,
"step": 16
},
{
"epoch": 3.128205128205128,
"eval_loss": 2.628568410873413,
"eval_runtime": 0.1861,
"eval_samples_per_second": 177.366,
"eval_steps_per_second": 26.874,
"step": 16
},
{
"epoch": 3.3333333333333335,
"grad_norm": 12.565117835998535,
"learning_rate": 1.743144825477394e-06,
"loss": 2.574,
"step": 17
},
{
"epoch": 3.3333333333333335,
"eval_loss": 2.616790294647217,
"eval_runtime": 0.1874,
"eval_samples_per_second": 176.09,
"eval_steps_per_second": 26.68,
"step": 17
},
{
"epoch": 3.5384615384615383,
"grad_norm": 12.94242000579834,
"learning_rate": 1.669130606358858e-06,
"loss": 2.5548,
"step": 18
},
{
"epoch": 3.5384615384615383,
"eval_loss": 2.6054270267486572,
"eval_runtime": 0.1852,
"eval_samples_per_second": 178.208,
"eval_steps_per_second": 27.001,
"step": 18
},
{
"epoch": 3.7435897435897436,
"grad_norm": 11.304039001464844,
"learning_rate": 1.587785252292473e-06,
"loss": 2.5145,
"step": 19
},
{
"epoch": 3.7435897435897436,
"eval_loss": 2.595207691192627,
"eval_runtime": 0.1861,
"eval_samples_per_second": 177.318,
"eval_steps_per_second": 26.866,
"step": 19
},
{
"epoch": 3.948717948717949,
"grad_norm": 11.087238311767578,
"learning_rate": 1.5e-06,
"loss": 2.452,
"step": 20
},
{
"epoch": 3.948717948717949,
"eval_loss": 2.5863306522369385,
"eval_runtime": 0.1862,
"eval_samples_per_second": 177.233,
"eval_steps_per_second": 26.854,
"step": 20
},
{
"epoch": 4.102564102564102,
"grad_norm": 10.603784561157227,
"learning_rate": 1.4067366430758004e-06,
"loss": 2.4647,
"step": 21
},
{
"epoch": 4.102564102564102,
"eval_loss": 2.5786077976226807,
"eval_runtime": 0.1867,
"eval_samples_per_second": 176.801,
"eval_steps_per_second": 26.788,
"step": 21
},
{
"epoch": 4.3076923076923075,
"grad_norm": 10.523798942565918,
"learning_rate": 1.3090169943749473e-06,
"loss": 2.423,
"step": 22
},
{
"epoch": 4.3076923076923075,
"eval_loss": 2.5714633464813232,
"eval_runtime": 0.1854,
"eval_samples_per_second": 178.035,
"eval_steps_per_second": 26.975,
"step": 22
},
{
"epoch": 4.512820512820513,
"grad_norm": 9.499349594116211,
"learning_rate": 1.207911690817759e-06,
"loss": 2.4104,
"step": 23
},
{
"epoch": 4.512820512820513,
"eval_loss": 2.5648255348205566,
"eval_runtime": 0.1877,
"eval_samples_per_second": 175.778,
"eval_steps_per_second": 26.633,
"step": 23
},
{
"epoch": 4.717948717948718,
"grad_norm": 9.946209907531738,
"learning_rate": 1.1045284632676535e-06,
"loss": 2.3664,
"step": 24
},
{
"epoch": 4.717948717948718,
"eval_loss": 2.5592212677001953,
"eval_runtime": 0.1861,
"eval_samples_per_second": 177.323,
"eval_steps_per_second": 26.867,
"step": 24
},
{
"epoch": 4.923076923076923,
"grad_norm": 9.741501808166504,
"learning_rate": 1e-06,
"loss": 2.4211,
"step": 25
},
{
"epoch": 4.923076923076923,
"eval_loss": 2.5535762310028076,
"eval_runtime": 0.1872,
"eval_samples_per_second": 176.274,
"eval_steps_per_second": 26.708,
"step": 25
},
{
"epoch": 5.076923076923077,
"grad_norm": 10.652682304382324,
"learning_rate": 8.954715367323466e-07,
"loss": 2.4291,
"step": 26
},
{
"epoch": 5.076923076923077,
"eval_loss": 2.549236536026001,
"eval_runtime": 0.1887,
"eval_samples_per_second": 174.886,
"eval_steps_per_second": 26.498,
"step": 26
},
{
"epoch": 5.282051282051282,
"grad_norm": 9.138431549072266,
"learning_rate": 7.920883091822408e-07,
"loss": 2.3475,
"step": 27
},
{
"epoch": 5.282051282051282,
"eval_loss": 2.5455117225646973,
"eval_runtime": 0.1869,
"eval_samples_per_second": 176.553,
"eval_steps_per_second": 26.75,
"step": 27
},
{
"epoch": 5.487179487179487,
"grad_norm": 9.32693099975586,
"learning_rate": 6.909830056250526e-07,
"loss": 2.3665,
"step": 28
},
{
"epoch": 5.487179487179487,
"eval_loss": 2.541745901107788,
"eval_runtime": 0.1868,
"eval_samples_per_second": 176.668,
"eval_steps_per_second": 26.768,
"step": 28
},
{
"epoch": 5.6923076923076925,
"grad_norm": 9.5020751953125,
"learning_rate": 5.932633569241999e-07,
"loss": 2.3862,
"step": 29
},
{
"epoch": 5.6923076923076925,
"eval_loss": 2.5386736392974854,
"eval_runtime": 0.187,
"eval_samples_per_second": 176.452,
"eval_steps_per_second": 26.735,
"step": 29
},
{
"epoch": 5.897435897435898,
"grad_norm": 10.226723670959473,
"learning_rate": 5.000000000000002e-07,
"loss": 2.3784,
"step": 30
},
{
"epoch": 5.897435897435898,
"eval_loss": 2.5360124111175537,
"eval_runtime": 0.186,
"eval_samples_per_second": 177.425,
"eval_steps_per_second": 26.883,
"step": 30
},
{
"epoch": 6.051282051282051,
"grad_norm": 10.043070793151855,
"learning_rate": 4.1221474770752696e-07,
"loss": 2.354,
"step": 31
},
{
"epoch": 6.051282051282051,
"eval_loss": 2.5342884063720703,
"eval_runtime": 0.1871,
"eval_samples_per_second": 176.386,
"eval_steps_per_second": 26.725,
"step": 31
},
{
"epoch": 6.256410256410256,
"grad_norm": 9.647918701171875,
"learning_rate": 3.308693936411421e-07,
"loss": 2.3442,
"step": 32
},
{
"epoch": 6.256410256410256,
"eval_loss": 2.532135248184204,
"eval_runtime": 0.1858,
"eval_samples_per_second": 177.644,
"eval_steps_per_second": 26.916,
"step": 32
},
{
"epoch": 6.461538461538462,
"grad_norm": 9.200613975524902,
"learning_rate": 2.568551745226056e-07,
"loss": 2.3499,
"step": 33
},
{
"epoch": 6.461538461538462,
"eval_loss": 2.5312461853027344,
"eval_runtime": 0.1865,
"eval_samples_per_second": 176.909,
"eval_steps_per_second": 26.804,
"step": 33
},
{
"epoch": 6.666666666666667,
"grad_norm": 9.6244535446167,
"learning_rate": 1.9098300562505264e-07,
"loss": 2.3312,
"step": 34
},
{
"epoch": 6.666666666666667,
"eval_loss": 2.5296669006347656,
"eval_runtime": 0.1862,
"eval_samples_per_second": 177.266,
"eval_steps_per_second": 26.859,
"step": 34
},
{
"epoch": 6.871794871794872,
"grad_norm": 9.38110065460205,
"learning_rate": 1.3397459621556128e-07,
"loss": 2.3551,
"step": 35
},
{
"epoch": 6.871794871794872,
"eval_loss": 2.5289077758789062,
"eval_runtime": 0.1858,
"eval_samples_per_second": 177.582,
"eval_steps_per_second": 26.906,
"step": 35
},
{
"epoch": 7.0256410256410255,
"grad_norm": 9.125926971435547,
"learning_rate": 8.645454235739902e-08,
"loss": 2.3363,
"step": 36
},
{
"epoch": 7.0256410256410255,
"eval_loss": 2.5289089679718018,
"eval_runtime": 0.1865,
"eval_samples_per_second": 176.913,
"eval_steps_per_second": 26.805,
"step": 36
},
{
"epoch": 7.230769230769231,
"grad_norm": 9.84389591217041,
"learning_rate": 4.8943483704846465e-08,
"loss": 2.3691,
"step": 37
},
{
"epoch": 7.230769230769231,
"eval_loss": 2.5284206867218018,
"eval_runtime": 0.188,
"eval_samples_per_second": 175.495,
"eval_steps_per_second": 26.59,
"step": 37
},
{
"epoch": 7.435897435897436,
"grad_norm": 9.293142318725586,
"learning_rate": 2.185239926619431e-08,
"loss": 2.3267,
"step": 38
},
{
"epoch": 7.435897435897436,
"eval_loss": 2.528106689453125,
"eval_runtime": 0.1858,
"eval_samples_per_second": 177.588,
"eval_steps_per_second": 26.907,
"step": 38
},
{
"epoch": 7.641025641025641,
"grad_norm": 9.073277473449707,
"learning_rate": 5.47810463172671e-09,
"loss": 2.3389,
"step": 39
},
{
"epoch": 7.641025641025641,
"eval_loss": 2.528116464614868,
"eval_runtime": 0.186,
"eval_samples_per_second": 177.392,
"eval_steps_per_second": 26.878,
"step": 39
},
{
"epoch": 7.846153846153846,
"grad_norm": 9.473708152770996,
"learning_rate": 0.0,
"loss": 2.1969,
"step": 40
},
{
"epoch": 7.846153846153846,
"eval_loss": 2.5279667377471924,
"eval_runtime": 0.187,
"eval_samples_per_second": 176.514,
"eval_steps_per_second": 26.745,
"step": 40
}
],
"logging_steps": 1,
"max_steps": 40,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 4,
"total_flos": 1.0427550308237312e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}