|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 7.846153846153846, |
|
"eval_steps": 1, |
|
"global_step": 40, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.20512820512820512, |
|
"grad_norm": 32.65580368041992, |
|
"learning_rate": 2e-07, |
|
"loss": 2.9998, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.20512820512820512, |
|
"eval_loss": 3.0381886959075928, |
|
"eval_runtime": 0.1881, |
|
"eval_samples_per_second": 175.434, |
|
"eval_steps_per_second": 26.581, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.41025641025641024, |
|
"grad_norm": 29.269834518432617, |
|
"learning_rate": 4e-07, |
|
"loss": 3.0081, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.41025641025641024, |
|
"eval_loss": 3.0378756523132324, |
|
"eval_runtime": 0.1873, |
|
"eval_samples_per_second": 176.167, |
|
"eval_steps_per_second": 26.692, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 31.252574920654297, |
|
"learning_rate": 6e-07, |
|
"loss": 2.9024, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"eval_loss": 3.0356383323669434, |
|
"eval_runtime": 0.1862, |
|
"eval_samples_per_second": 177.272, |
|
"eval_steps_per_second": 26.859, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.8205128205128205, |
|
"grad_norm": 26.509944915771484, |
|
"learning_rate": 8e-07, |
|
"loss": 2.9814, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.8205128205128205, |
|
"eval_loss": 3.0280070304870605, |
|
"eval_runtime": 0.1851, |
|
"eval_samples_per_second": 178.263, |
|
"eval_steps_per_second": 27.01, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"grad_norm": 26.16226577758789, |
|
"learning_rate": 1e-06, |
|
"loss": 2.9813, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"eval_loss": 3.0136334896087646, |
|
"eval_runtime": 0.1863, |
|
"eval_samples_per_second": 177.152, |
|
"eval_steps_per_second": 26.841, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 1.1794871794871795, |
|
"grad_norm": 27.74286651611328, |
|
"learning_rate": 1.2e-06, |
|
"loss": 2.9137, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 1.1794871794871795, |
|
"eval_loss": 2.991811990737915, |
|
"eval_runtime": 0.1876, |
|
"eval_samples_per_second": 175.952, |
|
"eval_steps_per_second": 26.659, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"grad_norm": 24.41265106201172, |
|
"learning_rate": 1.4e-06, |
|
"loss": 2.9909, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"eval_loss": 2.942638635635376, |
|
"eval_runtime": 0.1874, |
|
"eval_samples_per_second": 176.073, |
|
"eval_steps_per_second": 26.678, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 1.5897435897435899, |
|
"grad_norm": 24.35742950439453, |
|
"learning_rate": 1.6e-06, |
|
"loss": 2.8925, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 1.5897435897435899, |
|
"eval_loss": 2.9046568870544434, |
|
"eval_runtime": 0.1871, |
|
"eval_samples_per_second": 176.344, |
|
"eval_steps_per_second": 26.719, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 1.7948717948717947, |
|
"grad_norm": 18.749122619628906, |
|
"learning_rate": 1.8e-06, |
|
"loss": 2.825, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 1.7948717948717947, |
|
"eval_loss": 2.8789775371551514, |
|
"eval_runtime": 0.1878, |
|
"eval_samples_per_second": 175.683, |
|
"eval_steps_per_second": 26.619, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 18.693262100219727, |
|
"learning_rate": 2e-06, |
|
"loss": 2.8329, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2.794933319091797, |
|
"eval_runtime": 0.1863, |
|
"eval_samples_per_second": 177.152, |
|
"eval_steps_per_second": 26.841, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"grad_norm": 14.351330757141113, |
|
"learning_rate": 1.994521895368273e-06, |
|
"loss": 2.6496, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"eval_loss": 2.763171911239624, |
|
"eval_runtime": 0.1853, |
|
"eval_samples_per_second": 178.074, |
|
"eval_steps_per_second": 26.981, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 2.358974358974359, |
|
"grad_norm": 14.6412992477417, |
|
"learning_rate": 1.9781476007338054e-06, |
|
"loss": 2.6857, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 2.358974358974359, |
|
"eval_loss": 2.7388267517089844, |
|
"eval_runtime": 0.187, |
|
"eval_samples_per_second": 176.511, |
|
"eval_steps_per_second": 26.744, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 2.564102564102564, |
|
"grad_norm": 16.191911697387695, |
|
"learning_rate": 1.9510565162951534e-06, |
|
"loss": 2.679, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 2.564102564102564, |
|
"eval_loss": 2.719318389892578, |
|
"eval_runtime": 0.1879, |
|
"eval_samples_per_second": 175.654, |
|
"eval_steps_per_second": 26.614, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"grad_norm": 14.66492748260498, |
|
"learning_rate": 1.9135454576426007e-06, |
|
"loss": 2.6802, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"eval_loss": 2.674811840057373, |
|
"eval_runtime": 0.1872, |
|
"eval_samples_per_second": 176.31, |
|
"eval_steps_per_second": 26.714, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 2.9743589743589745, |
|
"grad_norm": 12.284189224243164, |
|
"learning_rate": 1.8660254037844386e-06, |
|
"loss": 2.6269, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 2.9743589743589745, |
|
"eval_loss": 2.6451773643493652, |
|
"eval_runtime": 0.1889, |
|
"eval_samples_per_second": 174.656, |
|
"eval_steps_per_second": 26.463, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 3.128205128205128, |
|
"grad_norm": 12.785527229309082, |
|
"learning_rate": 1.8090169943749474e-06, |
|
"loss": 2.5546, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 3.128205128205128, |
|
"eval_loss": 2.628568410873413, |
|
"eval_runtime": 0.1861, |
|
"eval_samples_per_second": 177.366, |
|
"eval_steps_per_second": 26.874, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 12.565117835998535, |
|
"learning_rate": 1.743144825477394e-06, |
|
"loss": 2.574, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"eval_loss": 2.616790294647217, |
|
"eval_runtime": 0.1874, |
|
"eval_samples_per_second": 176.09, |
|
"eval_steps_per_second": 26.68, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 3.5384615384615383, |
|
"grad_norm": 12.94242000579834, |
|
"learning_rate": 1.669130606358858e-06, |
|
"loss": 2.5548, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 3.5384615384615383, |
|
"eval_loss": 2.6054270267486572, |
|
"eval_runtime": 0.1852, |
|
"eval_samples_per_second": 178.208, |
|
"eval_steps_per_second": 27.001, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 3.7435897435897436, |
|
"grad_norm": 11.304039001464844, |
|
"learning_rate": 1.587785252292473e-06, |
|
"loss": 2.5145, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 3.7435897435897436, |
|
"eval_loss": 2.595207691192627, |
|
"eval_runtime": 0.1861, |
|
"eval_samples_per_second": 177.318, |
|
"eval_steps_per_second": 26.866, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 3.948717948717949, |
|
"grad_norm": 11.087238311767578, |
|
"learning_rate": 1.5e-06, |
|
"loss": 2.452, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 3.948717948717949, |
|
"eval_loss": 2.5863306522369385, |
|
"eval_runtime": 0.1862, |
|
"eval_samples_per_second": 177.233, |
|
"eval_steps_per_second": 26.854, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 4.102564102564102, |
|
"grad_norm": 10.603784561157227, |
|
"learning_rate": 1.4067366430758004e-06, |
|
"loss": 2.4647, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 4.102564102564102, |
|
"eval_loss": 2.5786077976226807, |
|
"eval_runtime": 0.1867, |
|
"eval_samples_per_second": 176.801, |
|
"eval_steps_per_second": 26.788, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 4.3076923076923075, |
|
"grad_norm": 10.523798942565918, |
|
"learning_rate": 1.3090169943749473e-06, |
|
"loss": 2.423, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 4.3076923076923075, |
|
"eval_loss": 2.5714633464813232, |
|
"eval_runtime": 0.1854, |
|
"eval_samples_per_second": 178.035, |
|
"eval_steps_per_second": 26.975, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 4.512820512820513, |
|
"grad_norm": 9.499349594116211, |
|
"learning_rate": 1.207911690817759e-06, |
|
"loss": 2.4104, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 4.512820512820513, |
|
"eval_loss": 2.5648255348205566, |
|
"eval_runtime": 0.1877, |
|
"eval_samples_per_second": 175.778, |
|
"eval_steps_per_second": 26.633, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 4.717948717948718, |
|
"grad_norm": 9.946209907531738, |
|
"learning_rate": 1.1045284632676535e-06, |
|
"loss": 2.3664, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 4.717948717948718, |
|
"eval_loss": 2.5592212677001953, |
|
"eval_runtime": 0.1861, |
|
"eval_samples_per_second": 177.323, |
|
"eval_steps_per_second": 26.867, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 4.923076923076923, |
|
"grad_norm": 9.741501808166504, |
|
"learning_rate": 1e-06, |
|
"loss": 2.4211, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 4.923076923076923, |
|
"eval_loss": 2.5535762310028076, |
|
"eval_runtime": 0.1872, |
|
"eval_samples_per_second": 176.274, |
|
"eval_steps_per_second": 26.708, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 5.076923076923077, |
|
"grad_norm": 10.652682304382324, |
|
"learning_rate": 8.954715367323466e-07, |
|
"loss": 2.4291, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 5.076923076923077, |
|
"eval_loss": 2.549236536026001, |
|
"eval_runtime": 0.1887, |
|
"eval_samples_per_second": 174.886, |
|
"eval_steps_per_second": 26.498, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 5.282051282051282, |
|
"grad_norm": 9.138431549072266, |
|
"learning_rate": 7.920883091822408e-07, |
|
"loss": 2.3475, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 5.282051282051282, |
|
"eval_loss": 2.5455117225646973, |
|
"eval_runtime": 0.1869, |
|
"eval_samples_per_second": 176.553, |
|
"eval_steps_per_second": 26.75, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 5.487179487179487, |
|
"grad_norm": 9.32693099975586, |
|
"learning_rate": 6.909830056250526e-07, |
|
"loss": 2.3665, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 5.487179487179487, |
|
"eval_loss": 2.541745901107788, |
|
"eval_runtime": 0.1868, |
|
"eval_samples_per_second": 176.668, |
|
"eval_steps_per_second": 26.768, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 5.6923076923076925, |
|
"grad_norm": 9.5020751953125, |
|
"learning_rate": 5.932633569241999e-07, |
|
"loss": 2.3862, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 5.6923076923076925, |
|
"eval_loss": 2.5386736392974854, |
|
"eval_runtime": 0.187, |
|
"eval_samples_per_second": 176.452, |
|
"eval_steps_per_second": 26.735, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 5.897435897435898, |
|
"grad_norm": 10.226723670959473, |
|
"learning_rate": 5.000000000000002e-07, |
|
"loss": 2.3784, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 5.897435897435898, |
|
"eval_loss": 2.5360124111175537, |
|
"eval_runtime": 0.186, |
|
"eval_samples_per_second": 177.425, |
|
"eval_steps_per_second": 26.883, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 6.051282051282051, |
|
"grad_norm": 10.043070793151855, |
|
"learning_rate": 4.1221474770752696e-07, |
|
"loss": 2.354, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 6.051282051282051, |
|
"eval_loss": 2.5342884063720703, |
|
"eval_runtime": 0.1871, |
|
"eval_samples_per_second": 176.386, |
|
"eval_steps_per_second": 26.725, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 6.256410256410256, |
|
"grad_norm": 9.647918701171875, |
|
"learning_rate": 3.308693936411421e-07, |
|
"loss": 2.3442, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 6.256410256410256, |
|
"eval_loss": 2.532135248184204, |
|
"eval_runtime": 0.1858, |
|
"eval_samples_per_second": 177.644, |
|
"eval_steps_per_second": 26.916, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 6.461538461538462, |
|
"grad_norm": 9.200613975524902, |
|
"learning_rate": 2.568551745226056e-07, |
|
"loss": 2.3499, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 6.461538461538462, |
|
"eval_loss": 2.5312461853027344, |
|
"eval_runtime": 0.1865, |
|
"eval_samples_per_second": 176.909, |
|
"eval_steps_per_second": 26.804, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 9.6244535446167, |
|
"learning_rate": 1.9098300562505264e-07, |
|
"loss": 2.3312, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"eval_loss": 2.5296669006347656, |
|
"eval_runtime": 0.1862, |
|
"eval_samples_per_second": 177.266, |
|
"eval_steps_per_second": 26.859, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 6.871794871794872, |
|
"grad_norm": 9.38110065460205, |
|
"learning_rate": 1.3397459621556128e-07, |
|
"loss": 2.3551, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 6.871794871794872, |
|
"eval_loss": 2.5289077758789062, |
|
"eval_runtime": 0.1858, |
|
"eval_samples_per_second": 177.582, |
|
"eval_steps_per_second": 26.906, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 7.0256410256410255, |
|
"grad_norm": 9.125926971435547, |
|
"learning_rate": 8.645454235739902e-08, |
|
"loss": 2.3363, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 7.0256410256410255, |
|
"eval_loss": 2.5289089679718018, |
|
"eval_runtime": 0.1865, |
|
"eval_samples_per_second": 176.913, |
|
"eval_steps_per_second": 26.805, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 7.230769230769231, |
|
"grad_norm": 9.84389591217041, |
|
"learning_rate": 4.8943483704846465e-08, |
|
"loss": 2.3691, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 7.230769230769231, |
|
"eval_loss": 2.5284206867218018, |
|
"eval_runtime": 0.188, |
|
"eval_samples_per_second": 175.495, |
|
"eval_steps_per_second": 26.59, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 7.435897435897436, |
|
"grad_norm": 9.293142318725586, |
|
"learning_rate": 2.185239926619431e-08, |
|
"loss": 2.3267, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 7.435897435897436, |
|
"eval_loss": 2.528106689453125, |
|
"eval_runtime": 0.1858, |
|
"eval_samples_per_second": 177.588, |
|
"eval_steps_per_second": 26.907, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 7.641025641025641, |
|
"grad_norm": 9.073277473449707, |
|
"learning_rate": 5.47810463172671e-09, |
|
"loss": 2.3389, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 7.641025641025641, |
|
"eval_loss": 2.528116464614868, |
|
"eval_runtime": 0.186, |
|
"eval_samples_per_second": 177.392, |
|
"eval_steps_per_second": 26.878, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 7.846153846153846, |
|
"grad_norm": 9.473708152770996, |
|
"learning_rate": 0.0, |
|
"loss": 2.1969, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 7.846153846153846, |
|
"eval_loss": 2.5279667377471924, |
|
"eval_runtime": 0.187, |
|
"eval_samples_per_second": 176.514, |
|
"eval_steps_per_second": 26.745, |
|
"step": 40 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 40, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 4, |
|
"total_flos": 1.0427550308237312e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|