|
{ |
|
"best_metric": 2.204540491104126, |
|
"best_model_checkpoint": "./output/training_results/C014_llama3-8b-base_pretrain_20240428_005832/checkpoint-130", |
|
"epoch": 4.0, |
|
"eval_steps": 5, |
|
"global_step": 264, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.015151515151515152, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 2.5789, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.015151515151515152, |
|
"eval_loss": 2.6458332538604736, |
|
"eval_runtime": 5.9609, |
|
"eval_samples_per_second": 78.008, |
|
"eval_steps_per_second": 0.671, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.07575757575757576, |
|
"grad_norm": 3.5510944022733923, |
|
"learning_rate": 2.25e-06, |
|
"loss": 2.5672, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.07575757575757576, |
|
"eval_loss": 2.628009080886841, |
|
"eval_runtime": 5.9819, |
|
"eval_samples_per_second": 77.735, |
|
"eval_steps_per_second": 0.669, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.15151515151515152, |
|
"grad_norm": 3.473712979202145, |
|
"learning_rate": 6e-06, |
|
"loss": 2.5751, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.15151515151515152, |
|
"eval_loss": 2.5313849449157715, |
|
"eval_runtime": 5.9441, |
|
"eval_samples_per_second": 78.228, |
|
"eval_steps_per_second": 0.673, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.22727272727272727, |
|
"grad_norm": 2.7296451314204835, |
|
"learning_rate": 9.75e-06, |
|
"loss": 2.418, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.22727272727272727, |
|
"eval_loss": 2.4634220600128174, |
|
"eval_runtime": 6.0122, |
|
"eval_samples_per_second": 77.343, |
|
"eval_steps_per_second": 0.665, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.30303030303030304, |
|
"grad_norm": 2.9793222204470453, |
|
"learning_rate": 1.3500000000000001e-05, |
|
"loss": 2.4701, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.30303030303030304, |
|
"eval_loss": 2.4176573753356934, |
|
"eval_runtime": 5.9735, |
|
"eval_samples_per_second": 77.844, |
|
"eval_steps_per_second": 0.67, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3787878787878788, |
|
"grad_norm": 2.9152379669367696, |
|
"learning_rate": 1.3097898548149108e-05, |
|
"loss": 2.3904, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.3787878787878788, |
|
"eval_loss": 2.3785245418548584, |
|
"eval_runtime": 5.9994, |
|
"eval_samples_per_second": 77.507, |
|
"eval_steps_per_second": 0.667, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 2.475241901636818, |
|
"learning_rate": 1.041060545673204e-05, |
|
"loss": 2.3539, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"eval_loss": 2.337780475616455, |
|
"eval_runtime": 5.9932, |
|
"eval_samples_per_second": 77.587, |
|
"eval_steps_per_second": 0.667, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5303030303030303, |
|
"grad_norm": 2.5312379436441272, |
|
"learning_rate": 8.236247706221891e-06, |
|
"loss": 2.3101, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.5303030303030303, |
|
"eval_loss": 2.3082308769226074, |
|
"eval_runtime": 5.9901, |
|
"eval_samples_per_second": 77.628, |
|
"eval_steps_per_second": 0.668, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.6060606060606061, |
|
"grad_norm": 2.5279244134684804, |
|
"learning_rate": 6.4849612135310325e-06, |
|
"loss": 2.3254, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6060606060606061, |
|
"eval_loss": 2.2816028594970703, |
|
"eval_runtime": 5.9798, |
|
"eval_samples_per_second": 77.762, |
|
"eval_steps_per_second": 0.669, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6818181818181818, |
|
"grad_norm": 2.3848893916874836, |
|
"learning_rate": 5.081159821297093e-06, |
|
"loss": 2.2762, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.6818181818181818, |
|
"eval_loss": 2.2614095211029053, |
|
"eval_runtime": 5.9833, |
|
"eval_samples_per_second": 77.716, |
|
"eval_steps_per_second": 0.669, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.7575757575757576, |
|
"grad_norm": 2.55130938777181, |
|
"learning_rate": 3.961509285889694e-06, |
|
"loss": 2.2525, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.7575757575757576, |
|
"eval_loss": 2.2457971572875977, |
|
"eval_runtime": 6.0002, |
|
"eval_samples_per_second": 77.497, |
|
"eval_steps_per_second": 0.667, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 2.2003836383976156, |
|
"learning_rate": 3.073152889221908e-06, |
|
"loss": 2.2777, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"eval_loss": 2.232052803039551, |
|
"eval_runtime": 5.9752, |
|
"eval_samples_per_second": 77.822, |
|
"eval_steps_per_second": 0.669, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 1.938340466235554, |
|
"learning_rate": 2.372162069694911e-06, |
|
"loss": 2.2054, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"eval_loss": 2.2206437587738037, |
|
"eval_runtime": 5.9984, |
|
"eval_samples_per_second": 77.52, |
|
"eval_steps_per_second": 0.667, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.9848484848484849, |
|
"grad_norm": 2.0152410097081375, |
|
"learning_rate": 1.8221877676625323e-06, |
|
"loss": 2.237, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.9848484848484849, |
|
"eval_loss": 2.2112882137298584, |
|
"eval_runtime": 6.0081, |
|
"eval_samples_per_second": 77.396, |
|
"eval_steps_per_second": 0.666, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.0606060606060606, |
|
"grad_norm": 2.206665008674913, |
|
"learning_rate": 1.3932903283558643e-06, |
|
"loss": 1.986, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.0606060606060606, |
|
"eval_loss": 2.2115273475646973, |
|
"eval_runtime": 6.0121, |
|
"eval_samples_per_second": 77.344, |
|
"eval_steps_per_second": 0.665, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.1363636363636362, |
|
"grad_norm": 2.4135250740210816, |
|
"learning_rate": 1.0609278071546894e-06, |
|
"loss": 1.9373, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.1363636363636362, |
|
"eval_loss": 2.221705913543701, |
|
"eval_runtime": 5.966, |
|
"eval_samples_per_second": 77.942, |
|
"eval_steps_per_second": 0.67, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.2121212121212122, |
|
"grad_norm": 2.0909203153994995, |
|
"learning_rate": 8.050843851687484e-07, |
|
"loss": 1.9228, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.2121212121212122, |
|
"eval_loss": 2.2131617069244385, |
|
"eval_runtime": 6.0048, |
|
"eval_samples_per_second": 77.439, |
|
"eval_steps_per_second": 0.666, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.2878787878787878, |
|
"grad_norm": 2.0701966451221723, |
|
"learning_rate": 6.095223338761627e-07, |
|
"loss": 1.9084, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.2878787878787878, |
|
"eval_loss": 2.2117583751678467, |
|
"eval_runtime": 6.0144, |
|
"eval_samples_per_second": 77.314, |
|
"eval_steps_per_second": 0.665, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"grad_norm": 2.0411250764985907, |
|
"learning_rate": 4.611425724763914e-07, |
|
"loss": 1.9684, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"eval_loss": 2.212195634841919, |
|
"eval_runtime": 6.005, |
|
"eval_samples_per_second": 77.436, |
|
"eval_steps_per_second": 0.666, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.4393939393939394, |
|
"grad_norm": 2.095523936778245, |
|
"learning_rate": 3.494403469094348e-07, |
|
"loss": 1.9126, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.4393939393939394, |
|
"eval_loss": 2.2093794345855713, |
|
"eval_runtime": 5.9753, |
|
"eval_samples_per_second": 77.82, |
|
"eval_steps_per_second": 0.669, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.5151515151515151, |
|
"grad_norm": 1.9491500689498569, |
|
"learning_rate": 2.660439312704735e-07, |
|
"loss": 1.9101, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.5151515151515151, |
|
"eval_loss": 2.2066152095794678, |
|
"eval_runtime": 5.988, |
|
"eval_samples_per_second": 77.656, |
|
"eval_steps_per_second": 0.668, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.5909090909090908, |
|
"grad_norm": 1.9297056004219728, |
|
"learning_rate": 2.0432551654866868e-07, |
|
"loss": 1.8496, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.5909090909090908, |
|
"eval_loss": 2.2057933807373047, |
|
"eval_runtime": 6.029, |
|
"eval_samples_per_second": 77.127, |
|
"eval_steps_per_second": 0.663, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 2.0270269069156095, |
|
"learning_rate": 1.590746140201269e-07, |
|
"loss": 1.9154, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"eval_loss": 2.205655097961426, |
|
"eval_runtime": 5.9865, |
|
"eval_samples_per_second": 77.675, |
|
"eval_steps_per_second": 0.668, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.7424242424242424, |
|
"grad_norm": 2.0216556200111957, |
|
"learning_rate": 1.2622536684767967e-07, |
|
"loss": 1.9233, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.7424242424242424, |
|
"eval_loss": 2.2055680751800537, |
|
"eval_runtime": 5.9969, |
|
"eval_samples_per_second": 77.54, |
|
"eval_steps_per_second": 0.667, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 2.121087680196771, |
|
"learning_rate": 1.0263013894441628e-07, |
|
"loss": 1.9198, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"eval_loss": 2.205195665359497, |
|
"eval_runtime": 5.9814, |
|
"eval_samples_per_second": 77.741, |
|
"eval_steps_per_second": 0.669, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.893939393939394, |
|
"grad_norm": 1.9887616789335223, |
|
"learning_rate": 8.587264024428055e-08, |
|
"loss": 1.9229, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.893939393939394, |
|
"eval_loss": 2.2048099040985107, |
|
"eval_runtime": 6.0178, |
|
"eval_samples_per_second": 77.271, |
|
"eval_steps_per_second": 0.665, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.9696969696969697, |
|
"grad_norm": 1.9737440679073275, |
|
"learning_rate": 7.411465733236604e-08, |
|
"loss": 1.8913, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.9696969696969697, |
|
"eval_loss": 2.204540491104126, |
|
"eval_runtime": 6.0056, |
|
"eval_samples_per_second": 77.428, |
|
"eval_steps_per_second": 0.666, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.0454545454545454, |
|
"grad_norm": 2.037256369037435, |
|
"learning_rate": 6.59711929010128e-08, |
|
"loss": 1.8814, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.0454545454545454, |
|
"eval_loss": 2.204589605331421, |
|
"eval_runtime": 6.0067, |
|
"eval_samples_per_second": 77.414, |
|
"eval_steps_per_second": 0.666, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.121212121212121, |
|
"grad_norm": 2.0713675635613353, |
|
"learning_rate": 6.040948153695873e-08, |
|
"loss": 1.8813, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.121212121212121, |
|
"eval_loss": 2.2050745487213135, |
|
"eval_runtime": 5.9801, |
|
"eval_samples_per_second": 77.758, |
|
"eval_steps_per_second": 0.669, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.196969696969697, |
|
"grad_norm": 1.9602573628685647, |
|
"learning_rate": 5.666794757151726e-08, |
|
"loss": 1.8912, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 2.196969696969697, |
|
"eval_loss": 2.2057695388793945, |
|
"eval_runtime": 6.0053, |
|
"eval_samples_per_second": 77.431, |
|
"eval_steps_per_second": 0.666, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 2.2727272727272725, |
|
"grad_norm": 2.0631081811496133, |
|
"learning_rate": 5.4191707642277796e-08, |
|
"loss": 1.9184, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.2727272727272725, |
|
"eval_loss": 2.2065114974975586, |
|
"eval_runtime": 5.9555, |
|
"eval_samples_per_second": 78.079, |
|
"eval_steps_per_second": 0.672, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.3484848484848486, |
|
"grad_norm": 2.0253318844317705, |
|
"learning_rate": 5.258170056372994e-08, |
|
"loss": 1.8662, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.3484848484848486, |
|
"eval_loss": 2.207070827484131, |
|
"eval_runtime": 6.0047, |
|
"eval_samples_per_second": 77.439, |
|
"eval_steps_per_second": 0.666, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.4242424242424243, |
|
"grad_norm": 2.073377995618171, |
|
"learning_rate": 5.1554954268425945e-08, |
|
"loss": 1.8809, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.4242424242424243, |
|
"eval_loss": 2.2073893547058105, |
|
"eval_runtime": 6.0042, |
|
"eval_samples_per_second": 77.446, |
|
"eval_steps_per_second": 0.666, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.9895760567683491, |
|
"learning_rate": 5.091387798309037e-08, |
|
"loss": 1.8591, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 2.2076644897460938, |
|
"eval_runtime": 5.9917, |
|
"eval_samples_per_second": 77.608, |
|
"eval_steps_per_second": 0.668, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.5757575757575757, |
|
"grad_norm": 2.0833747351029372, |
|
"learning_rate": 5.0522801309078135e-08, |
|
"loss": 1.8731, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.5757575757575757, |
|
"eval_loss": 2.2079408168792725, |
|
"eval_runtime": 6.0146, |
|
"eval_samples_per_second": 77.312, |
|
"eval_steps_per_second": 0.665, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.6515151515151514, |
|
"grad_norm": 2.0037327741523825, |
|
"learning_rate": 5.0290274187738543e-08, |
|
"loss": 1.8948, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 2.6515151515151514, |
|
"eval_loss": 2.208183765411377, |
|
"eval_runtime": 6.0305, |
|
"eval_samples_per_second": 77.108, |
|
"eval_steps_per_second": 0.663, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"grad_norm": 2.0305444519091917, |
|
"learning_rate": 5.015589639287439e-08, |
|
"loss": 1.8876, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"eval_loss": 2.208235740661621, |
|
"eval_runtime": 5.9955, |
|
"eval_samples_per_second": 77.558, |
|
"eval_steps_per_second": 0.667, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.8030303030303028, |
|
"grad_norm": 2.0062076076889266, |
|
"learning_rate": 5.0080665589248236e-08, |
|
"loss": 1.8408, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 2.8030303030303028, |
|
"eval_loss": 2.2083210945129395, |
|
"eval_runtime": 6.0004, |
|
"eval_samples_per_second": 77.495, |
|
"eval_steps_per_second": 0.667, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 2.878787878787879, |
|
"grad_norm": 2.043895270641295, |
|
"learning_rate": 5.004002235298783e-08, |
|
"loss": 1.8931, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.878787878787879, |
|
"eval_loss": 2.208212375640869, |
|
"eval_runtime": 6.0002, |
|
"eval_samples_per_second": 77.498, |
|
"eval_steps_per_second": 0.667, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.9545454545454546, |
|
"grad_norm": 1.9689184449811679, |
|
"learning_rate": 5.001893193212864e-08, |
|
"loss": 1.8569, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.9545454545454546, |
|
"eval_loss": 2.2080307006835938, |
|
"eval_runtime": 5.9791, |
|
"eval_samples_per_second": 77.772, |
|
"eval_steps_per_second": 0.669, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 3.0303030303030303, |
|
"grad_norm": 2.019410946950212, |
|
"learning_rate": 5.000847883910016e-08, |
|
"loss": 1.8621, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.0303030303030303, |
|
"eval_loss": 2.207866907119751, |
|
"eval_runtime": 6.0003, |
|
"eval_samples_per_second": 77.496, |
|
"eval_steps_per_second": 0.667, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.106060606060606, |
|
"grad_norm": 2.0283442915048644, |
|
"learning_rate": 5.000356435775757e-08, |
|
"loss": 1.8863, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 3.106060606060606, |
|
"eval_loss": 2.207792043685913, |
|
"eval_runtime": 5.9858, |
|
"eval_samples_per_second": 77.684, |
|
"eval_steps_per_second": 0.668, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 3.1818181818181817, |
|
"grad_norm": 2.05494798524839, |
|
"learning_rate": 5.0001391301969795e-08, |
|
"loss": 1.9021, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.1818181818181817, |
|
"eval_loss": 2.2078535556793213, |
|
"eval_runtime": 5.9911, |
|
"eval_samples_per_second": 77.615, |
|
"eval_steps_per_second": 0.668, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.257575757575758, |
|
"grad_norm": 2.0316955077847023, |
|
"learning_rate": 5.000049730753554e-08, |
|
"loss": 1.8648, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 3.257575757575758, |
|
"eval_loss": 2.2079594135284424, |
|
"eval_runtime": 5.9692, |
|
"eval_samples_per_second": 77.9, |
|
"eval_steps_per_second": 0.67, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 2.0376583248479174, |
|
"learning_rate": 5.0000159841391415e-08, |
|
"loss": 1.8443, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"eval_loss": 2.2080650329589844, |
|
"eval_runtime": 6.0237, |
|
"eval_samples_per_second": 77.195, |
|
"eval_steps_per_second": 0.664, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.409090909090909, |
|
"grad_norm": 2.0309390490282864, |
|
"learning_rate": 5.0000045079130105e-08, |
|
"loss": 1.8978, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 3.409090909090909, |
|
"eval_loss": 2.20800518989563, |
|
"eval_runtime": 6.0116, |
|
"eval_samples_per_second": 77.35, |
|
"eval_steps_per_second": 0.665, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 3.484848484848485, |
|
"grad_norm": 2.0870431548602495, |
|
"learning_rate": 5.000001078153535e-08, |
|
"loss": 1.8658, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.484848484848485, |
|
"eval_loss": 2.208038806915283, |
|
"eval_runtime": 5.99, |
|
"eval_samples_per_second": 77.629, |
|
"eval_steps_per_second": 0.668, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.5606060606060606, |
|
"grad_norm": 2.0234951173098024, |
|
"learning_rate": 5.0000002081285866e-08, |
|
"loss": 1.8706, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 3.5606060606060606, |
|
"eval_loss": 2.207921266555786, |
|
"eval_runtime": 5.9823, |
|
"eval_samples_per_second": 77.73, |
|
"eval_steps_per_second": 0.669, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 3.6363636363636362, |
|
"grad_norm": 2.039329423643573, |
|
"learning_rate": 5.0000000300649115e-08, |
|
"loss": 1.8855, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.6363636363636362, |
|
"eval_loss": 2.2077724933624268, |
|
"eval_runtime": 6.0052, |
|
"eval_samples_per_second": 77.433, |
|
"eval_steps_per_second": 0.666, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.712121212121212, |
|
"grad_norm": 2.048524032150131, |
|
"learning_rate": 5.00000000286923e-08, |
|
"loss": 1.8535, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 3.712121212121212, |
|
"eval_loss": 2.2078235149383545, |
|
"eval_runtime": 6.0225, |
|
"eval_samples_per_second": 77.21, |
|
"eval_steps_per_second": 0.664, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 3.787878787878788, |
|
"grad_norm": 2.117736519999513, |
|
"learning_rate": 5.0000000001441026e-08, |
|
"loss": 1.9062, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.787878787878788, |
|
"eval_loss": 2.207879066467285, |
|
"eval_runtime": 5.9714, |
|
"eval_samples_per_second": 77.871, |
|
"eval_steps_per_second": 0.67, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.8636363636363638, |
|
"grad_norm": 2.0315594950889775, |
|
"learning_rate": 5.000000000002337e-08, |
|
"loss": 1.8628, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 3.8636363636363638, |
|
"eval_loss": 2.2078306674957275, |
|
"eval_runtime": 6.0145, |
|
"eval_samples_per_second": 77.314, |
|
"eval_steps_per_second": 0.665, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 3.9393939393939394, |
|
"grad_norm": 2.0132729461699994, |
|
"learning_rate": 5.0000000000000024e-08, |
|
"loss": 1.8484, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.9393939393939394, |
|
"eval_loss": 2.207735776901245, |
|
"eval_runtime": 6.0067, |
|
"eval_samples_per_second": 77.413, |
|
"eval_steps_per_second": 0.666, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 264, |
|
"total_flos": 27428734894080.0, |
|
"train_loss": 2.005241002097274, |
|
"train_runtime": 8788.5675, |
|
"train_samples_per_second": 1.904, |
|
"train_steps_per_second": 0.03 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 264, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 5, |
|
"total_flos": 27428734894080.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|