|
{ |
|
"best_metric": 0.5798903703689575, |
|
"best_model_checkpoint": "models/checkpoints/checkpoint-16000", |
|
"epoch": 14.47952951820855, |
|
"global_step": 16000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 1.9879336349924587e-05, |
|
"loss": 1.3189, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 1.9758672699849173e-05, |
|
"loss": 1.1007, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 1.9638009049773755e-05, |
|
"loss": 1.0933, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 1.951734539969834e-05, |
|
"loss": 1.0187, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 1.9396681749622927e-05, |
|
"loss": 1.018, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_loss": 0.9501454830169678, |
|
"eval_runtime": 52.6762, |
|
"eval_samples_per_second": 149.194, |
|
"eval_steps_per_second": 9.34, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 1.9276018099547512e-05, |
|
"loss": 1.0293, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 1.9155354449472098e-05, |
|
"loss": 0.9875, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 1.9034690799396684e-05, |
|
"loss": 0.9752, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 1.891402714932127e-05, |
|
"loss": 0.979, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 1.8793363499245855e-05, |
|
"loss": 0.9513, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_loss": 0.8483895063400269, |
|
"eval_runtime": 32.7078, |
|
"eval_samples_per_second": 240.279, |
|
"eval_steps_per_second": 15.042, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 1.867269984917044e-05, |
|
"loss": 0.9691, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"learning_rate": 1.8552036199095026e-05, |
|
"loss": 0.8995, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"learning_rate": 1.843137254901961e-05, |
|
"loss": 0.9392, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"learning_rate": 1.8310708898944194e-05, |
|
"loss": 0.9156, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"learning_rate": 1.819004524886878e-05, |
|
"loss": 0.9142, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"eval_loss": 0.8270823359489441, |
|
"eval_runtime": 30.0805, |
|
"eval_samples_per_second": 261.266, |
|
"eval_steps_per_second": 16.356, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"learning_rate": 1.8069381598793365e-05, |
|
"loss": 0.8579, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"learning_rate": 1.794871794871795e-05, |
|
"loss": 0.8649, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"learning_rate": 1.7828054298642537e-05, |
|
"loss": 0.8935, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"learning_rate": 1.770739064856712e-05, |
|
"loss": 0.8534, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"learning_rate": 1.7586726998491705e-05, |
|
"loss": 0.8427, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"eval_loss": 0.8167164921760559, |
|
"eval_runtime": 29.67, |
|
"eval_samples_per_second": 264.88, |
|
"eval_steps_per_second": 16.582, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"learning_rate": 1.746606334841629e-05, |
|
"loss": 0.8634, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"learning_rate": 1.7345399698340876e-05, |
|
"loss": 0.8735, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"learning_rate": 1.7224736048265462e-05, |
|
"loss": 0.8401, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"learning_rate": 1.7104072398190047e-05, |
|
"loss": 0.8403, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"learning_rate": 1.6983408748114633e-05, |
|
"loss": 0.8212, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"eval_loss": 0.8272273540496826, |
|
"eval_runtime": 30.3829, |
|
"eval_samples_per_second": 258.665, |
|
"eval_steps_per_second": 16.193, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"learning_rate": 1.686274509803922e-05, |
|
"loss": 0.8301, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"learning_rate": 1.6742081447963804e-05, |
|
"loss": 0.8396, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"learning_rate": 1.662141779788839e-05, |
|
"loss": 0.8067, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"learning_rate": 1.6500754147812972e-05, |
|
"loss": 0.8231, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"learning_rate": 1.6380090497737558e-05, |
|
"loss": 0.8125, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"eval_loss": 0.7901880741119385, |
|
"eval_runtime": 30.2297, |
|
"eval_samples_per_second": 259.976, |
|
"eval_steps_per_second": 16.275, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"learning_rate": 1.6259426847662144e-05, |
|
"loss": 0.808, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"learning_rate": 1.613876319758673e-05, |
|
"loss": 0.8359, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"learning_rate": 1.6018099547511315e-05, |
|
"loss": 0.8017, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"learning_rate": 1.5897435897435897e-05, |
|
"loss": 0.8226, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"learning_rate": 1.5776772247360483e-05, |
|
"loss": 0.8121, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"eval_loss": 0.7621824741363525, |
|
"eval_runtime": 30.3432, |
|
"eval_samples_per_second": 259.004, |
|
"eval_steps_per_second": 16.215, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"learning_rate": 1.565610859728507e-05, |
|
"loss": 0.7861, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"learning_rate": 1.5535444947209654e-05, |
|
"loss": 0.7634, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"learning_rate": 1.541478129713424e-05, |
|
"loss": 0.7941, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"learning_rate": 1.5294117647058822e-05, |
|
"loss": 0.7628, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"learning_rate": 1.517345399698341e-05, |
|
"loss": 0.7572, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"eval_loss": 0.7411866188049316, |
|
"eval_runtime": 29.8313, |
|
"eval_samples_per_second": 263.448, |
|
"eval_steps_per_second": 16.493, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"learning_rate": 1.5052790346907995e-05, |
|
"loss": 0.7975, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"learning_rate": 1.4932126696832581e-05, |
|
"loss": 0.7626, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"learning_rate": 1.4811463046757167e-05, |
|
"loss": 0.7467, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"learning_rate": 1.4690799396681752e-05, |
|
"loss": 0.7689, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"learning_rate": 1.4570135746606336e-05, |
|
"loss": 0.7392, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"eval_loss": 0.7343409061431885, |
|
"eval_runtime": 30.138, |
|
"eval_samples_per_second": 260.767, |
|
"eval_steps_per_second": 16.325, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"learning_rate": 1.444947209653092e-05, |
|
"loss": 0.7636, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"learning_rate": 1.4328808446455506e-05, |
|
"loss": 0.768, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"learning_rate": 1.4208144796380091e-05, |
|
"loss": 0.7234, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 4.43, |
|
"learning_rate": 1.4087481146304677e-05, |
|
"loss": 0.7868, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"learning_rate": 1.3966817496229261e-05, |
|
"loss": 0.7538, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"eval_loss": 0.7230245471000671, |
|
"eval_runtime": 29.8163, |
|
"eval_samples_per_second": 263.581, |
|
"eval_steps_per_second": 16.501, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"learning_rate": 1.3846153846153847e-05, |
|
"loss": 0.7261, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"learning_rate": 1.3725490196078432e-05, |
|
"loss": 0.7462, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"learning_rate": 1.3604826546003018e-05, |
|
"loss": 0.7169, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"learning_rate": 1.3484162895927604e-05, |
|
"loss": 0.7542, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"learning_rate": 1.3363499245852188e-05, |
|
"loss": 0.71, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"eval_loss": 0.7268965840339661, |
|
"eval_runtime": 29.6638, |
|
"eval_samples_per_second": 264.936, |
|
"eval_steps_per_second": 16.586, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 5.07, |
|
"learning_rate": 1.3242835595776773e-05, |
|
"loss": 0.7217, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"learning_rate": 1.3122171945701359e-05, |
|
"loss": 0.7226, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"learning_rate": 1.3001508295625945e-05, |
|
"loss": 0.7177, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 5.34, |
|
"learning_rate": 1.288084464555053e-05, |
|
"loss": 0.7038, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"learning_rate": 1.2760180995475113e-05, |
|
"loss": 0.6966, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"eval_loss": 0.7119535803794861, |
|
"eval_runtime": 29.7811, |
|
"eval_samples_per_second": 263.892, |
|
"eval_steps_per_second": 16.521, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 5.52, |
|
"learning_rate": 1.2639517345399698e-05, |
|
"loss": 0.7167, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 5.61, |
|
"learning_rate": 1.2518853695324284e-05, |
|
"loss": 0.6907, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"learning_rate": 1.239819004524887e-05, |
|
"loss": 0.6992, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 5.79, |
|
"learning_rate": 1.2277526395173455e-05, |
|
"loss": 0.7038, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"learning_rate": 1.215686274509804e-05, |
|
"loss": 0.6995, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"eval_loss": 0.7289978861808777, |
|
"eval_runtime": 30.2132, |
|
"eval_samples_per_second": 260.118, |
|
"eval_steps_per_second": 16.284, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"learning_rate": 1.2036199095022625e-05, |
|
"loss": 0.6841, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 6.06, |
|
"learning_rate": 1.191553544494721e-05, |
|
"loss": 0.6827, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 6.15, |
|
"learning_rate": 1.1794871794871796e-05, |
|
"loss": 0.6664, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"learning_rate": 1.1674208144796382e-05, |
|
"loss": 0.6911, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 6.33, |
|
"learning_rate": 1.1553544494720966e-05, |
|
"loss": 0.6719, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 6.33, |
|
"eval_loss": 0.7228043675422668, |
|
"eval_runtime": 31.0054, |
|
"eval_samples_per_second": 253.472, |
|
"eval_steps_per_second": 15.868, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 6.43, |
|
"learning_rate": 1.1432880844645552e-05, |
|
"loss": 0.6785, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 6.52, |
|
"learning_rate": 1.1312217194570137e-05, |
|
"loss": 0.6908, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 6.61, |
|
"learning_rate": 1.1191553544494723e-05, |
|
"loss": 0.6775, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 6.7, |
|
"learning_rate": 1.1070889894419309e-05, |
|
"loss": 0.6566, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 6.79, |
|
"learning_rate": 1.0950226244343893e-05, |
|
"loss": 0.6862, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 6.79, |
|
"eval_loss": 0.6712033152580261, |
|
"eval_runtime": 29.8128, |
|
"eval_samples_per_second": 263.612, |
|
"eval_steps_per_second": 16.503, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"learning_rate": 1.0829562594268476e-05, |
|
"loss": 0.6965, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 6.97, |
|
"learning_rate": 1.0708898944193062e-05, |
|
"loss": 0.6836, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 7.06, |
|
"learning_rate": 1.0588235294117648e-05, |
|
"loss": 0.6684, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 7.15, |
|
"learning_rate": 1.0467571644042233e-05, |
|
"loss": 0.6767, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 7.24, |
|
"learning_rate": 1.0346907993966819e-05, |
|
"loss": 0.6775, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 7.24, |
|
"eval_loss": 0.6799022555351257, |
|
"eval_runtime": 30.3013, |
|
"eval_samples_per_second": 259.362, |
|
"eval_steps_per_second": 16.237, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 7.33, |
|
"learning_rate": 1.0226244343891403e-05, |
|
"loss": 0.6371, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"learning_rate": 1.0105580693815989e-05, |
|
"loss": 0.6623, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 7.51, |
|
"learning_rate": 9.984917043740574e-06, |
|
"loss": 0.6499, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"learning_rate": 9.86425339366516e-06, |
|
"loss": 0.6457, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"learning_rate": 9.743589743589744e-06, |
|
"loss": 0.6283, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"eval_loss": 0.6853426098823547, |
|
"eval_runtime": 29.9239, |
|
"eval_samples_per_second": 262.633, |
|
"eval_steps_per_second": 16.442, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 7.78, |
|
"learning_rate": 9.62292609351433e-06, |
|
"loss": 0.6151, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 7.87, |
|
"learning_rate": 9.502262443438914e-06, |
|
"loss": 0.6431, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"learning_rate": 9.3815987933635e-06, |
|
"loss": 0.6293, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 8.05, |
|
"learning_rate": 9.260935143288085e-06, |
|
"loss": 0.6266, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 8.14, |
|
"learning_rate": 9.14027149321267e-06, |
|
"loss": 0.6028, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 8.14, |
|
"eval_loss": 0.6904362440109253, |
|
"eval_runtime": 31.3612, |
|
"eval_samples_per_second": 250.596, |
|
"eval_steps_per_second": 15.688, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 8.24, |
|
"learning_rate": 9.019607843137256e-06, |
|
"loss": 0.6494, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 8.33, |
|
"learning_rate": 8.89894419306184e-06, |
|
"loss": 0.6014, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 8.42, |
|
"learning_rate": 8.778280542986426e-06, |
|
"loss": 0.6208, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 8.51, |
|
"learning_rate": 8.657616892911012e-06, |
|
"loss": 0.6461, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"learning_rate": 8.536953242835596e-06, |
|
"loss": 0.6291, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"eval_loss": 0.6408899426460266, |
|
"eval_runtime": 31.2717, |
|
"eval_samples_per_second": 251.314, |
|
"eval_steps_per_second": 15.733, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 8.69, |
|
"learning_rate": 8.416289592760181e-06, |
|
"loss": 0.6056, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 8.78, |
|
"learning_rate": 8.295625942684767e-06, |
|
"loss": 0.6052, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 8.87, |
|
"learning_rate": 8.174962292609353e-06, |
|
"loss": 0.6128, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 8.96, |
|
"learning_rate": 8.054298642533938e-06, |
|
"loss": 0.5895, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 9.05, |
|
"learning_rate": 7.933634992458522e-06, |
|
"loss": 0.6304, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 9.05, |
|
"eval_loss": 0.6174328327178955, |
|
"eval_runtime": 30.7682, |
|
"eval_samples_per_second": 255.426, |
|
"eval_steps_per_second": 15.991, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 9.14, |
|
"learning_rate": 7.812971342383108e-06, |
|
"loss": 0.5882, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 9.23, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 0.6304, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 9.32, |
|
"learning_rate": 7.5716440422322776e-06, |
|
"loss": 0.6103, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 9.41, |
|
"learning_rate": 7.450980392156863e-06, |
|
"loss": 0.5952, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"learning_rate": 7.330316742081448e-06, |
|
"loss": 0.6039, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"eval_loss": 0.6438130140304565, |
|
"eval_runtime": 31.9863, |
|
"eval_samples_per_second": 245.699, |
|
"eval_steps_per_second": 15.382, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 9.59, |
|
"learning_rate": 7.209653092006034e-06, |
|
"loss": 0.6079, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"learning_rate": 7.088989441930619e-06, |
|
"loss": 0.6093, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 9.77, |
|
"learning_rate": 6.968325791855204e-06, |
|
"loss": 0.5787, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 9.86, |
|
"learning_rate": 6.84766214177979e-06, |
|
"loss": 0.6138, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"learning_rate": 6.7269984917043755e-06, |
|
"loss": 0.6011, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"eval_loss": 0.6626730561256409, |
|
"eval_runtime": 30.1966, |
|
"eval_samples_per_second": 260.261, |
|
"eval_steps_per_second": 16.293, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 10.05, |
|
"learning_rate": 6.6063348416289595e-06, |
|
"loss": 0.6087, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 10.14, |
|
"learning_rate": 6.485671191553545e-06, |
|
"loss": 0.5929, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 10.23, |
|
"learning_rate": 6.36500754147813e-06, |
|
"loss": 0.5849, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 10.32, |
|
"learning_rate": 6.244343891402716e-06, |
|
"loss": 0.6042, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 10.41, |
|
"learning_rate": 6.123680241327301e-06, |
|
"loss": 0.637, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 10.41, |
|
"eval_loss": 0.6357372403144836, |
|
"eval_runtime": 29.8866, |
|
"eval_samples_per_second": 262.961, |
|
"eval_steps_per_second": 16.462, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 10.5, |
|
"learning_rate": 6.003016591251885e-06, |
|
"loss": 0.5783, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 10.59, |
|
"learning_rate": 5.882352941176471e-06, |
|
"loss": 0.5739, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 10.68, |
|
"learning_rate": 5.761689291101056e-06, |
|
"loss": 0.5617, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 10.77, |
|
"learning_rate": 5.641025641025641e-06, |
|
"loss": 0.5724, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 10.86, |
|
"learning_rate": 5.520361990950227e-06, |
|
"loss": 0.5635, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 10.86, |
|
"eval_loss": 0.6293562054634094, |
|
"eval_runtime": 42.9677, |
|
"eval_samples_per_second": 182.905, |
|
"eval_steps_per_second": 11.45, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 10.95, |
|
"learning_rate": 5.399698340874812e-06, |
|
"loss": 0.6074, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 11.04, |
|
"learning_rate": 5.2790346907993975e-06, |
|
"loss": 0.5755, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 11.13, |
|
"learning_rate": 5.158371040723983e-06, |
|
"loss": 0.5653, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 11.22, |
|
"learning_rate": 5.037707390648567e-06, |
|
"loss": 0.5685, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 11.31, |
|
"learning_rate": 4.917043740573153e-06, |
|
"loss": 0.596, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 11.31, |
|
"eval_loss": 0.619467556476593, |
|
"eval_runtime": 30.8886, |
|
"eval_samples_per_second": 254.431, |
|
"eval_steps_per_second": 15.928, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"learning_rate": 4.7963800904977385e-06, |
|
"loss": 0.5765, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 11.49, |
|
"learning_rate": 4.675716440422323e-06, |
|
"loss": 0.563, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 11.58, |
|
"learning_rate": 4.555052790346908e-06, |
|
"loss": 0.5871, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 11.67, |
|
"learning_rate": 4.434389140271493e-06, |
|
"loss": 0.5345, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 11.76, |
|
"learning_rate": 4.313725490196079e-06, |
|
"loss": 0.5647, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 11.76, |
|
"eval_loss": 0.6173378825187683, |
|
"eval_runtime": 30.5561, |
|
"eval_samples_per_second": 257.199, |
|
"eval_steps_per_second": 16.102, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 11.86, |
|
"learning_rate": 4.193061840120664e-06, |
|
"loss": 0.5798, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 11.95, |
|
"learning_rate": 4.072398190045249e-06, |
|
"loss": 0.5553, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 12.04, |
|
"learning_rate": 3.951734539969834e-06, |
|
"loss": 0.5372, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 12.13, |
|
"learning_rate": 3.83107088989442e-06, |
|
"loss": 0.5432, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 12.22, |
|
"learning_rate": 3.710407239819005e-06, |
|
"loss": 0.5304, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 12.22, |
|
"eval_loss": 0.6232503056526184, |
|
"eval_runtime": 30.2832, |
|
"eval_samples_per_second": 259.517, |
|
"eval_steps_per_second": 16.247, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 12.31, |
|
"learning_rate": 3.58974358974359e-06, |
|
"loss": 0.5423, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 12.4, |
|
"learning_rate": 3.4690799396681753e-06, |
|
"loss": 0.546, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 12.49, |
|
"learning_rate": 3.34841628959276e-06, |
|
"loss": 0.5932, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 12.58, |
|
"learning_rate": 3.2277526395173458e-06, |
|
"loss": 0.5721, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 12.67, |
|
"learning_rate": 3.107088989441931e-06, |
|
"loss": 0.5815, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 12.67, |
|
"eval_loss": 0.6265811324119568, |
|
"eval_runtime": 31.274, |
|
"eval_samples_per_second": 251.295, |
|
"eval_steps_per_second": 15.732, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 12.76, |
|
"learning_rate": 2.986425339366516e-06, |
|
"loss": 0.5646, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 12.85, |
|
"learning_rate": 2.865761689291101e-06, |
|
"loss": 0.5706, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 12.94, |
|
"learning_rate": 2.7450980392156867e-06, |
|
"loss": 0.5561, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 13.03, |
|
"learning_rate": 2.624434389140272e-06, |
|
"loss": 0.5668, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 13.12, |
|
"learning_rate": 2.503770739064857e-06, |
|
"loss": 0.5555, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 13.12, |
|
"eval_loss": 0.6143834590911865, |
|
"eval_runtime": 30.6444, |
|
"eval_samples_per_second": 256.458, |
|
"eval_steps_per_second": 16.055, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 13.21, |
|
"learning_rate": 2.3831070889894425e-06, |
|
"loss": 0.5327, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 13.3, |
|
"learning_rate": 2.2624434389140273e-06, |
|
"loss": 0.5491, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 13.39, |
|
"learning_rate": 2.1417797888386125e-06, |
|
"loss": 0.5633, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 13.48, |
|
"learning_rate": 2.0211161387631978e-06, |
|
"loss": 0.5612, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 13.57, |
|
"learning_rate": 1.9004524886877828e-06, |
|
"loss": 0.5725, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 13.57, |
|
"eval_loss": 0.6175289750099182, |
|
"eval_runtime": 30.1245, |
|
"eval_samples_per_second": 260.884, |
|
"eval_steps_per_second": 16.332, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 13.67, |
|
"learning_rate": 1.7797888386123682e-06, |
|
"loss": 0.5502, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 13.76, |
|
"learning_rate": 1.6591251885369533e-06, |
|
"loss": 0.5529, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 13.85, |
|
"learning_rate": 1.5384615384615387e-06, |
|
"loss": 0.5351, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 13.94, |
|
"learning_rate": 1.4177978883861237e-06, |
|
"loss": 0.5563, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 14.03, |
|
"learning_rate": 1.2971342383107092e-06, |
|
"loss": 0.5616, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 14.03, |
|
"eval_loss": 0.6149749755859375, |
|
"eval_runtime": 31.0977, |
|
"eval_samples_per_second": 252.719, |
|
"eval_steps_per_second": 15.821, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 14.12, |
|
"learning_rate": 1.1764705882352942e-06, |
|
"loss": 0.5404, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 14.21, |
|
"learning_rate": 1.0558069381598795e-06, |
|
"loss": 0.5461, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 14.3, |
|
"learning_rate": 9.351432880844646e-07, |
|
"loss": 0.5449, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 14.39, |
|
"learning_rate": 8.144796380090498e-07, |
|
"loss": 0.5296, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 14.48, |
|
"learning_rate": 6.938159879336351e-07, |
|
"loss": 0.5471, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 14.48, |
|
"eval_loss": 0.5798903703689575, |
|
"eval_runtime": 29.2689, |
|
"eval_samples_per_second": 268.51, |
|
"eval_steps_per_second": 16.81, |
|
"step": 16000 |
|
} |
|
], |
|
"max_steps": 16575, |
|
"num_train_epochs": 15, |
|
"total_flos": 5.59327922919324e+16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|