|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.8, |
|
"eval_steps": 50, |
|
"global_step": 2000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 6.376349449157715, |
|
"learning_rate": 2.5e-06, |
|
"loss": 1.4789, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 1.0021060705184937, |
|
"eval_runtime": 2.0714, |
|
"eval_samples_per_second": 55.034, |
|
"eval_steps_per_second": 2.897, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.7532732486724854, |
|
"learning_rate": 5e-06, |
|
"loss": 1.125, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 0.8670538067817688, |
|
"eval_runtime": 2.0412, |
|
"eval_samples_per_second": 55.848, |
|
"eval_steps_per_second": 2.939, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.5737497806549072, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.9982, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 0.85986328125, |
|
"eval_runtime": 2.066, |
|
"eval_samples_per_second": 55.18, |
|
"eval_steps_per_second": 2.904, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.524716377258301, |
|
"learning_rate": 1e-05, |
|
"loss": 0.9746, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 0.8621886968612671, |
|
"eval_runtime": 2.0577, |
|
"eval_samples_per_second": 55.402, |
|
"eval_steps_per_second": 2.916, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.9173636436462402, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.9435, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 0.8596158623695374, |
|
"eval_runtime": 2.0562, |
|
"eval_samples_per_second": 55.441, |
|
"eval_steps_per_second": 2.918, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.21992564201355, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.9561, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 0.8649560809135437, |
|
"eval_runtime": 2.0472, |
|
"eval_samples_per_second": 55.687, |
|
"eval_steps_per_second": 2.931, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.842764139175415, |
|
"learning_rate": 1.75e-05, |
|
"loss": 0.9625, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_loss": 0.8620312213897705, |
|
"eval_runtime": 2.0407, |
|
"eval_samples_per_second": 55.863, |
|
"eval_steps_per_second": 2.94, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.064265012741089, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9561, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 0.8754067420959473, |
|
"eval_runtime": 2.0491, |
|
"eval_samples_per_second": 55.634, |
|
"eval_steps_per_second": 2.928, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.680624008178711, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.9811, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"eval_loss": 0.8749663829803467, |
|
"eval_runtime": 2.0344, |
|
"eval_samples_per_second": 56.036, |
|
"eval_steps_per_second": 2.949, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.928382396697998, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.9841, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.8785499930381775, |
|
"eval_runtime": 2.0435, |
|
"eval_samples_per_second": 55.786, |
|
"eval_steps_per_second": 2.936, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.388023853302002, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 1.0304, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"eval_loss": 0.8839182257652283, |
|
"eval_runtime": 2.0365, |
|
"eval_samples_per_second": 55.979, |
|
"eval_steps_per_second": 2.946, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.6457326412200928, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0091, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 0.8980669975280762, |
|
"eval_runtime": 2.0434, |
|
"eval_samples_per_second": 55.79, |
|
"eval_steps_per_second": 2.936, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 2.8867459297180176, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 1.0373, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_loss": 0.8973696231842041, |
|
"eval_runtime": 2.0404, |
|
"eval_samples_per_second": 55.871, |
|
"eval_steps_per_second": 2.941, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 2.924246311187744, |
|
"learning_rate": 3.5e-05, |
|
"loss": 1.042, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_loss": 0.9169337153434753, |
|
"eval_runtime": 2.0757, |
|
"eval_samples_per_second": 54.922, |
|
"eval_steps_per_second": 2.891, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.8138821125030518, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 1.0676, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 0.9358024001121521, |
|
"eval_runtime": 2.0481, |
|
"eval_samples_per_second": 55.661, |
|
"eval_steps_per_second": 2.93, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 2.544848918914795, |
|
"learning_rate": 4e-05, |
|
"loss": 1.1405, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_loss": 0.9551197290420532, |
|
"eval_runtime": 2.0608, |
|
"eval_samples_per_second": 55.319, |
|
"eval_steps_per_second": 2.912, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 3.607945203781128, |
|
"learning_rate": 4.25e-05, |
|
"loss": 1.1238, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"eval_loss": 0.9666525721549988, |
|
"eval_runtime": 2.0401, |
|
"eval_samples_per_second": 55.879, |
|
"eval_steps_per_second": 2.941, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 2.847774028778076, |
|
"learning_rate": 4.5e-05, |
|
"loss": 1.134, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 1.0043387413024902, |
|
"eval_runtime": 2.0654, |
|
"eval_samples_per_second": 55.196, |
|
"eval_steps_per_second": 2.905, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 3.035200595855713, |
|
"learning_rate": 4.75e-05, |
|
"loss": 1.1589, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"eval_loss": 1.0095112323760986, |
|
"eval_runtime": 2.103, |
|
"eval_samples_per_second": 54.208, |
|
"eval_steps_per_second": 2.853, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.9163053035736084, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1372, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 1.0114259719848633, |
|
"eval_runtime": 2.0471, |
|
"eval_samples_per_second": 55.688, |
|
"eval_steps_per_second": 2.931, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 2.7075846195220947, |
|
"learning_rate": 4.9996192378909786e-05, |
|
"loss": 1.2279, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_loss": 1.0414971113204956, |
|
"eval_runtime": 2.0438, |
|
"eval_samples_per_second": 55.778, |
|
"eval_steps_per_second": 2.936, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 3.5521507263183594, |
|
"learning_rate": 4.99847706754774e-05, |
|
"loss": 1.2282, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_loss": 1.0306421518325806, |
|
"eval_runtime": 2.0353, |
|
"eval_samples_per_second": 56.013, |
|
"eval_steps_per_second": 2.948, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 2.973623037338257, |
|
"learning_rate": 4.996573836886435e-05, |
|
"loss": 1.2439, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_loss": 1.0502970218658447, |
|
"eval_runtime": 2.0477, |
|
"eval_samples_per_second": 55.673, |
|
"eval_steps_per_second": 2.93, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 3.2443981170654297, |
|
"learning_rate": 4.993910125649561e-05, |
|
"loss": 1.2674, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 1.052767038345337, |
|
"eval_runtime": 2.0476, |
|
"eval_samples_per_second": 55.676, |
|
"eval_steps_per_second": 2.93, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 2.528109073638916, |
|
"learning_rate": 4.990486745229364e-05, |
|
"loss": 1.2429, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.0944527387619019, |
|
"eval_runtime": 2.0463, |
|
"eval_samples_per_second": 55.711, |
|
"eval_steps_per_second": 2.932, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 3.5652294158935547, |
|
"learning_rate": 4.9863047384206835e-05, |
|
"loss": 1.2405, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"eval_loss": 1.0788123607635498, |
|
"eval_runtime": 2.0584, |
|
"eval_samples_per_second": 55.383, |
|
"eval_steps_per_second": 2.915, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 3.1418027877807617, |
|
"learning_rate": 4.9813653791033057e-05, |
|
"loss": 1.2664, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"eval_loss": 1.077215552330017, |
|
"eval_runtime": 2.0417, |
|
"eval_samples_per_second": 55.836, |
|
"eval_steps_per_second": 2.939, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 3.247063159942627, |
|
"learning_rate": 4.975670171853926e-05, |
|
"loss": 1.2368, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 1.0988303422927856, |
|
"eval_runtime": 2.0525, |
|
"eval_samples_per_second": 55.543, |
|
"eval_steps_per_second": 2.923, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 2.791402816772461, |
|
"learning_rate": 4.9692208514878444e-05, |
|
"loss": 1.214, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"eval_loss": 1.093959093093872, |
|
"eval_runtime": 2.0478, |
|
"eval_samples_per_second": 55.67, |
|
"eval_steps_per_second": 2.93, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 3.39119815826416, |
|
"learning_rate": 4.962019382530521e-05, |
|
"loss": 1.2605, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 1.0913000106811523, |
|
"eval_runtime": 2.0609, |
|
"eval_samples_per_second": 55.317, |
|
"eval_steps_per_second": 2.911, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 2.8593010902404785, |
|
"learning_rate": 4.9540679586191605e-05, |
|
"loss": 1.2856, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"eval_loss": 1.1060646772384644, |
|
"eval_runtime": 2.0505, |
|
"eval_samples_per_second": 55.597, |
|
"eval_steps_per_second": 2.926, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.9253203868865967, |
|
"learning_rate": 4.9453690018345144e-05, |
|
"loss": 1.2385, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 1.1065127849578857, |
|
"eval_runtime": 2.0451, |
|
"eval_samples_per_second": 55.743, |
|
"eval_steps_per_second": 2.934, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 3.433211326599121, |
|
"learning_rate": 4.9359251619630886e-05, |
|
"loss": 1.2696, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_loss": 1.1171408891677856, |
|
"eval_runtime": 2.0491, |
|
"eval_samples_per_second": 55.635, |
|
"eval_steps_per_second": 2.928, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 2.958655595779419, |
|
"learning_rate": 4.925739315689991e-05, |
|
"loss": 1.2774, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"eval_loss": 1.1090198755264282, |
|
"eval_runtime": 2.0549, |
|
"eval_samples_per_second": 55.476, |
|
"eval_steps_per_second": 2.92, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.845395565032959, |
|
"learning_rate": 4.914814565722671e-05, |
|
"loss": 1.2598, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 1.1252377033233643, |
|
"eval_runtime": 2.0564, |
|
"eval_samples_per_second": 55.437, |
|
"eval_steps_per_second": 2.918, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.3043181896209717, |
|
"learning_rate": 4.9031542398457974e-05, |
|
"loss": 1.2897, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_loss": 1.1197612285614014, |
|
"eval_runtime": 2.0511, |
|
"eval_samples_per_second": 55.58, |
|
"eval_steps_per_second": 2.925, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.8986546993255615, |
|
"learning_rate": 4.890761889907589e-05, |
|
"loss": 1.2801, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"eval_loss": 1.0936975479125977, |
|
"eval_runtime": 2.0514, |
|
"eval_samples_per_second": 55.57, |
|
"eval_steps_per_second": 2.925, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.980234384536743, |
|
"learning_rate": 4.877641290737884e-05, |
|
"loss": 1.2732, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_loss": 1.1040586233139038, |
|
"eval_runtime": 2.0664, |
|
"eval_samples_per_second": 55.169, |
|
"eval_steps_per_second": 2.904, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.337164878845215, |
|
"learning_rate": 4.8637964389982926e-05, |
|
"loss": 1.2395, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"eval_loss": 1.1020458936691284, |
|
"eval_runtime": 2.0426, |
|
"eval_samples_per_second": 55.811, |
|
"eval_steps_per_second": 2.937, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.535869836807251, |
|
"learning_rate": 4.849231551964771e-05, |
|
"loss": 1.2581, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 1.106950283050537, |
|
"eval_runtime": 2.0587, |
|
"eval_samples_per_second": 55.375, |
|
"eval_steps_per_second": 2.914, |
|
"step": 2000 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 2000, |
|
"total_flos": 7.650574067145114e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|