|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 8.104816696762272, |
|
"global_step": 193000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0002955626928115623, |
|
"loss": 1.1705, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"eval_loss": 0.22145646810531616, |
|
"eval_runtime": 5082.7114, |
|
"eval_samples_per_second": 4.02, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 0.0002892236825423657, |
|
"loss": 0.2914, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_loss": 0.19062571227550507, |
|
"eval_runtime": 5086.3406, |
|
"eval_samples_per_second": 4.018, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 0.00028288467227316906, |
|
"loss": 0.2617, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"eval_loss": 0.1784171611070633, |
|
"eval_runtime": 4895.8348, |
|
"eval_samples_per_second": 4.174, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 0.0002765456620039724, |
|
"loss": 0.2449, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_loss": 0.16641439497470856, |
|
"eval_runtime": 4945.3495, |
|
"eval_samples_per_second": 4.132, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"learning_rate": 0.0002702066517347758, |
|
"loss": 0.2264, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"eval_loss": 0.15650227665901184, |
|
"eval_runtime": 4999.732, |
|
"eval_samples_per_second": 4.087, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"learning_rate": 0.00026386764146557915, |
|
"loss": 0.2093, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"eval_loss": 0.15184776484966278, |
|
"eval_runtime": 4877.0685, |
|
"eval_samples_per_second": 4.19, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"learning_rate": 0.0002575286311963825, |
|
"loss": 0.2003, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"eval_loss": 0.14190027117729187, |
|
"eval_runtime": 4879.5538, |
|
"eval_samples_per_second": 4.188, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"learning_rate": 0.0002511896209271859, |
|
"loss": 0.1962, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"eval_loss": 0.14596430957317352, |
|
"eval_runtime": 4860.8654, |
|
"eval_samples_per_second": 4.204, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"learning_rate": 0.00024485061065798925, |
|
"loss": 0.19, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"eval_loss": 0.13479308784008026, |
|
"eval_runtime": 4872.4496, |
|
"eval_samples_per_second": 4.194, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"learning_rate": 0.00023851160038879262, |
|
"loss": 0.1769, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"eval_loss": 0.13018357753753662, |
|
"eval_runtime": 4872.3296, |
|
"eval_samples_per_second": 4.194, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"learning_rate": 0.00023217259011959596, |
|
"loss": 0.1674, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"eval_loss": 0.1303720772266388, |
|
"eval_runtime": 4874.4649, |
|
"eval_samples_per_second": 4.192, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"learning_rate": 0.00022583357985039935, |
|
"loss": 0.1655, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"eval_loss": 0.12317115068435669, |
|
"eval_runtime": 4882.2049, |
|
"eval_samples_per_second": 4.186, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"learning_rate": 0.00021949456958120271, |
|
"loss": 0.1608, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"eval_loss": 0.12056649476289749, |
|
"eval_runtime": 4877.3831, |
|
"eval_samples_per_second": 4.19, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"learning_rate": 0.00021315555931200605, |
|
"loss": 0.1565, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"eval_loss": 0.11486475169658661, |
|
"eval_runtime": 4876.7317, |
|
"eval_samples_per_second": 4.19, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"learning_rate": 0.00020681654904280945, |
|
"loss": 0.146, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"eval_loss": 0.11829441785812378, |
|
"eval_runtime": 4833.5153, |
|
"eval_samples_per_second": 4.228, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"learning_rate": 0.00020047753877361279, |
|
"loss": 0.1403, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"eval_loss": 0.11143175512552261, |
|
"eval_runtime": 4853.9814, |
|
"eval_samples_per_second": 4.21, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"learning_rate": 0.00019413852850441618, |
|
"loss": 0.1376, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"eval_loss": 0.11027190089225769, |
|
"eval_runtime": 4858.6043, |
|
"eval_samples_per_second": 4.206, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"learning_rate": 0.00018779951823521952, |
|
"loss": 0.1337, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"eval_loss": 0.10872453451156616, |
|
"eval_runtime": 4864.6388, |
|
"eval_samples_per_second": 4.201, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"learning_rate": 0.00018146050796602288, |
|
"loss": 0.1325, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"eval_loss": 0.10718829929828644, |
|
"eval_runtime": 4921.9134, |
|
"eval_samples_per_second": 4.152, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"learning_rate": 0.00017512149769682625, |
|
"loss": 0.1195, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"eval_loss": 0.10430513322353363, |
|
"eval_runtime": 4864.6263, |
|
"eval_samples_per_second": 4.201, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"learning_rate": 0.00016878248742762961, |
|
"loss": 0.118, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"eval_loss": 0.1070966124534607, |
|
"eval_runtime": 4879.2783, |
|
"eval_samples_per_second": 4.188, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"learning_rate": 0.00016244347715843295, |
|
"loss": 0.1173, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"eval_loss": 0.10433077067136765, |
|
"eval_runtime": 4876.4984, |
|
"eval_samples_per_second": 4.191, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"learning_rate": 0.00015610446688923635, |
|
"loss": 0.115, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"eval_loss": 0.09682977199554443, |
|
"eval_runtime": 4893.4652, |
|
"eval_samples_per_second": 4.176, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"learning_rate": 0.0001497654566200397, |
|
"loss": 0.1102, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"eval_loss": 0.09630288183689117, |
|
"eval_runtime": 4914.2049, |
|
"eval_samples_per_second": 4.158, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"learning_rate": 0.00014342644635084308, |
|
"loss": 0.1019, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"eval_loss": 0.0918075293302536, |
|
"eval_runtime": 4893.9499, |
|
"eval_samples_per_second": 4.176, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"learning_rate": 0.00013708743608164644, |
|
"loss": 0.1014, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"eval_loss": 0.09067174792289734, |
|
"eval_runtime": 4891.4795, |
|
"eval_samples_per_second": 4.178, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 5.67, |
|
"learning_rate": 0.0001307484258124498, |
|
"loss": 0.1, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 5.67, |
|
"eval_loss": 0.08851899951696396, |
|
"eval_runtime": 4884.1688, |
|
"eval_samples_per_second": 4.184, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"learning_rate": 0.00012440941554325318, |
|
"loss": 0.0971, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"eval_loss": 0.08720648288726807, |
|
"eval_runtime": 4896.4597, |
|
"eval_samples_per_second": 4.173, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 6.09, |
|
"learning_rate": 0.00011807040527405654, |
|
"loss": 0.0921, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 6.09, |
|
"eval_loss": 0.08666499704122543, |
|
"eval_runtime": 5117.0953, |
|
"eval_samples_per_second": 3.993, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 6.3, |
|
"learning_rate": 0.00011173139500485991, |
|
"loss": 0.0884, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 6.3, |
|
"eval_loss": 0.0831904485821724, |
|
"eval_runtime": 4879.0536, |
|
"eval_samples_per_second": 4.188, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"learning_rate": 0.00010539238473566326, |
|
"loss": 0.0864, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"eval_loss": 0.08337873965501785, |
|
"eval_runtime": 4900.7475, |
|
"eval_samples_per_second": 4.17, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"learning_rate": 9.905337446646663e-05, |
|
"loss": 0.0861, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"eval_loss": 0.08155979961156845, |
|
"eval_runtime": 4919.8246, |
|
"eval_samples_per_second": 4.154, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 6.93, |
|
"learning_rate": 9.271436419726999e-05, |
|
"loss": 0.083, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 6.93, |
|
"eval_loss": 0.08167865127325058, |
|
"eval_runtime": 4921.4971, |
|
"eval_samples_per_second": 4.152, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 7.14, |
|
"learning_rate": 8.637535392807336e-05, |
|
"loss": 0.0769, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 7.14, |
|
"eval_loss": 0.0775604099035263, |
|
"eval_runtime": 4893.4221, |
|
"eval_samples_per_second": 4.176, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 7.35, |
|
"learning_rate": 8.003634365887672e-05, |
|
"loss": 0.0749, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 7.35, |
|
"eval_loss": 0.07773936539888382, |
|
"eval_runtime": 4914.0163, |
|
"eval_samples_per_second": 4.159, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 7.56, |
|
"learning_rate": 7.369733338968009e-05, |
|
"loss": 0.0735, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 7.56, |
|
"eval_loss": 0.07420430332422256, |
|
"eval_runtime": 4934.0827, |
|
"eval_samples_per_second": 4.142, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 7.77, |
|
"learning_rate": 6.735832312048346e-05, |
|
"loss": 0.0715, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 7.77, |
|
"eval_loss": 0.07269106060266495, |
|
"eval_runtime": 4925.1046, |
|
"eval_samples_per_second": 4.149, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 7.98, |
|
"learning_rate": 6.1019312851286814e-05, |
|
"loss": 0.0702, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 7.98, |
|
"eval_loss": 0.07183075696229935, |
|
"eval_runtime": 4948.9933, |
|
"eval_samples_per_second": 4.129, |
|
"step": 190000 |
|
} |
|
], |
|
"max_steps": 238130, |
|
"num_train_epochs": 10, |
|
"total_flos": 5.387421756388246e+20, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|