|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 100, |
|
"global_step": 915, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00019781420765027324, |
|
"loss": 2.1723, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00019562841530054644, |
|
"loss": 1.7534, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00019344262295081967, |
|
"loss": 1.4381, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 0.0001912568306010929, |
|
"loss": 1.3181, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 0.00018907103825136615, |
|
"loss": 1.2203, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 0.00018688524590163935, |
|
"loss": 1.0679, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 0.00018469945355191258, |
|
"loss": 1.0329, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 0.0001825136612021858, |
|
"loss": 0.9817, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 0.00018032786885245904, |
|
"loss": 0.866, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 0.00017814207650273224, |
|
"loss": 0.8634, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"eval_accuracy": 0.6975, |
|
"eval_loss": 0.9265598654747009, |
|
"eval_runtime": 17.6341, |
|
"eval_samples_per_second": 68.05, |
|
"eval_steps_per_second": 8.506, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 0.00017595628415300547, |
|
"loss": 0.6979, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 0.0001737704918032787, |
|
"loss": 0.7132, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 0.00017158469945355192, |
|
"loss": 0.6692, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 0.00016939890710382515, |
|
"loss": 0.6, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 0.00016721311475409838, |
|
"loss": 0.5497, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 0.0001650273224043716, |
|
"loss": 0.4835, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 0.0001628415300546448, |
|
"loss": 0.4699, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 0.00016065573770491804, |
|
"loss": 0.4311, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"learning_rate": 0.00015846994535519127, |
|
"loss": 0.4363, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"learning_rate": 0.0001562841530054645, |
|
"loss": 0.3225, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"eval_accuracy": 0.7325, |
|
"eval_loss": 0.8994238376617432, |
|
"eval_runtime": 16.2421, |
|
"eval_samples_per_second": 73.882, |
|
"eval_steps_per_second": 9.235, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"learning_rate": 0.0001540983606557377, |
|
"loss": 0.3329, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"learning_rate": 0.00015191256830601093, |
|
"loss": 0.3262, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"learning_rate": 0.00014972677595628418, |
|
"loss": 0.3394, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"learning_rate": 0.00014754098360655738, |
|
"loss": 0.3319, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"learning_rate": 0.0001453551912568306, |
|
"loss": 0.2554, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"learning_rate": 0.00014316939890710384, |
|
"loss": 0.2912, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"learning_rate": 0.00014098360655737707, |
|
"loss": 0.2801, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"learning_rate": 0.00013879781420765027, |
|
"loss": 0.2443, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"learning_rate": 0.0001366120218579235, |
|
"loss": 0.278, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"learning_rate": 0.00013442622950819673, |
|
"loss": 0.2353, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"eval_accuracy": 0.73, |
|
"eval_loss": 0.968270480632782, |
|
"eval_runtime": 17.23, |
|
"eval_samples_per_second": 69.646, |
|
"eval_steps_per_second": 8.706, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"learning_rate": 0.00013224043715846995, |
|
"loss": 0.2092, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"learning_rate": 0.00013005464480874316, |
|
"loss": 0.1959, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"learning_rate": 0.0001278688524590164, |
|
"loss": 0.1841, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"learning_rate": 0.00012568306010928964, |
|
"loss": 0.2678, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"learning_rate": 0.00012349726775956284, |
|
"loss": 0.2366, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"learning_rate": 0.00012131147540983607, |
|
"loss": 0.2198, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"learning_rate": 0.0001191256830601093, |
|
"loss": 0.1459, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"learning_rate": 0.00011693989071038251, |
|
"loss": 0.0904, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"learning_rate": 0.00011475409836065574, |
|
"loss": 0.1093, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"learning_rate": 0.00011256830601092896, |
|
"loss": 0.1119, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"eval_accuracy": 0.7491666666666666, |
|
"eval_loss": 0.9247021675109863, |
|
"eval_runtime": 16.092, |
|
"eval_samples_per_second": 74.571, |
|
"eval_steps_per_second": 9.321, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"learning_rate": 0.00011038251366120218, |
|
"loss": 0.1183, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"learning_rate": 0.00010819672131147543, |
|
"loss": 0.1155, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"learning_rate": 0.00010601092896174864, |
|
"loss": 0.138, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"learning_rate": 0.00010382513661202187, |
|
"loss": 0.1124, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"learning_rate": 0.00010163934426229508, |
|
"loss": 0.1535, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"learning_rate": 9.945355191256831e-05, |
|
"loss": 0.1895, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"learning_rate": 9.726775956284153e-05, |
|
"loss": 0.128, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"learning_rate": 9.508196721311476e-05, |
|
"loss": 0.0633, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"learning_rate": 9.289617486338798e-05, |
|
"loss": 0.0788, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"learning_rate": 9.071038251366121e-05, |
|
"loss": 0.049, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"eval_accuracy": 0.7566666666666667, |
|
"eval_loss": 0.9662973880767822, |
|
"eval_runtime": 15.9004, |
|
"eval_samples_per_second": 75.47, |
|
"eval_steps_per_second": 9.434, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"learning_rate": 8.852459016393443e-05, |
|
"loss": 0.0829, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"learning_rate": 8.633879781420766e-05, |
|
"loss": 0.0821, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"learning_rate": 8.415300546448088e-05, |
|
"loss": 0.0554, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"learning_rate": 8.19672131147541e-05, |
|
"loss": 0.0756, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"learning_rate": 7.978142076502733e-05, |
|
"loss": 0.136, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"learning_rate": 7.759562841530054e-05, |
|
"loss": 0.0903, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"learning_rate": 7.540983606557377e-05, |
|
"loss": 0.0868, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"learning_rate": 7.3224043715847e-05, |
|
"loss": 0.0523, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"learning_rate": 7.103825136612023e-05, |
|
"loss": 0.0396, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"learning_rate": 6.885245901639344e-05, |
|
"loss": 0.0537, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"eval_accuracy": 0.7566666666666667, |
|
"eval_loss": 1.0557572841644287, |
|
"eval_runtime": 16.1696, |
|
"eval_samples_per_second": 74.213, |
|
"eval_steps_per_second": 9.277, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.064, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"learning_rate": 6.44808743169399e-05, |
|
"loss": 0.0206, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"learning_rate": 6.229508196721313e-05, |
|
"loss": 0.028, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"learning_rate": 6.010928961748634e-05, |
|
"loss": 0.026, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"learning_rate": 5.792349726775956e-05, |
|
"loss": 0.0229, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"learning_rate": 5.5737704918032785e-05, |
|
"loss": 0.032, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"learning_rate": 5.355191256830602e-05, |
|
"loss": 0.023, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"learning_rate": 5.136612021857924e-05, |
|
"loss": 0.0269, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"learning_rate": 4.918032786885246e-05, |
|
"loss": 0.023, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"learning_rate": 4.6994535519125685e-05, |
|
"loss": 0.0274, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"eval_accuracy": 0.7691666666666667, |
|
"eval_loss": 1.0343540906906128, |
|
"eval_runtime": 16.2987, |
|
"eval_samples_per_second": 73.626, |
|
"eval_steps_per_second": 9.203, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"learning_rate": 4.4808743169398906e-05, |
|
"loss": 0.0197, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"learning_rate": 4.262295081967213e-05, |
|
"loss": 0.0195, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"learning_rate": 4.0437158469945356e-05, |
|
"loss": 0.0122, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"learning_rate": 3.825136612021858e-05, |
|
"loss": 0.0156, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"learning_rate": 3.6065573770491806e-05, |
|
"loss": 0.0115, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"learning_rate": 3.387978142076503e-05, |
|
"loss": 0.0118, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"learning_rate": 3.1693989071038256e-05, |
|
"loss": 0.0123, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"learning_rate": 2.9508196721311478e-05, |
|
"loss": 0.0108, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"learning_rate": 2.7322404371584703e-05, |
|
"loss": 0.0112, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"learning_rate": 2.5136612021857924e-05, |
|
"loss": 0.0102, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"eval_accuracy": 0.7941666666666667, |
|
"eval_loss": 0.9259100556373596, |
|
"eval_runtime": 16.5205, |
|
"eval_samples_per_second": 72.637, |
|
"eval_steps_per_second": 9.08, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.43, |
|
"learning_rate": 2.295081967213115e-05, |
|
"loss": 0.0101, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"learning_rate": 2.0765027322404374e-05, |
|
"loss": 0.0099, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"learning_rate": 1.85792349726776e-05, |
|
"loss": 0.0097, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"learning_rate": 1.6393442622950818e-05, |
|
"loss": 0.011, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"learning_rate": 1.4207650273224044e-05, |
|
"loss": 0.0094, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"learning_rate": 1.2021857923497268e-05, |
|
"loss": 0.01, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"learning_rate": 9.836065573770493e-06, |
|
"loss": 0.0096, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"learning_rate": 7.650273224043716e-06, |
|
"loss": 0.0098, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"learning_rate": 5.46448087431694e-06, |
|
"loss": 0.0163, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"learning_rate": 3.278688524590164e-06, |
|
"loss": 0.0095, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"eval_accuracy": 0.785, |
|
"eval_loss": 0.9604464769363403, |
|
"eval_runtime": 17.3811, |
|
"eval_samples_per_second": 69.04, |
|
"eval_steps_per_second": 8.63, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"learning_rate": 1.092896174863388e-06, |
|
"loss": 0.0092, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 915, |
|
"total_flos": 3.4029172406502605e+18, |
|
"train_loss": 0.2756463099698551, |
|
"train_runtime": 974.1891, |
|
"train_samples_per_second": 45.073, |
|
"train_steps_per_second": 0.939 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 915, |
|
"num_train_epochs": 5, |
|
"save_steps": 1000, |
|
"total_flos": 3.4029172406502605e+18, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|