|
{ |
|
"best_metric": 0.675777792930603, |
|
"best_model_checkpoint": "trainer/checkpoint-372428", |
|
"epoch": 50.0, |
|
"eval_steps": 500, |
|
"global_step": 396200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 2e-05, |
|
"loss": 2.2513, |
|
"step": 7924 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.2939835786819458, |
|
"eval_runtime": 37.6044, |
|
"eval_samples_per_second": 749.168, |
|
"eval_steps_per_second": 23.428, |
|
"step": 7924 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"learning_rate": 4e-05, |
|
"loss": 1.3752, |
|
"step": 15848 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.1416432857513428, |
|
"eval_runtime": 37.8169, |
|
"eval_samples_per_second": 744.958, |
|
"eval_steps_per_second": 23.296, |
|
"step": 15848 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"learning_rate": 4.9473684210526315e-05, |
|
"loss": 1.2436, |
|
"step": 23772 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.0676215887069702, |
|
"eval_runtime": 37.7184, |
|
"eval_samples_per_second": 746.904, |
|
"eval_steps_per_second": 23.357, |
|
"step": 23772 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"learning_rate": 4.842105263157895e-05, |
|
"loss": 1.1458, |
|
"step": 31696 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.9978280067443848, |
|
"eval_runtime": 36.3705, |
|
"eval_samples_per_second": 774.584, |
|
"eval_steps_per_second": 24.223, |
|
"step": 31696 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"learning_rate": 4.736842105263158e-05, |
|
"loss": 1.0841, |
|
"step": 39620 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.9508912563323975, |
|
"eval_runtime": 36.5102, |
|
"eval_samples_per_second": 771.62, |
|
"eval_steps_per_second": 24.13, |
|
"step": 39620 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"learning_rate": 4.6315789473684214e-05, |
|
"loss": 1.0386, |
|
"step": 47544 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 0.9356514811515808, |
|
"eval_runtime": 36.323, |
|
"eval_samples_per_second": 775.598, |
|
"eval_steps_per_second": 24.255, |
|
"step": 47544 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"learning_rate": 4.5263157894736846e-05, |
|
"loss": 0.9989, |
|
"step": 55468 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 0.9180737137794495, |
|
"eval_runtime": 36.4427, |
|
"eval_samples_per_second": 773.05, |
|
"eval_steps_per_second": 24.175, |
|
"step": 55468 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"learning_rate": 4.421052631578947e-05, |
|
"loss": 0.9686, |
|
"step": 63392 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 0.8952454328536987, |
|
"eval_runtime": 36.1822, |
|
"eval_samples_per_second": 778.615, |
|
"eval_steps_per_second": 24.349, |
|
"step": 63392 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"learning_rate": 4.3157894736842105e-05, |
|
"loss": 0.9426, |
|
"step": 71316 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 0.8876378536224365, |
|
"eval_runtime": 36.5182, |
|
"eval_samples_per_second": 771.451, |
|
"eval_steps_per_second": 24.125, |
|
"step": 71316 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"learning_rate": 4.210526315789474e-05, |
|
"loss": 0.9198, |
|
"step": 79240 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 0.8818822503089905, |
|
"eval_runtime": 36.3065, |
|
"eval_samples_per_second": 775.949, |
|
"eval_steps_per_second": 24.266, |
|
"step": 79240 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"learning_rate": 4.105263157894737e-05, |
|
"loss": 0.9053, |
|
"step": 87164 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 0.8349147439002991, |
|
"eval_runtime": 36.3771, |
|
"eval_samples_per_second": 774.444, |
|
"eval_steps_per_second": 24.219, |
|
"step": 87164 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"learning_rate": 4e-05, |
|
"loss": 0.8881, |
|
"step": 95088 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 0.8407663702964783, |
|
"eval_runtime": 36.4047, |
|
"eval_samples_per_second": 773.857, |
|
"eval_steps_per_second": 24.2, |
|
"step": 95088 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"learning_rate": 3.894736842105263e-05, |
|
"loss": 0.8704, |
|
"step": 103012 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 0.8339666128158569, |
|
"eval_runtime": 36.314, |
|
"eval_samples_per_second": 775.79, |
|
"eval_steps_per_second": 24.261, |
|
"step": 103012 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"learning_rate": 3.789473684210527e-05, |
|
"loss": 0.8533, |
|
"step": 110936 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 0.8264057636260986, |
|
"eval_runtime": 36.2648, |
|
"eval_samples_per_second": 776.841, |
|
"eval_steps_per_second": 24.294, |
|
"step": 110936 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"learning_rate": 3.6842105263157895e-05, |
|
"loss": 0.8418, |
|
"step": 118860 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 0.8100990653038025, |
|
"eval_runtime": 36.5073, |
|
"eval_samples_per_second": 771.682, |
|
"eval_steps_per_second": 24.132, |
|
"step": 118860 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"learning_rate": 3.578947368421053e-05, |
|
"loss": 0.8307, |
|
"step": 126784 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 0.8106787800788879, |
|
"eval_runtime": 36.2767, |
|
"eval_samples_per_second": 776.586, |
|
"eval_steps_per_second": 24.286, |
|
"step": 126784 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"learning_rate": 3.473684210526316e-05, |
|
"loss": 0.815, |
|
"step": 134708 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_loss": 0.7991083264350891, |
|
"eval_runtime": 36.2711, |
|
"eval_samples_per_second": 776.707, |
|
"eval_steps_per_second": 24.289, |
|
"step": 134708 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"learning_rate": 3.368421052631579e-05, |
|
"loss": 0.8015, |
|
"step": 142632 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 0.7952774167060852, |
|
"eval_runtime": 37.5556, |
|
"eval_samples_per_second": 750.142, |
|
"eval_steps_per_second": 23.459, |
|
"step": 142632 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"learning_rate": 3.2631578947368426e-05, |
|
"loss": 0.7894, |
|
"step": 150556 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_loss": 0.7720882296562195, |
|
"eval_runtime": 36.9729, |
|
"eval_samples_per_second": 761.964, |
|
"eval_steps_per_second": 23.828, |
|
"step": 150556 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"learning_rate": 3.157894736842105e-05, |
|
"loss": 0.7789, |
|
"step": 158480 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 0.7802249789237976, |
|
"eval_runtime": 37.1298, |
|
"eval_samples_per_second": 758.744, |
|
"eval_steps_per_second": 23.728, |
|
"step": 158480 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"learning_rate": 3.0526315789473684e-05, |
|
"loss": 0.7678, |
|
"step": 166404 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_loss": 0.7610885500907898, |
|
"eval_runtime": 36.2573, |
|
"eval_samples_per_second": 777.002, |
|
"eval_steps_per_second": 24.299, |
|
"step": 166404 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"learning_rate": 2.9473684210526314e-05, |
|
"loss": 0.7534, |
|
"step": 174328 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_loss": 0.782088577747345, |
|
"eval_runtime": 37.4893, |
|
"eval_samples_per_second": 751.467, |
|
"eval_steps_per_second": 23.5, |
|
"step": 174328 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"learning_rate": 2.842105263157895e-05, |
|
"loss": 0.7502, |
|
"step": 182252 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_loss": 0.7673630714416504, |
|
"eval_runtime": 36.6312, |
|
"eval_samples_per_second": 769.07, |
|
"eval_steps_per_second": 24.051, |
|
"step": 182252 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"learning_rate": 2.7368421052631583e-05, |
|
"loss": 0.7345, |
|
"step": 190176 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 0.7627705335617065, |
|
"eval_runtime": 37.0149, |
|
"eval_samples_per_second": 761.099, |
|
"eval_steps_per_second": 23.801, |
|
"step": 190176 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"learning_rate": 2.6315789473684212e-05, |
|
"loss": 0.7264, |
|
"step": 198100 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_loss": 0.7561437487602234, |
|
"eval_runtime": 37.2677, |
|
"eval_samples_per_second": 755.937, |
|
"eval_steps_per_second": 23.64, |
|
"step": 198100 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"learning_rate": 2.5263157894736845e-05, |
|
"loss": 0.7142, |
|
"step": 206024 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_loss": 0.7509896159172058, |
|
"eval_runtime": 37.3161, |
|
"eval_samples_per_second": 754.956, |
|
"eval_steps_per_second": 23.609, |
|
"step": 206024 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"learning_rate": 2.4210526315789474e-05, |
|
"loss": 0.7018, |
|
"step": 213948 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_loss": 0.7464780807495117, |
|
"eval_runtime": 36.7538, |
|
"eval_samples_per_second": 766.505, |
|
"eval_steps_per_second": 23.97, |
|
"step": 213948 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"learning_rate": 2.3157894736842107e-05, |
|
"loss": 0.6897, |
|
"step": 221872 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_loss": 0.7344001531600952, |
|
"eval_runtime": 37.0652, |
|
"eval_samples_per_second": 760.066, |
|
"eval_steps_per_second": 23.769, |
|
"step": 221872 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"learning_rate": 2.2105263157894736e-05, |
|
"loss": 0.682, |
|
"step": 229796 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_loss": 0.7429642081260681, |
|
"eval_runtime": 37.2732, |
|
"eval_samples_per_second": 755.824, |
|
"eval_steps_per_second": 23.636, |
|
"step": 229796 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"learning_rate": 2.105263157894737e-05, |
|
"loss": 0.6754, |
|
"step": 237720 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_loss": 0.7481978535652161, |
|
"eval_runtime": 37.0222, |
|
"eval_samples_per_second": 760.949, |
|
"eval_steps_per_second": 23.797, |
|
"step": 237720 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6679, |
|
"step": 245644 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_loss": 0.7224923968315125, |
|
"eval_runtime": 37.2439, |
|
"eval_samples_per_second": 756.419, |
|
"eval_steps_per_second": 23.655, |
|
"step": 245644 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"learning_rate": 1.8947368421052634e-05, |
|
"loss": 0.6566, |
|
"step": 253568 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_loss": 0.715844452381134, |
|
"eval_runtime": 36.5457, |
|
"eval_samples_per_second": 770.871, |
|
"eval_steps_per_second": 24.107, |
|
"step": 253568 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"learning_rate": 1.7894736842105264e-05, |
|
"loss": 0.6492, |
|
"step": 261492 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_loss": 0.7234057188034058, |
|
"eval_runtime": 36.3822, |
|
"eval_samples_per_second": 774.335, |
|
"eval_steps_per_second": 24.215, |
|
"step": 261492 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"learning_rate": 1.6842105263157896e-05, |
|
"loss": 0.642, |
|
"step": 269416 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_loss": 0.7132413983345032, |
|
"eval_runtime": 36.3793, |
|
"eval_samples_per_second": 774.396, |
|
"eval_steps_per_second": 24.217, |
|
"step": 269416 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"learning_rate": 1.5789473684210526e-05, |
|
"loss": 0.6342, |
|
"step": 277340 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_loss": 0.7007443904876709, |
|
"eval_runtime": 36.3032, |
|
"eval_samples_per_second": 776.02, |
|
"eval_steps_per_second": 24.268, |
|
"step": 277340 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"learning_rate": 1.4736842105263157e-05, |
|
"loss": 0.6236, |
|
"step": 285264 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_loss": 0.69706791639328, |
|
"eval_runtime": 37.0755, |
|
"eval_samples_per_second": 759.854, |
|
"eval_steps_per_second": 23.762, |
|
"step": 285264 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"learning_rate": 1.3684210526315791e-05, |
|
"loss": 0.6146, |
|
"step": 293188 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_loss": 0.6900755167007446, |
|
"eval_runtime": 36.5007, |
|
"eval_samples_per_second": 771.822, |
|
"eval_steps_per_second": 24.137, |
|
"step": 293188 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"learning_rate": 1.2631578947368422e-05, |
|
"loss": 0.6087, |
|
"step": 301112 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_loss": 0.6962341666221619, |
|
"eval_runtime": 36.6631, |
|
"eval_samples_per_second": 768.402, |
|
"eval_steps_per_second": 24.03, |
|
"step": 301112 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"learning_rate": 1.1578947368421053e-05, |
|
"loss": 0.5989, |
|
"step": 309036 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_loss": 0.7045713067054749, |
|
"eval_runtime": 36.6758, |
|
"eval_samples_per_second": 768.136, |
|
"eval_steps_per_second": 24.021, |
|
"step": 309036 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"learning_rate": 1.0526315789473684e-05, |
|
"loss": 0.5924, |
|
"step": 316960 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_loss": 0.6984645128250122, |
|
"eval_runtime": 36.7394, |
|
"eval_samples_per_second": 766.807, |
|
"eval_steps_per_second": 23.98, |
|
"step": 316960 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"learning_rate": 9.473684210526317e-06, |
|
"loss": 0.5827, |
|
"step": 324884 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"eval_loss": 0.6994604468345642, |
|
"eval_runtime": 36.8305, |
|
"eval_samples_per_second": 764.91, |
|
"eval_steps_per_second": 23.92, |
|
"step": 324884 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"learning_rate": 8.421052631578948e-06, |
|
"loss": 0.5731, |
|
"step": 332808 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_loss": 0.6827645301818848, |
|
"eval_runtime": 36.7809, |
|
"eval_samples_per_second": 765.941, |
|
"eval_steps_per_second": 23.953, |
|
"step": 332808 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"learning_rate": 7.3684210526315784e-06, |
|
"loss": 0.5718, |
|
"step": 340732 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"eval_loss": 0.7020975947380066, |
|
"eval_runtime": 36.6627, |
|
"eval_samples_per_second": 768.411, |
|
"eval_steps_per_second": 24.03, |
|
"step": 340732 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"learning_rate": 6.315789473684211e-06, |
|
"loss": 0.5663, |
|
"step": 348656 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_loss": 0.6774910092353821, |
|
"eval_runtime": 36.8941, |
|
"eval_samples_per_second": 763.59, |
|
"eval_steps_per_second": 23.879, |
|
"step": 348656 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"learning_rate": 5.263157894736842e-06, |
|
"loss": 0.5575, |
|
"step": 356580 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"eval_loss": 0.683965265750885, |
|
"eval_runtime": 36.8405, |
|
"eval_samples_per_second": 764.703, |
|
"eval_steps_per_second": 23.914, |
|
"step": 356580 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"learning_rate": 4.210526315789474e-06, |
|
"loss": 0.5524, |
|
"step": 364504 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_loss": 0.6812577247619629, |
|
"eval_runtime": 36.3196, |
|
"eval_samples_per_second": 775.669, |
|
"eval_steps_per_second": 24.257, |
|
"step": 364504 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"learning_rate": 3.1578947368421056e-06, |
|
"loss": 0.5499, |
|
"step": 372428 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"eval_loss": 0.675777792930603, |
|
"eval_runtime": 36.282, |
|
"eval_samples_per_second": 776.473, |
|
"eval_steps_per_second": 24.282, |
|
"step": 372428 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"learning_rate": 2.105263157894737e-06, |
|
"loss": 0.541, |
|
"step": 380352 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_loss": 0.678913950920105, |
|
"eval_runtime": 36.4409, |
|
"eval_samples_per_second": 773.087, |
|
"eval_steps_per_second": 24.176, |
|
"step": 380352 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"learning_rate": 1.0526315789473685e-06, |
|
"loss": 0.5372, |
|
"step": 388276 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"eval_loss": 0.6796761155128479, |
|
"eval_runtime": 36.5663, |
|
"eval_samples_per_second": 770.435, |
|
"eval_steps_per_second": 24.093, |
|
"step": 388276 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.535, |
|
"step": 396200 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_loss": 0.6791965365409851, |
|
"eval_runtime": 36.555, |
|
"eval_samples_per_second": 770.674, |
|
"eval_steps_per_second": 24.101, |
|
"step": 396200 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 396200, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"total_flos": 7.079826371258392e+17, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|