{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.2465230735200596,
  "global_step": 325000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03,
      "learning_rate": 1.993087621312246e-06,
      "loss": 0.2839,
      "step": 5000
    },
    {
      "epoch": 0.07,
      "learning_rate": 1.986175242624492e-06,
      "loss": 0.2584,
      "step": 10000
    },
    {
      "epoch": 0.1,
      "learning_rate": 1.979262863936738e-06,
      "loss": 0.2684,
      "step": 15000
    },
    {
      "epoch": 0.14,
      "learning_rate": 1.972350485248984e-06,
      "loss": 0.2725,
      "step": 20000
    },
    {
      "epoch": 0.17,
      "learning_rate": 1.96543810656123e-06,
      "loss": 0.2631,
      "step": 25000
    },
    {
      "epoch": 0.17,
      "eval_accuracy": 0.9506088495254517,
      "eval_loss": 0.2637203335762024,
      "eval_runtime": 690.6403,
      "eval_samples_per_second": 46.495,
      "eval_steps_per_second": 11.624,
      "step": 25000
    },
    {
      "epoch": 0.21,
      "learning_rate": 1.958525727873476e-06,
      "loss": 0.2808,
      "step": 30000
    },
    {
      "epoch": 0.24,
      "learning_rate": 1.951613349185722e-06,
      "loss": 0.2601,
      "step": 35000
    },
    {
      "epoch": 0.28,
      "learning_rate": 1.944700970497968e-06,
      "loss": 0.2746,
      "step": 40000
    },
    {
      "epoch": 0.31,
      "learning_rate": 1.937788591810214e-06,
      "loss": 0.2688,
      "step": 45000
    },
    {
      "epoch": 0.35,
      "learning_rate": 1.93087621312246e-06,
      "loss": 0.2619,
      "step": 50000
    },
    {
      "epoch": 0.35,
      "eval_accuracy": 0.9507022500038147,
      "eval_loss": 0.2595760226249695,
      "eval_runtime": 690.7368,
      "eval_samples_per_second": 46.488,
      "eval_steps_per_second": 11.622,
      "step": 50000
    },
    {
      "epoch": 0.38,
      "learning_rate": 1.9239638344347054e-06,
      "loss": 0.2765,
      "step": 55000
    },
    {
      "epoch": 0.41,
      "learning_rate": 1.9170514557469514e-06,
      "loss": 0.2736,
      "step": 60000
    },
    {
      "epoch": 0.45,
      "learning_rate": 1.9101390770591978e-06,
      "loss": 0.2616,
      "step": 65000
    },
    {
      "epoch": 0.48,
      "learning_rate": 1.9032266983714436e-06,
      "loss": 0.2558,
      "step": 70000
    },
    {
      "epoch": 0.52,
      "learning_rate": 1.8963143196836895e-06,
      "loss": 0.2653,
      "step": 75000
    },
    {
      "epoch": 0.52,
      "eval_accuracy": 0.9514496326446533,
      "eval_loss": 0.2495056539773941,
      "eval_runtime": 689.7436,
      "eval_samples_per_second": 46.555,
      "eval_steps_per_second": 11.639,
      "step": 75000
    },
    {
      "epoch": 0.55,
      "learning_rate": 1.8894019409959353e-06,
      "loss": 0.2594,
      "step": 80000
    },
    {
      "epoch": 0.59,
      "learning_rate": 1.8824895623081813e-06,
      "loss": 0.2678,
      "step": 85000
    },
    {
      "epoch": 0.62,
      "learning_rate": 1.8755771836204273e-06,
      "loss": 0.2648,
      "step": 90000
    },
    {
      "epoch": 0.66,
      "learning_rate": 1.8686648049326735e-06,
      "loss": 0.2671,
      "step": 95000
    },
    {
      "epoch": 0.69,
      "learning_rate": 1.8617524262449193e-06,
      "loss": 0.262,
      "step": 100000
    },
    {
      "epoch": 0.69,
      "eval_accuracy": 0.9511382579803467,
      "eval_loss": 0.25494277477264404,
      "eval_runtime": 689.2307,
      "eval_samples_per_second": 46.59,
      "eval_steps_per_second": 11.648,
      "step": 100000
    },
    {
      "epoch": 0.73,
      "learning_rate": 1.8548400475571653e-06,
      "loss": 0.273,
      "step": 105000
    },
    {
      "epoch": 0.76,
      "learning_rate": 1.8479276688694113e-06,
      "loss": 0.265,
      "step": 110000
    },
    {
      "epoch": 0.79,
      "learning_rate": 1.841015290181657e-06,
      "loss": 0.2677,
      "step": 115000
    },
    {
      "epoch": 0.83,
      "learning_rate": 1.8341029114939032e-06,
      "loss": 0.2644,
      "step": 120000
    },
    {
      "epoch": 0.86,
      "learning_rate": 1.8271905328061492e-06,
      "loss": 0.2667,
      "step": 125000
    },
    {
      "epoch": 0.86,
      "eval_accuracy": 0.9517922401428223,
      "eval_loss": 0.2526009976863861,
      "eval_runtime": 689.2471,
      "eval_samples_per_second": 46.589,
      "eval_steps_per_second": 11.647,
      "step": 125000
    },
    {
      "epoch": 0.9,
      "learning_rate": 1.8202781541183952e-06,
      "loss": 0.262,
      "step": 130000
    },
    {
      "epoch": 0.93,
      "learning_rate": 1.813365775430641e-06,
      "loss": 0.2689,
      "step": 135000
    },
    {
      "epoch": 0.97,
      "learning_rate": 1.806453396742887e-06,
      "loss": 0.2713,
      "step": 140000
    },
    {
      "epoch": 1.0,
      "learning_rate": 1.7995410180551332e-06,
      "loss": 0.2522,
      "step": 145000
    },
    {
      "epoch": 1.04,
      "learning_rate": 1.7926286393673792e-06,
      "loss": 0.2425,
      "step": 150000
    },
    {
      "epoch": 1.04,
      "eval_accuracy": 0.9516676664352417,
      "eval_loss": 0.2566453516483307,
      "eval_runtime": 689.1693,
      "eval_samples_per_second": 46.594,
      "eval_steps_per_second": 11.649,
      "step": 150000
    },
    {
      "epoch": 1.07,
      "learning_rate": 1.785716260679625e-06,
      "loss": 0.2445,
      "step": 155000
    },
    {
      "epoch": 1.11,
      "learning_rate": 1.778803881991871e-06,
      "loss": 0.2353,
      "step": 160000
    },
    {
      "epoch": 1.14,
      "learning_rate": 1.771891503304117e-06,
      "loss": 0.2478,
      "step": 165000
    },
    {
      "epoch": 1.18,
      "learning_rate": 1.764979124616363e-06,
      "loss": 0.2461,
      "step": 170000
    },
    {
      "epoch": 1.21,
      "learning_rate": 1.758066745928609e-06,
      "loss": 0.2506,
      "step": 175000
    },
    {
      "epoch": 1.21,
      "eval_accuracy": 0.9521347880363464,
      "eval_loss": 0.25962916016578674,
      "eval_runtime": 689.0728,
      "eval_samples_per_second": 46.6,
      "eval_steps_per_second": 11.65,
      "step": 175000
    },
    {
      "epoch": 1.24,
      "learning_rate": 1.751154367240855e-06,
      "loss": 0.245,
      "step": 180000
    },
    {
      "epoch": 1.28,
      "learning_rate": 1.7442419885531009e-06,
      "loss": 0.2516,
      "step": 185000
    },
    {
      "epoch": 1.31,
      "learning_rate": 1.7373296098653467e-06,
      "loss": 0.2392,
      "step": 190000
    },
    {
      "epoch": 1.35,
      "learning_rate": 1.7304172311775929e-06,
      "loss": 0.2495,
      "step": 195000
    },
    {
      "epoch": 1.38,
      "learning_rate": 1.7235048524898389e-06,
      "loss": 0.2485,
      "step": 200000
    },
    {
      "epoch": 1.38,
      "eval_accuracy": 0.9533492922782898,
      "eval_loss": 0.2515329122543335,
      "eval_runtime": 689.5664,
      "eval_samples_per_second": 46.567,
      "eval_steps_per_second": 11.642,
      "step": 200000
    },
    {
      "epoch": 1.42,
      "learning_rate": 1.7165924738020846e-06,
      "loss": 0.2311,
      "step": 205000
    },
    {
      "epoch": 1.45,
      "learning_rate": 1.7096800951143306e-06,
      "loss": 0.2335,
      "step": 210000
    },
    {
      "epoch": 1.49,
      "learning_rate": 1.7027677164265766e-06,
      "loss": 0.2568,
      "step": 215000
    },
    {
      "epoch": 1.52,
      "learning_rate": 1.6958553377388226e-06,
      "loss": 0.2392,
      "step": 220000
    },
    {
      "epoch": 1.56,
      "learning_rate": 1.6889429590510686e-06,
      "loss": 0.2411,
      "step": 225000
    },
    {
      "epoch": 1.56,
      "eval_accuracy": 0.9527575969696045,
      "eval_loss": 0.25586625933647156,
      "eval_runtime": 688.9498,
      "eval_samples_per_second": 46.609,
      "eval_steps_per_second": 11.653,
      "step": 225000
    },
    {
      "epoch": 1.59,
      "learning_rate": 1.6820305803633146e-06,
      "loss": 0.2474,
      "step": 230000
    },
    {
      "epoch": 1.62,
      "learning_rate": 1.6751182016755606e-06,
      "loss": 0.2511,
      "step": 235000
    },
    {
      "epoch": 1.66,
      "learning_rate": 1.6682058229878063e-06,
      "loss": 0.2435,
      "step": 240000
    },
    {
      "epoch": 1.69,
      "learning_rate": 1.6612934443000523e-06,
      "loss": 0.2485,
      "step": 245000
    },
    {
      "epoch": 1.73,
      "learning_rate": 1.6543810656122985e-06,
      "loss": 0.234,
      "step": 250000
    },
    {
      "epoch": 1.73,
      "eval_accuracy": 0.9526330828666687,
      "eval_loss": 0.2587934136390686,
      "eval_runtime": 688.8864,
      "eval_samples_per_second": 46.613,
      "eval_steps_per_second": 11.654,
      "step": 250000
    },
    {
      "epoch": 1.76,
      "learning_rate": 1.6474686869245445e-06,
      "loss": 0.2534,
      "step": 255000
    },
    {
      "epoch": 1.8,
      "learning_rate": 1.6405563082367903e-06,
      "loss": 0.2432,
      "step": 260000
    },
    {
      "epoch": 1.83,
      "learning_rate": 1.6336439295490363e-06,
      "loss": 0.2466,
      "step": 265000
    },
    {
      "epoch": 1.87,
      "learning_rate": 1.6267315508612823e-06,
      "loss": 0.2393,
      "step": 270000
    },
    {
      "epoch": 1.9,
      "learning_rate": 1.6198191721735283e-06,
      "loss": 0.242,
      "step": 275000
    },
    {
      "epoch": 1.9,
      "eval_accuracy": 0.9526330828666687,
      "eval_loss": 0.2543439269065857,
      "eval_runtime": 688.3982,
      "eval_samples_per_second": 46.646,
      "eval_steps_per_second": 11.662,
      "step": 275000
    },
    {
      "epoch": 1.94,
      "learning_rate": 1.6129067934857743e-06,
      "loss": 0.2455,
      "step": 280000
    },
    {
      "epoch": 1.97,
      "learning_rate": 1.6059944147980203e-06,
      "loss": 0.2413,
      "step": 285000
    },
    {
      "epoch": 2.0,
      "learning_rate": 1.5990820361102662e-06,
      "loss": 0.241,
      "step": 290000
    },
    {
      "epoch": 2.04,
      "learning_rate": 1.592169657422512e-06,
      "loss": 0.2401,
      "step": 295000
    },
    {
      "epoch": 2.07,
      "learning_rate": 1.5852572787347582e-06,
      "loss": 0.217,
      "step": 300000
    },
    {
      "epoch": 2.07,
      "eval_accuracy": 0.9530379176139832,
      "eval_loss": 0.26496848464012146,
      "eval_runtime": 688.7791,
      "eval_samples_per_second": 46.62,
      "eval_steps_per_second": 11.655,
      "step": 300000
    },
    {
      "epoch": 2.11,
      "learning_rate": 1.5783449000470042e-06,
      "loss": 0.2281,
      "step": 305000
    },
    {
      "epoch": 2.14,
      "learning_rate": 1.57143252135925e-06,
      "loss": 0.2203,
      "step": 310000
    },
    {
      "epoch": 2.18,
      "learning_rate": 1.564520142671496e-06,
      "loss": 0.219,
      "step": 315000
    },
    {
      "epoch": 2.21,
      "learning_rate": 1.557607763983742e-06,
      "loss": 0.2287,
      "step": 320000
    },
    {
      "epoch": 2.25,
      "learning_rate": 1.5506953852959882e-06,
      "loss": 0.2272,
      "step": 325000
    },
    {
      "epoch": 2.25,
      "eval_accuracy": 0.9527264833450317,
      "eval_loss": 0.2502051889896393,
      "eval_runtime": 688.3338,
      "eval_samples_per_second": 46.65,
      "eval_steps_per_second": 11.663,
      "step": 325000
    }
  ],
  "max_steps": 1446680,
  "num_train_epochs": 10,
  "total_flos": 3.420489354301194e+17,
  "trial_name": null,
  "trial_params": null
}