|
{ |
|
"best_metric": 0.4994313716888428, |
|
"best_model_checkpoint": "./vit-weight-decay-1e-2/checkpoint-5457", |
|
"epoch": 27.0, |
|
"eval_steps": 500, |
|
"global_step": 8667, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 27.05959129333496, |
|
"learning_rate": 2.6004922067268256e-05, |
|
"loss": 1.7124, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.6924410540915396, |
|
"eval_f1": 0.603022144115533, |
|
"eval_loss": 0.8697461485862732, |
|
"eval_precision": 0.6656411815509768, |
|
"eval_recall": 0.6924410540915396, |
|
"eval_runtime": 24.2005, |
|
"eval_samples_per_second": 119.171, |
|
"eval_steps_per_second": 14.917, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 31.123794555664062, |
|
"learning_rate": 5.233798195242002e-05, |
|
"loss": 1.1476, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.6990291262135923, |
|
"eval_f1": 0.714853056600681, |
|
"eval_loss": 0.7271208763122559, |
|
"eval_precision": 0.7684437503161472, |
|
"eval_recall": 0.6990291262135923, |
|
"eval_runtime": 23.1919, |
|
"eval_samples_per_second": 124.354, |
|
"eval_steps_per_second": 15.566, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 4.652539253234863, |
|
"learning_rate": 7.867104183757178e-05, |
|
"loss": 1.0734, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.7687239944521498, |
|
"eval_f1": 0.7417221561170182, |
|
"eval_loss": 0.6440889239311218, |
|
"eval_precision": 0.7568176957187778, |
|
"eval_recall": 0.7687239944521498, |
|
"eval_runtime": 23.9098, |
|
"eval_samples_per_second": 120.62, |
|
"eval_steps_per_second": 15.098, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 26.467485427856445, |
|
"learning_rate": 9.99845966779335e-05, |
|
"loss": 1.0271, |
|
"step": 1284 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.7773925104022191, |
|
"eval_f1": 0.7814315937772335, |
|
"eval_loss": 0.5854852199554443, |
|
"eval_precision": 0.78834249113171, |
|
"eval_recall": 0.7773925104022191, |
|
"eval_runtime": 23.2363, |
|
"eval_samples_per_second": 124.116, |
|
"eval_steps_per_second": 15.536, |
|
"step": 1284 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 9.265125274658203, |
|
"learning_rate": 9.93971225198763e-05, |
|
"loss": 0.9158, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.7635228848821082, |
|
"eval_f1": 0.7661961115296833, |
|
"eval_loss": 0.700226366519928, |
|
"eval_precision": 0.7929662231425049, |
|
"eval_recall": 0.7635228848821082, |
|
"eval_runtime": 23.9812, |
|
"eval_samples_per_second": 120.261, |
|
"eval_steps_per_second": 15.053, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 7.993426322937012, |
|
"learning_rate": 9.796799913911281e-05, |
|
"loss": 0.9167, |
|
"step": 1926 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.7812066574202496, |
|
"eval_f1": 0.7900429974573581, |
|
"eval_loss": 0.5867117047309875, |
|
"eval_precision": 0.806466933291998, |
|
"eval_recall": 0.7812066574202496, |
|
"eval_runtime": 23.5885, |
|
"eval_samples_per_second": 122.263, |
|
"eval_steps_per_second": 15.304, |
|
"step": 1926 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 8.043280601501465, |
|
"learning_rate": 9.572157654878572e-05, |
|
"loss": 0.786, |
|
"step": 2247 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.7340499306518724, |
|
"eval_f1": 0.751486114744647, |
|
"eval_loss": 0.6516677737236023, |
|
"eval_precision": 0.8047120902571854, |
|
"eval_recall": 0.7340499306518724, |
|
"eval_runtime": 24.7074, |
|
"eval_samples_per_second": 116.726, |
|
"eval_steps_per_second": 14.611, |
|
"step": 2247 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 3.0447545051574707, |
|
"learning_rate": 9.26961302542397e-05, |
|
"loss": 0.7406, |
|
"step": 2568 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.7066574202496533, |
|
"eval_f1": 0.7330088250984578, |
|
"eval_loss": 0.6647158265113831, |
|
"eval_precision": 0.8133714017605812, |
|
"eval_recall": 0.7066574202496533, |
|
"eval_runtime": 24.3728, |
|
"eval_samples_per_second": 118.329, |
|
"eval_steps_per_second": 14.812, |
|
"step": 2568 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 3.4238717555999756, |
|
"learning_rate": 8.89432090986511e-05, |
|
"loss": 0.682, |
|
"step": 2889 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.8228155339805825, |
|
"eval_f1": 0.8207044848679155, |
|
"eval_loss": 0.510608434677124, |
|
"eval_precision": 0.8230905088203688, |
|
"eval_recall": 0.8228155339805825, |
|
"eval_runtime": 24.4807, |
|
"eval_samples_per_second": 117.807, |
|
"eval_steps_per_second": 14.746, |
|
"step": 2889 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 11.989164352416992, |
|
"learning_rate": 8.45267569518721e-05, |
|
"loss": 0.6427, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.8165742024965326, |
|
"eval_f1": 0.8221884365803362, |
|
"eval_loss": 0.5032415390014648, |
|
"eval_precision": 0.835376334433007, |
|
"eval_recall": 0.8165742024965326, |
|
"eval_runtime": 23.2354, |
|
"eval_samples_per_second": 124.121, |
|
"eval_steps_per_second": 15.537, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 34.21263885498047, |
|
"learning_rate": 7.952202320752798e-05, |
|
"loss": 0.5663, |
|
"step": 3531 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.8151872399445215, |
|
"eval_f1": 0.8216396746544562, |
|
"eval_loss": 0.5357819199562073, |
|
"eval_precision": 0.8325502393616235, |
|
"eval_recall": 0.8151872399445215, |
|
"eval_runtime": 24.4554, |
|
"eval_samples_per_second": 117.929, |
|
"eval_steps_per_second": 14.762, |
|
"step": 3531 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 44.03895568847656, |
|
"learning_rate": 7.401428065178325e-05, |
|
"loss": 0.5395, |
|
"step": 3852 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.8248959778085991, |
|
"eval_f1": 0.8298561156988427, |
|
"eval_loss": 0.5487632155418396, |
|
"eval_precision": 0.8391637957530821, |
|
"eval_recall": 0.8248959778085991, |
|
"eval_runtime": 23.3681, |
|
"eval_samples_per_second": 123.416, |
|
"eval_steps_per_second": 15.448, |
|
"step": 3852 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 27.093250274658203, |
|
"learning_rate": 6.80973725492743e-05, |
|
"loss": 0.4468, |
|
"step": 4173 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.8231622746185853, |
|
"eval_f1": 0.8260339295474823, |
|
"eval_loss": 0.578988254070282, |
|
"eval_precision": 0.8397229815779756, |
|
"eval_recall": 0.8231622746185853, |
|
"eval_runtime": 24.8132, |
|
"eval_samples_per_second": 116.229, |
|
"eval_steps_per_second": 14.549, |
|
"step": 4173 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 5.494964122772217, |
|
"learning_rate": 6.187211370157784e-05, |
|
"loss": 0.4247, |
|
"step": 4494 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.8415395284327323, |
|
"eval_f1": 0.8448790763195109, |
|
"eval_loss": 0.5437958240509033, |
|
"eval_precision": 0.8570099067237934, |
|
"eval_recall": 0.8415395284327323, |
|
"eval_runtime": 23.4933, |
|
"eval_samples_per_second": 122.758, |
|
"eval_steps_per_second": 15.366, |
|
"step": 4494 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 5.022720813751221, |
|
"learning_rate": 5.544457272166217e-05, |
|
"loss": 0.3495, |
|
"step": 4815 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.8453536754507628, |
|
"eval_f1": 0.846725089255697, |
|
"eval_loss": 0.5135474801063538, |
|
"eval_precision": 0.8518606648557052, |
|
"eval_recall": 0.8453536754507628, |
|
"eval_runtime": 22.9349, |
|
"eval_samples_per_second": 125.747, |
|
"eval_steps_per_second": 15.74, |
|
"step": 4815 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 21.08028221130371, |
|
"learning_rate": 4.894460661440583e-05, |
|
"loss": 0.3039, |
|
"step": 5136 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.8408460471567267, |
|
"eval_f1": 0.8447631505343497, |
|
"eval_loss": 0.5631198287010193, |
|
"eval_precision": 0.8520329480153485, |
|
"eval_recall": 0.8408460471567267, |
|
"eval_runtime": 22.9052, |
|
"eval_samples_per_second": 125.91, |
|
"eval_steps_per_second": 15.761, |
|
"step": 5136 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 0.3130456805229187, |
|
"learning_rate": 4.244239774409037e-05, |
|
"loss": 0.2602, |
|
"step": 5457 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.8602635228848821, |
|
"eval_f1": 0.8599959330449201, |
|
"eval_loss": 0.4994313716888428, |
|
"eval_precision": 0.8617536928422816, |
|
"eval_recall": 0.8602635228848821, |
|
"eval_runtime": 24.0474, |
|
"eval_samples_per_second": 119.93, |
|
"eval_steps_per_second": 15.012, |
|
"step": 5457 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 13.50483226776123, |
|
"learning_rate": 3.606895852147351e-05, |
|
"loss": 0.2616, |
|
"step": 5778 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.8564493758668515, |
|
"eval_f1": 0.8585397395861224, |
|
"eval_loss": 0.5405685901641846, |
|
"eval_precision": 0.8621724878350435, |
|
"eval_recall": 0.8564493758668515, |
|
"eval_runtime": 23.9163, |
|
"eval_samples_per_second": 120.587, |
|
"eval_steps_per_second": 15.094, |
|
"step": 5778 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 0.11348175257444382, |
|
"learning_rate": 2.9932882319894417e-05, |
|
"loss": 0.1876, |
|
"step": 6099 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.848127600554785, |
|
"eval_f1": 0.8525273952669322, |
|
"eval_loss": 0.5612274408340454, |
|
"eval_precision": 0.8629290477513907, |
|
"eval_recall": 0.848127600554785, |
|
"eval_runtime": 24.2059, |
|
"eval_samples_per_second": 119.145, |
|
"eval_steps_per_second": 14.914, |
|
"step": 6099 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 1.7572256326675415, |
|
"learning_rate": 2.4138718220394167e-05, |
|
"loss": 0.2052, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.8429264909847434, |
|
"eval_f1": 0.8427565344137952, |
|
"eval_loss": 0.6802518367767334, |
|
"eval_precision": 0.8502279276035353, |
|
"eval_recall": 0.8429264909847434, |
|
"eval_runtime": 23.6557, |
|
"eval_samples_per_second": 121.916, |
|
"eval_steps_per_second": 15.261, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"grad_norm": 11.348849296569824, |
|
"learning_rate": 1.8785189659922232e-05, |
|
"loss": 0.1533, |
|
"step": 6741 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.8734396671289875, |
|
"eval_f1": 0.870883732217841, |
|
"eval_loss": 0.546351432800293, |
|
"eval_precision": 0.8698420453848273, |
|
"eval_recall": 0.8734396671289875, |
|
"eval_runtime": 23.8505, |
|
"eval_samples_per_second": 120.92, |
|
"eval_steps_per_second": 15.136, |
|
"step": 6741 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"grad_norm": 54.87451171875, |
|
"learning_rate": 1.396351233934956e-05, |
|
"loss": 0.1175, |
|
"step": 7062 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.8685852981969486, |
|
"eval_f1": 0.8672514399266257, |
|
"eval_loss": 0.5572792291641235, |
|
"eval_precision": 0.8667361486336195, |
|
"eval_recall": 0.8685852981969486, |
|
"eval_runtime": 23.4628, |
|
"eval_samples_per_second": 122.918, |
|
"eval_steps_per_second": 15.386, |
|
"step": 7062 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"grad_norm": 0.37277576327323914, |
|
"learning_rate": 9.755840051487997e-06, |
|
"loss": 0.1218, |
|
"step": 7383 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 0.8703190013869625, |
|
"eval_f1": 0.8669113876377326, |
|
"eval_loss": 0.6043308973312378, |
|
"eval_precision": 0.8680623901339135, |
|
"eval_recall": 0.8703190013869625, |
|
"eval_runtime": 23.6979, |
|
"eval_samples_per_second": 121.698, |
|
"eval_steps_per_second": 15.233, |
|
"step": 7383 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 1.1696586608886719, |
|
"learning_rate": 6.233864909760889e-06, |
|
"loss": 0.114, |
|
"step": 7704 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.871012482662968, |
|
"eval_f1": 0.8692580136075093, |
|
"eval_loss": 0.5944604277610779, |
|
"eval_precision": 0.8705845000186183, |
|
"eval_recall": 0.871012482662968, |
|
"eval_runtime": 22.9273, |
|
"eval_samples_per_second": 125.789, |
|
"eval_steps_per_second": 15.745, |
|
"step": 7704 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 0.24613961577415466, |
|
"learning_rate": 3.457595827424931e-06, |
|
"loss": 0.104, |
|
"step": 8025 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.8765603328710125, |
|
"eval_f1": 0.8751858009450587, |
|
"eval_loss": 0.5850355625152588, |
|
"eval_precision": 0.8753430843268125, |
|
"eval_recall": 0.8765603328710125, |
|
"eval_runtime": 23.185, |
|
"eval_samples_per_second": 124.391, |
|
"eval_steps_per_second": 15.57, |
|
"step": 8025 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"grad_norm": 15.901623725891113, |
|
"learning_rate": 1.4743360601349622e-06, |
|
"loss": 0.0752, |
|
"step": 8346 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.8782940360610264, |
|
"eval_f1": 0.8757212451946566, |
|
"eval_loss": 0.5867504477500916, |
|
"eval_precision": 0.8747328684004348, |
|
"eval_recall": 0.8782940360610264, |
|
"eval_runtime": 24.8645, |
|
"eval_samples_per_second": 115.989, |
|
"eval_steps_per_second": 14.519, |
|
"step": 8346 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"grad_norm": 0.1058092936873436, |
|
"learning_rate": 3.1787723291717977e-07, |
|
"loss": 0.1309, |
|
"step": 8667 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.8786407766990292, |
|
"eval_f1": 0.876119808222944, |
|
"eval_loss": 0.5839141011238098, |
|
"eval_precision": 0.875311708483151, |
|
"eval_recall": 0.8786407766990292, |
|
"eval_runtime": 24.8375, |
|
"eval_samples_per_second": 116.115, |
|
"eval_steps_per_second": 14.534, |
|
"step": 8667 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"step": 8667, |
|
"total_flos": 1.0729711727592948e+19, |
|
"train_loss": 0.5187536703752126, |
|
"train_runtime": 3224.0652, |
|
"train_samples_per_second": 159.054, |
|
"train_steps_per_second": 9.956 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 32100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 500, |
|
"total_flos": 1.0729711727592948e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|