End of training

Browse files

Files changed (5) hide show

README.md +5 -5
all_results.json +16 -0
eval_results.json +11 -0
train_results.json +8 -0
trainer_state.json +1571 -0

README.md CHANGED Viewed

@@ -23,11 +23,11 @@ should probably proofread and complete it, then remove this comment. -->
 This model is a fine-tuned version of [facebook/deit-base-patch16-224](https://huggingface.co/facebook/deit-base-patch16-224) on the medmnist-v2 dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.0745
-- Accuracy: 0.9870
-- Precision: 0.9877
-- Recall: 0.9863
-- F1: 0.9868
 ## Model description

 This model is a fine-tuned version of [facebook/deit-base-patch16-224](https://huggingface.co/facebook/deit-base-patch16-224) on the medmnist-v2 dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.2795
+- Accuracy: 0.9240
+- Precision: 0.9199
+- Recall: 0.9123
+- F1: 0.9154
 ## Model description

all_results.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+    "epoch": 9.99,
+    "eval_accuracy": 0.9240445089501693,
+    "eval_f1": 0.9153681639608009,
+    "eval_loss": 0.2795476019382477,
+    "eval_precision": 0.9198770454998186,
+    "eval_recall": 0.912344054362017,
+    "eval_runtime": 43.4915,
+    "eval_samples_per_second": 190.106,
+    "eval_steps_per_second": 11.887,
+    "total_flos": 1.0133154899356189e+19,
+    "train_loss": 0.5248493043072705,
+    "train_runtime": 1518.4893,
+    "train_samples_per_second": 85.611,
+    "train_steps_per_second": 1.337
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+    "epoch": 9.99,
+    "eval_accuracy": 0.9240445089501693,
+    "eval_f1": 0.9153681639608009,
+    "eval_loss": 0.2795476019382477,
+    "eval_precision": 0.9198770454998186,
+    "eval_recall": 0.912344054362017,
+    "eval_runtime": 43.4915,
+    "eval_samples_per_second": 190.106,
+    "eval_steps_per_second": 11.887
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 9.99,
+    "total_flos": 1.0133154899356189e+19,
+    "train_loss": 0.5248493043072705,
+    "train_runtime": 1518.4893,
+    "train_samples_per_second": 85.611,
+    "train_steps_per_second": 1.337
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1571 @@

+{
+  "best_metric": 0.9870401337792643,
+  "best_model_checkpoint": "deit-base-patch16-224-finetuned-lora-medmnistv2/checkpoint-2030",
+  "epoch": 9.98769987699877,
+  "eval_steps": 500,
+  "global_step": 2030,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.05,
+      "grad_norm": 3.303891658782959,
+      "learning_rate": 0.004975369458128079,
+      "loss": 1.8368,
+      "step": 10
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.481412649154663,
+      "learning_rate": 0.004950738916256157,
+      "loss": 1.2897,
+      "step": 20
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 2.0582990646362305,
+      "learning_rate": 0.0049261083743842365,
+      "loss": 1.0672,
+      "step": 30
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.254044532775879,
+      "learning_rate": 0.004901477832512316,
+      "loss": 0.9723,
+      "step": 40
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.3351266384124756,
+      "learning_rate": 0.004876847290640395,
+      "loss": 0.917,
+      "step": 50
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.9788981676101685,
+      "learning_rate": 0.004852216748768473,
+      "loss": 0.8483,
+      "step": 60
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.460303544998169,
+      "learning_rate": 0.004827586206896552,
+      "loss": 0.7848,
+      "step": 70
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 3.4298815727233887,
+      "learning_rate": 0.004802955665024631,
+      "loss": 0.8819,
+      "step": 80
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.531933069229126,
+      "learning_rate": 0.004778325123152709,
+      "loss": 0.7758,
+      "step": 90
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.608288049697876,
+      "learning_rate": 0.004753694581280788,
+      "loss": 0.7678,
+      "step": 100
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.9291895627975464,
+      "learning_rate": 0.004729064039408867,
+      "loss": 0.7848,
+      "step": 110
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.8717544078826904,
+      "learning_rate": 0.004704433497536946,
+      "loss": 0.7742,
+      "step": 120
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 4.476926803588867,
+      "learning_rate": 0.004679802955665025,
+      "loss": 0.8906,
+      "step": 130
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 2.5952930450439453,
+      "learning_rate": 0.004655172413793103,
+      "loss": 0.8464,
+      "step": 140
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.8514671325683594,
+      "learning_rate": 0.004630541871921182,
+      "loss": 0.9079,
+      "step": 150
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.962122917175293,
+      "learning_rate": 0.004605911330049261,
+      "loss": 0.81,
+      "step": 160
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.9573622941970825,
+      "learning_rate": 0.00458128078817734,
+      "loss": 0.8099,
+      "step": 170
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.8094934225082397,
+      "learning_rate": 0.004559113300492611,
+      "loss": 0.8105,
+      "step": 180
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 2.7324118614196777,
+      "learning_rate": 0.00453448275862069,
+      "loss": 0.8123,
+      "step": 190
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 2.355945348739624,
+      "learning_rate": 0.004509852216748769,
+      "loss": 0.7947,
+      "step": 200
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.8975752508361204,
+      "eval_f1": 0.863152897342088,
+      "eval_loss": 0.3122749328613281,
+      "eval_precision": 0.909035520710901,
+      "eval_recall": 0.8450098410817735,
+      "eval_runtime": 12.4051,
+      "eval_samples_per_second": 192.824,
+      "eval_steps_per_second": 12.092,
+      "step": 203
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 1.4632035493850708,
+      "learning_rate": 0.004485221674876847,
+      "loss": 0.8797,
+      "step": 210
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 2.434492349624634,
+      "learning_rate": 0.004460591133004926,
+      "loss": 0.6601,
+      "step": 220
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 1.9131174087524414,
+      "learning_rate": 0.004435960591133005,
+      "loss": 0.7093,
+      "step": 230
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.491714358329773,
+      "learning_rate": 0.004411330049261084,
+      "loss": 0.7247,
+      "step": 240
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.9442235231399536,
+      "learning_rate": 0.004386699507389163,
+      "loss": 0.7182,
+      "step": 250
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.9666441679000854,
+      "learning_rate": 0.004362068965517241,
+      "loss": 0.733,
+      "step": 260
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.89641273021698,
+      "learning_rate": 0.00433743842364532,
+      "loss": 0.6678,
+      "step": 270
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 1.3621200323104858,
+      "learning_rate": 0.004312807881773399,
+      "loss": 0.7066,
+      "step": 280
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.8440511226654053,
+      "learning_rate": 0.004288177339901478,
+      "loss": 0.673,
+      "step": 290
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 1.712856650352478,
+      "learning_rate": 0.0042635467980295565,
+      "loss": 0.7424,
+      "step": 300
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.674052357673645,
+      "learning_rate": 0.004238916256157636,
+      "loss": 0.6104,
+      "step": 310
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.5503976345062256,
+      "learning_rate": 0.004214285714285715,
+      "loss": 0.6868,
+      "step": 320
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.0976303815841675,
+      "learning_rate": 0.004189655172413793,
+      "loss": 0.703,
+      "step": 330
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.5332385301589966,
+      "learning_rate": 0.004165024630541872,
+      "loss": 0.6599,
+      "step": 340
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 1.4168181419372559,
+      "learning_rate": 0.004140394088669951,
+      "loss": 0.7273,
+      "step": 350
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 2.414102792739868,
+      "learning_rate": 0.00411576354679803,
+      "loss": 0.6551,
+      "step": 360
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.9595593214035034,
+      "learning_rate": 0.004091133004926108,
+      "loss": 0.7608,
+      "step": 370
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.0985585451126099,
+      "learning_rate": 0.0040665024630541875,
+      "loss": 0.6946,
+      "step": 380
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 2.288224458694458,
+      "learning_rate": 0.004041871921182267,
+      "loss": 0.7381,
+      "step": 390
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 1.385890245437622,
+      "learning_rate": 0.004017241379310345,
+      "loss": 0.6703,
+      "step": 400
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.9607023411371237,
+      "eval_f1": 0.9535454483827537,
+      "eval_loss": 0.14003877341747284,
+      "eval_precision": 0.9589551276899428,
+      "eval_recall": 0.9543228146341872,
+      "eval_runtime": 12.5652,
+      "eval_samples_per_second": 190.367,
+      "eval_steps_per_second": 11.938,
+      "step": 406
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 2.190495729446411,
+      "learning_rate": 0.003992610837438423,
+      "loss": 0.6558,
+      "step": 410
+    },
+    {
+      "epoch": 2.07,
+      "grad_norm": 2.319401979446411,
+      "learning_rate": 0.003967980295566502,
+      "loss": 0.618,
+      "step": 420
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 1.5089792013168335,
+      "learning_rate": 0.003943349753694581,
+      "loss": 0.6622,
+      "step": 430
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 1.2998738288879395,
+      "learning_rate": 0.00391871921182266,
+      "loss": 0.6039,
+      "step": 440
+    },
+    {
+      "epoch": 2.21,
+      "grad_norm": 1.5582971572875977,
+      "learning_rate": 0.003894088669950739,
+      "loss": 0.585,
+      "step": 450
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 2.1443846225738525,
+      "learning_rate": 0.0038694581280788176,
+      "loss": 0.6739,
+      "step": 460
+    },
+    {
+      "epoch": 2.31,
+      "grad_norm": 1.1868767738342285,
+      "learning_rate": 0.0038448275862068967,
+      "loss": 0.6598,
+      "step": 470
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 1.5321897268295288,
+      "learning_rate": 0.0038201970443349754,
+      "loss": 0.6058,
+      "step": 480
+    },
+    {
+      "epoch": 2.41,
+      "grad_norm": 1.2971707582473755,
+      "learning_rate": 0.0037955665024630545,
+      "loss": 0.6025,
+      "step": 490
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 1.0405155420303345,
+      "learning_rate": 0.003770935960591133,
+      "loss": 0.6544,
+      "step": 500
+    },
+    {
+      "epoch": 2.51,
+      "grad_norm": 2.27400541305542,
+      "learning_rate": 0.0037463054187192118,
+      "loss": 0.639,
+      "step": 510
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 1.8367363214492798,
+      "learning_rate": 0.003721674876847291,
+      "loss": 0.6922,
+      "step": 520
+    },
+    {
+      "epoch": 2.61,
+      "grad_norm": 1.6862225532531738,
+      "learning_rate": 0.0036970443349753695,
+      "loss": 0.6698,
+      "step": 530
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 1.1783074140548706,
+      "learning_rate": 0.0036724137931034486,
+      "loss": 0.648,
+      "step": 540
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 1.327495813369751,
+      "learning_rate": 0.0036477832512315273,
+      "loss": 0.5485,
+      "step": 550
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 1.2704271078109741,
+      "learning_rate": 0.0036231527093596064,
+      "loss": 0.6125,
+      "step": 560
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 1.412690281867981,
+      "learning_rate": 0.003598522167487685,
+      "loss": 0.5872,
+      "step": 570
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 1.5771632194519043,
+      "learning_rate": 0.0035738916256157637,
+      "loss": 0.541,
+      "step": 580
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 1.2916010618209839,
+      "learning_rate": 0.0035492610837438428,
+      "loss": 0.6637,
+      "step": 590
+    },
+    {
+      "epoch": 2.95,
+      "grad_norm": 1.0224180221557617,
+      "learning_rate": 0.003524630541871921,
+      "loss": 0.5941,
+      "step": 600
+    },
+    {
+      "epoch": 3.0,
+      "eval_accuracy": 0.9698996655518395,
+      "eval_f1": 0.9649463741495116,
+      "eval_loss": 0.11816500872373581,
+      "eval_precision": 0.9646998653209304,
+      "eval_recall": 0.9681395759866063,
+      "eval_runtime": 12.5344,
+      "eval_samples_per_second": 190.834,
+      "eval_steps_per_second": 11.967,
+      "step": 609
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 1.8427205085754395,
+      "learning_rate": 0.0034999999999999996,
+      "loss": 0.6179,
+      "step": 610
+    },
+    {
+      "epoch": 3.05,
+      "grad_norm": 1.1675821542739868,
+      "learning_rate": 0.0034753694581280787,
+      "loss": 0.6263,
+      "step": 620
+    },
+    {
+      "epoch": 3.1,
+      "grad_norm": 1.6908611059188843,
+      "learning_rate": 0.0034507389162561574,
+      "loss": 0.7175,
+      "step": 630
+    },
+    {
+      "epoch": 3.15,
+      "grad_norm": 1.5712032318115234,
+      "learning_rate": 0.0034261083743842365,
+      "loss": 0.6474,
+      "step": 640
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 1.8690963983535767,
+      "learning_rate": 0.003401477832512315,
+      "loss": 0.5849,
+      "step": 650
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 1.6917773485183716,
+      "learning_rate": 0.0033768472906403942,
+      "loss": 0.5954,
+      "step": 660
+    },
+    {
+      "epoch": 3.3,
+      "grad_norm": 1.4844752550125122,
+      "learning_rate": 0.003352216748768473,
+      "loss": 0.6284,
+      "step": 670
+    },
+    {
+      "epoch": 3.35,
+      "grad_norm": 1.444581389427185,
+      "learning_rate": 0.003327586206896552,
+      "loss": 0.5529,
+      "step": 680
+    },
+    {
+      "epoch": 3.39,
+      "grad_norm": 1.3921010494232178,
+      "learning_rate": 0.0033029556650246306,
+      "loss": 0.6022,
+      "step": 690
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 1.3489701747894287,
+      "learning_rate": 0.0032783251231527093,
+      "loss": 0.6314,
+      "step": 700
+    },
+    {
+      "epoch": 3.49,
+      "grad_norm": 1.7876464128494263,
+      "learning_rate": 0.0032536945812807884,
+      "loss": 0.5322,
+      "step": 710
+    },
+    {
+      "epoch": 3.54,
+      "grad_norm": 1.2738828659057617,
+      "learning_rate": 0.003229064039408867,
+      "loss": 0.5869,
+      "step": 720
+    },
+    {
+      "epoch": 3.59,
+      "grad_norm": 1.5368149280548096,
+      "learning_rate": 0.003204433497536946,
+      "loss": 0.5659,
+      "step": 730
+    },
+    {
+      "epoch": 3.64,
+      "grad_norm": 1.890324354171753,
+      "learning_rate": 0.0031798029556650248,
+      "loss": 0.5506,
+      "step": 740
+    },
+    {
+      "epoch": 3.69,
+      "grad_norm": 1.553797721862793,
+      "learning_rate": 0.003155172413793104,
+      "loss": 0.6645,
+      "step": 750
+    },
+    {
+      "epoch": 3.74,
+      "grad_norm": 1.3873250484466553,
+      "learning_rate": 0.0031305418719211825,
+      "loss": 0.5013,
+      "step": 760
+    },
+    {
+      "epoch": 3.79,
+      "grad_norm": 1.6613869667053223,
+      "learning_rate": 0.0031059113300492616,
+      "loss": 0.4614,
+      "step": 770
+    },
+    {
+      "epoch": 3.84,
+      "grad_norm": 1.3628942966461182,
+      "learning_rate": 0.00308128078817734,
+      "loss": 0.5874,
+      "step": 780
+    },
+    {
+      "epoch": 3.89,
+      "grad_norm": 1.5102113485336304,
+      "learning_rate": 0.0030566502463054185,
+      "loss": 0.5564,
+      "step": 790
+    },
+    {
+      "epoch": 3.94,
+      "grad_norm": 1.4211273193359375,
+      "learning_rate": 0.0030320197044334976,
+      "loss": 0.6018,
+      "step": 800
+    },
+    {
+      "epoch": 3.99,
+      "grad_norm": 1.8017326593399048,
+      "learning_rate": 0.0030073891625615762,
+      "loss": 0.5837,
+      "step": 810
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.967809364548495,
+      "eval_f1": 0.9551154966770515,
+      "eval_loss": 0.10157252848148346,
+      "eval_precision": 0.9557533496633682,
+      "eval_recall": 0.9586131276038764,
+      "eval_runtime": 12.6536,
+      "eval_samples_per_second": 189.038,
+      "eval_steps_per_second": 11.854,
+      "step": 813
+    },
+    {
+      "epoch": 4.03,
+      "grad_norm": 1.3088445663452148,
+      "learning_rate": 0.002982758620689655,
+      "loss": 0.5057,
+      "step": 820
+    },
+    {
+      "epoch": 4.08,
+      "grad_norm": 1.200412631034851,
+      "learning_rate": 0.002958128078817734,
+      "loss": 0.5485,
+      "step": 830
+    },
+    {
+      "epoch": 4.13,
+      "grad_norm": 1.6468169689178467,
+      "learning_rate": 0.0029334975369458127,
+      "loss": 0.5171,
+      "step": 840
+    },
+    {
+      "epoch": 4.18,
+      "grad_norm": 1.0748703479766846,
+      "learning_rate": 0.0029088669950738917,
+      "loss": 0.5664,
+      "step": 850
+    },
+    {
+      "epoch": 4.23,
+      "grad_norm": 1.396888256072998,
+      "learning_rate": 0.0028842364532019704,
+      "loss": 0.4641,
+      "step": 860
+    },
+    {
+      "epoch": 4.28,
+      "grad_norm": 1.0845372676849365,
+      "learning_rate": 0.0028596059113300495,
+      "loss": 0.5789,
+      "step": 870
+    },
+    {
+      "epoch": 4.33,
+      "grad_norm": 1.4134384393692017,
+      "learning_rate": 0.002834975369458128,
+      "loss": 0.4361,
+      "step": 880
+    },
+    {
+      "epoch": 4.38,
+      "grad_norm": 0.7656651735305786,
+      "learning_rate": 0.002810344827586207,
+      "loss": 0.5938,
+      "step": 890
+    },
+    {
+      "epoch": 4.43,
+      "grad_norm": 1.459712028503418,
+      "learning_rate": 0.002785714285714286,
+      "loss": 0.5146,
+      "step": 900
+    },
+    {
+      "epoch": 4.48,
+      "grad_norm": 1.2046053409576416,
+      "learning_rate": 0.0027610837438423646,
+      "loss": 0.4882,
+      "step": 910
+    },
+    {
+      "epoch": 4.53,
+      "grad_norm": 1.1301757097244263,
+      "learning_rate": 0.0027364532019704436,
+      "loss": 0.4728,
+      "step": 920
+    },
+    {
+      "epoch": 4.58,
+      "grad_norm": 1.255055546760559,
+      "learning_rate": 0.0027118226600985223,
+      "loss": 0.4384,
+      "step": 930
+    },
+    {
+      "epoch": 4.62,
+      "grad_norm": 1.3792164325714111,
+      "learning_rate": 0.0026871921182266014,
+      "loss": 0.5357,
+      "step": 940
+    },
+    {
+      "epoch": 4.67,
+      "grad_norm": 1.3066402673721313,
+      "learning_rate": 0.00266256157635468,
+      "loss": 0.5361,
+      "step": 950
+    },
+    {
+      "epoch": 4.72,
+      "grad_norm": 1.2377945184707642,
+      "learning_rate": 0.002637931034482759,
+      "loss": 0.5334,
+      "step": 960
+    },
+    {
+      "epoch": 4.77,
+      "grad_norm": 1.3673447370529175,
+      "learning_rate": 0.0026133004926108374,
+      "loss": 0.4896,
+      "step": 970
+    },
+    {
+      "epoch": 4.82,
+      "grad_norm": 1.8711413145065308,
+      "learning_rate": 0.002588669950738916,
+      "loss": 0.4729,
+      "step": 980
+    },
+    {
+      "epoch": 4.87,
+      "grad_norm": 1.1367807388305664,
+      "learning_rate": 0.002564039408866995,
+      "loss": 0.483,
+      "step": 990
+    },
+    {
+      "epoch": 4.92,
+      "grad_norm": 1.5432896614074707,
+      "learning_rate": 0.0025394088669950738,
+      "loss": 0.5477,
+      "step": 1000
+    },
+    {
+      "epoch": 4.97,
+      "grad_norm": 1.0067399740219116,
+      "learning_rate": 0.0025147783251231524,
+      "loss": 0.5193,
+      "step": 1010
+    },
+    {
+      "epoch": 5.0,
+      "eval_accuracy": 0.9790969899665551,
+      "eval_f1": 0.9675306891159455,
+      "eval_loss": 0.08001040667295456,
+      "eval_precision": 0.9700832487729493,
+      "eval_recall": 0.9684116828701858,
+      "eval_runtime": 12.5434,
+      "eval_samples_per_second": 190.698,
+      "eval_steps_per_second": 11.958,
+      "step": 1016
+    },
+    {
+      "epoch": 5.02,
+      "grad_norm": 1.8359023332595825,
+      "learning_rate": 0.0024901477832512315,
+      "loss": 0.5087,
+      "step": 1020
+    },
+    {
+      "epoch": 5.07,
+      "grad_norm": 1.3961881399154663,
+      "learning_rate": 0.00246551724137931,
+      "loss": 0.4565,
+      "step": 1030
+    },
+    {
+      "epoch": 5.12,
+      "grad_norm": 1.9095091819763184,
+      "learning_rate": 0.0024408866995073893,
+      "loss": 0.4432,
+      "step": 1040
+    },
+    {
+      "epoch": 5.17,
+      "grad_norm": 1.2952779531478882,
+      "learning_rate": 0.002416256157635468,
+      "loss": 0.4392,
+      "step": 1050
+    },
+    {
+      "epoch": 5.22,
+      "grad_norm": 1.309617042541504,
+      "learning_rate": 0.002391625615763547,
+      "loss": 0.5255,
+      "step": 1060
+    },
+    {
+      "epoch": 5.26,
+      "grad_norm": 1.497014045715332,
+      "learning_rate": 0.0023669950738916257,
+      "loss": 0.5122,
+      "step": 1070
+    },
+    {
+      "epoch": 5.31,
+      "grad_norm": 1.3211737871170044,
+      "learning_rate": 0.0023423645320197048,
+      "loss": 0.5529,
+      "step": 1080
+    },
+    {
+      "epoch": 5.36,
+      "grad_norm": 0.9946479797363281,
+      "learning_rate": 0.0023177339901477834,
+      "loss": 0.4708,
+      "step": 1090
+    },
+    {
+      "epoch": 5.41,
+      "grad_norm": 0.8456437587738037,
+      "learning_rate": 0.002293103448275862,
+      "loss": 0.4935,
+      "step": 1100
+    },
+    {
+      "epoch": 5.46,
+      "grad_norm": 1.495175838470459,
+      "learning_rate": 0.0022684729064039407,
+      "loss": 0.4975,
+      "step": 1110
+    },
+    {
+      "epoch": 5.51,
+      "grad_norm": 1.6447827816009521,
+      "learning_rate": 0.00224384236453202,
+      "loss": 0.4138,
+      "step": 1120
+    },
+    {
+      "epoch": 5.56,
+      "grad_norm": 0.8438058495521545,
+      "learning_rate": 0.0022192118226600985,
+      "loss": 0.4502,
+      "step": 1130
+    },
+    {
+      "epoch": 5.61,
+      "grad_norm": 1.3904708623886108,
+      "learning_rate": 0.0021945812807881776,
+      "loss": 0.4681,
+      "step": 1140
+    },
+    {
+      "epoch": 5.66,
+      "grad_norm": 1.498844861984253,
+      "learning_rate": 0.0021699507389162562,
+      "loss": 0.4637,
+      "step": 1150
+    },
+    {
+      "epoch": 5.71,
+      "grad_norm": 1.1716539859771729,
+      "learning_rate": 0.002145320197044335,
+      "loss": 0.5183,
+      "step": 1160
+    },
+    {
+      "epoch": 5.76,
+      "grad_norm": 0.7125697135925293,
+      "learning_rate": 0.002120689655172414,
+      "loss": 0.4307,
+      "step": 1170
+    },
+    {
+      "epoch": 5.81,
+      "grad_norm": 1.341647744178772,
+      "learning_rate": 0.0020960591133004926,
+      "loss": 0.4988,
+      "step": 1180
+    },
+    {
+      "epoch": 5.85,
+      "grad_norm": 1.4662394523620605,
+      "learning_rate": 0.0020714285714285717,
+      "loss": 0.4398,
+      "step": 1190
+    },
+    {
+      "epoch": 5.9,
+      "grad_norm": 1.7114837169647217,
+      "learning_rate": 0.0020467980295566504,
+      "loss": 0.4488,
+      "step": 1200
+    },
+    {
+      "epoch": 5.95,
+      "grad_norm": 1.0667368173599243,
+      "learning_rate": 0.002022167487684729,
+      "loss": 0.5513,
+      "step": 1210
+    },
+    {
+      "epoch": 6.0,
+      "eval_accuracy": 0.9862040133779264,
+      "eval_f1": 0.9840490701292556,
+      "eval_loss": 0.0578995905816555,
+      "eval_precision": 0.9830589209967975,
+      "eval_recall": 0.985517150491058,
+      "eval_runtime": 12.5187,
+      "eval_samples_per_second": 191.074,
+      "eval_steps_per_second": 11.982,
+      "step": 1219
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 1.1651352643966675,
+      "learning_rate": 0.0019975369458128077,
+      "loss": 0.4321,
+      "step": 1220
+    },
+    {
+      "epoch": 6.05,
+      "grad_norm": 1.0694313049316406,
+      "learning_rate": 0.0019729064039408868,
+      "loss": 0.4343,
+      "step": 1230
+    },
+    {
+      "epoch": 6.1,
+      "grad_norm": 1.5686174631118774,
+      "learning_rate": 0.0019482758620689657,
+      "loss": 0.367,
+      "step": 1240
+    },
+    {
+      "epoch": 6.15,
+      "grad_norm": 0.7148666977882385,
+      "learning_rate": 0.0019236453201970443,
+      "loss": 0.4364,
+      "step": 1250
+    },
+    {
+      "epoch": 6.2,
+      "grad_norm": 1.4920200109481812,
+      "learning_rate": 0.0018990147783251232,
+      "loss": 0.4814,
+      "step": 1260
+    },
+    {
+      "epoch": 6.25,
+      "grad_norm": 1.0870678424835205,
+      "learning_rate": 0.001874384236453202,
+      "loss": 0.4145,
+      "step": 1270
+    },
+    {
+      "epoch": 6.3,
+      "grad_norm": 1.0466927289962769,
+      "learning_rate": 0.001849753694581281,
+      "loss": 0.4296,
+      "step": 1280
+    },
+    {
+      "epoch": 6.35,
+      "grad_norm": 0.9908223748207092,
+      "learning_rate": 0.0018251231527093596,
+      "loss": 0.4183,
+      "step": 1290
+    },
+    {
+      "epoch": 6.4,
+      "grad_norm": 0.6582946181297302,
+      "learning_rate": 0.0018004926108374385,
+      "loss": 0.4099,
+      "step": 1300
+    },
+    {
+      "epoch": 6.45,
+      "grad_norm": 1.3454304933547974,
+      "learning_rate": 0.0017758620689655171,
+      "loss": 0.367,
+      "step": 1310
+    },
+    {
+      "epoch": 6.49,
+      "grad_norm": 1.3359636068344116,
+      "learning_rate": 0.001751231527093596,
+      "loss": 0.4025,
+      "step": 1320
+    },
+    {
+      "epoch": 6.54,
+      "grad_norm": 1.2285734415054321,
+      "learning_rate": 0.0017266009852216749,
+      "loss": 0.4675,
+      "step": 1330
+    },
+    {
+      "epoch": 6.59,
+      "grad_norm": 0.9923570156097412,
+      "learning_rate": 0.0017019704433497537,
+      "loss": 0.3958,
+      "step": 1340
+    },
+    {
+      "epoch": 6.64,
+      "grad_norm": 0.8746837973594666,
+      "learning_rate": 0.0016773399014778326,
+      "loss": 0.4365,
+      "step": 1350
+    },
+    {
+      "epoch": 6.69,
+      "grad_norm": 0.8892514705657959,
+      "learning_rate": 0.0016527093596059115,
+      "loss": 0.4296,
+      "step": 1360
+    },
+    {
+      "epoch": 6.74,
+      "grad_norm": 1.2088005542755127,
+      "learning_rate": 0.0016280788177339904,
+      "loss": 0.3881,
+      "step": 1370
+    },
+    {
+      "epoch": 6.79,
+      "grad_norm": 1.0085664987564087,
+      "learning_rate": 0.0016034482758620688,
+      "loss": 0.4745,
+      "step": 1380
+    },
+    {
+      "epoch": 6.84,
+      "grad_norm": 1.0228571891784668,
+      "learning_rate": 0.0015788177339901477,
+      "loss": 0.3763,
+      "step": 1390
+    },
+    {
+      "epoch": 6.89,
+      "grad_norm": 0.8100888133049011,
+      "learning_rate": 0.0015541871921182266,
+      "loss": 0.4198,
+      "step": 1400
+    },
+    {
+      "epoch": 6.94,
+      "grad_norm": 1.2440354824066162,
+      "learning_rate": 0.0015295566502463054,
+      "loss": 0.4113,
+      "step": 1410
+    },
+    {
+      "epoch": 6.99,
+      "grad_norm": 0.6661180257797241,
+      "learning_rate": 0.0015049261083743843,
+      "loss": 0.4343,
+      "step": 1420
+    },
+    {
+      "epoch": 7.0,
+      "eval_accuracy": 0.9832775919732442,
+      "eval_f1": 0.9834794316662396,
+      "eval_loss": 0.07752905040979385,
+      "eval_precision": 0.985796063365073,
+      "eval_recall": 0.981752686688599,
+      "eval_runtime": 12.5648,
+      "eval_samples_per_second": 190.373,
+      "eval_steps_per_second": 11.938,
+      "step": 1422
+    },
+    {
+      "epoch": 7.04,
+      "grad_norm": 0.591243326663971,
+      "learning_rate": 0.0014802955665024632,
+      "loss": 0.3291,
+      "step": 1430
+    },
+    {
+      "epoch": 7.08,
+      "grad_norm": 0.8764331936836243,
+      "learning_rate": 0.001455665024630542,
+      "loss": 0.3704,
+      "step": 1440
+    },
+    {
+      "epoch": 7.13,
+      "grad_norm": 1.115868330001831,
+      "learning_rate": 0.0014310344827586207,
+      "loss": 0.3625,
+      "step": 1450
+    },
+    {
+      "epoch": 7.18,
+      "grad_norm": 1.1736584901809692,
+      "learning_rate": 0.0014064039408866996,
+      "loss": 0.3571,
+      "step": 1460
+    },
+    {
+      "epoch": 7.23,
+      "grad_norm": 0.9778345227241516,
+      "learning_rate": 0.0013817733990147782,
+      "loss": 0.3594,
+      "step": 1470
+    },
+    {
+      "epoch": 7.28,
+      "grad_norm": 1.1396944522857666,
+      "learning_rate": 0.0013571428571428571,
+      "loss": 0.3615,
+      "step": 1480
+    },
+    {
+      "epoch": 7.33,
+      "grad_norm": 1.2598211765289307,
+      "learning_rate": 0.001332512315270936,
+      "loss": 0.3802,
+      "step": 1490
+    },
+    {
+      "epoch": 7.38,
+      "grad_norm": 1.1756126880645752,
+      "learning_rate": 0.0013078817733990149,
+      "loss": 0.4429,
+      "step": 1500
+    },
+    {
+      "epoch": 7.43,
+      "grad_norm": 0.9109674096107483,
+      "learning_rate": 0.0012832512315270935,
+      "loss": 0.3578,
+      "step": 1510
+    },
+    {
+      "epoch": 7.48,
+      "grad_norm": 0.7428516745567322,
+      "learning_rate": 0.0012586206896551724,
+      "loss": 0.3705,
+      "step": 1520
+    },
+    {
+      "epoch": 7.53,
+      "grad_norm": 1.3957030773162842,
+      "learning_rate": 0.0012339901477832513,
+      "loss": 0.3769,
+      "step": 1530
+    },
+    {
+      "epoch": 7.58,
+      "grad_norm": 1.0507686138153076,
+      "learning_rate": 0.00120935960591133,
+      "loss": 0.3525,
+      "step": 1540
+    },
+    {
+      "epoch": 7.63,
+      "grad_norm": 0.8914185762405396,
+      "learning_rate": 0.0011847290640394088,
+      "loss": 0.4804,
+      "step": 1550
+    },
+    {
+      "epoch": 7.68,
+      "grad_norm": 0.8193994760513306,
+      "learning_rate": 0.0011600985221674877,
+      "loss": 0.298,
+      "step": 1560
+    },
+    {
+      "epoch": 7.72,
+      "grad_norm": 0.9263984560966492,
+      "learning_rate": 0.0011354679802955665,
+      "loss": 0.3142,
+      "step": 1570
+    },
+    {
+      "epoch": 7.77,
+      "grad_norm": 1.8249924182891846,
+      "learning_rate": 0.0011108374384236454,
+      "loss": 0.4135,
+      "step": 1580
+    },
+    {
+      "epoch": 7.82,
+      "grad_norm": 0.659723162651062,
+      "learning_rate": 0.001086206896551724,
+      "loss": 0.3844,
+      "step": 1590
+    },
+    {
+      "epoch": 7.87,
+      "grad_norm": 0.7200958132743835,
+      "learning_rate": 0.001061576354679803,
+      "loss": 0.3627,
+      "step": 1600
+    },
+    {
+      "epoch": 7.92,
+      "grad_norm": 1.055242657661438,
+      "learning_rate": 0.0010369458128078818,
+      "loss": 0.3522,
+      "step": 1610
+    },
+    {
+      "epoch": 7.97,
+      "grad_norm": 1.0147466659545898,
+      "learning_rate": 0.0010123152709359607,
+      "loss": 0.3942,
+      "step": 1620
+    },
+    {
+      "epoch": 8.0,
+      "eval_accuracy": 0.9832775919732442,
+      "eval_f1": 0.9816549969154011,
+      "eval_loss": 0.07823298126459122,
+      "eval_precision": 0.9812899149698605,
+      "eval_recall": 0.9827399465830431,
+      "eval_runtime": 12.6409,
+      "eval_samples_per_second": 189.227,
+      "eval_steps_per_second": 11.866,
+      "step": 1626
+    },
+    {
+      "epoch": 8.02,
+      "grad_norm": 1.2122970819473267,
+      "learning_rate": 0.0009876847290640393,
+      "loss": 0.2658,
+      "step": 1630
+    },
+    {
+      "epoch": 8.07,
+      "grad_norm": 0.6683902740478516,
+      "learning_rate": 0.0009630541871921182,
+      "loss": 0.3499,
+      "step": 1640
+    },
+    {
+      "epoch": 8.12,
+      "grad_norm": 1.0198993682861328,
+      "learning_rate": 0.0009384236453201971,
+      "loss": 0.4031,
+      "step": 1650
+    },
+    {
+      "epoch": 8.17,
+      "grad_norm": 1.2388522624969482,
+      "learning_rate": 0.0009137931034482759,
+      "loss": 0.3164,
+      "step": 1660
+    },
+    {
+      "epoch": 8.22,
+      "grad_norm": 1.2226431369781494,
+      "learning_rate": 0.0008891625615763547,
+      "loss": 0.299,
+      "step": 1670
+    },
+    {
+      "epoch": 8.27,
+      "grad_norm": 1.0016721487045288,
+      "learning_rate": 0.0008645320197044335,
+      "loss": 0.3315,
+      "step": 1680
+    },
+    {
+      "epoch": 8.31,
+      "grad_norm": 1.0766950845718384,
+      "learning_rate": 0.0008399014778325123,
+      "loss": 0.286,
+      "step": 1690
+    },
+    {
+      "epoch": 8.36,
+      "grad_norm": 1.8925853967666626,
+      "learning_rate": 0.0008152709359605911,
+      "loss": 0.3745,
+      "step": 1700
+    },
+    {
+      "epoch": 8.41,
+      "grad_norm": 0.8409897685050964,
+      "learning_rate": 0.00079064039408867,
+      "loss": 0.3276,
+      "step": 1710
+    },
+    {
+      "epoch": 8.46,
+      "grad_norm": 1.1315199136734009,
+      "learning_rate": 0.0007660098522167489,
+      "loss": 0.3275,
+      "step": 1720
+    },
+    {
+      "epoch": 8.51,
+      "grad_norm": 1.05160391330719,
+      "learning_rate": 0.0007413793103448275,
+      "loss": 0.3072,
+      "step": 1730
+    },
+    {
+      "epoch": 8.56,
+      "grad_norm": 1.0058565139770508,
+      "learning_rate": 0.0007167487684729064,
+      "loss": 0.3413,
+      "step": 1740
+    },
+    {
+      "epoch": 8.61,
+      "grad_norm": 0.9650315046310425,
+      "learning_rate": 0.0006921182266009853,
+      "loss": 0.365,
+      "step": 1750
+    },
+    {
+      "epoch": 8.66,
+      "grad_norm": 0.6396649479866028,
+      "learning_rate": 0.0006674876847290641,
+      "loss": 0.3271,
+      "step": 1760
+    },
+    {
+      "epoch": 8.71,
+      "grad_norm": 0.7196962833404541,
+      "learning_rate": 0.0006428571428571428,
+      "loss": 0.3683,
+      "step": 1770
+    },
+    {
+      "epoch": 8.76,
+      "grad_norm": 0.8004360198974609,
+      "learning_rate": 0.0006182266009852217,
+      "loss": 0.3687,
+      "step": 1780
+    },
+    {
+      "epoch": 8.81,
+      "grad_norm": 0.9620378613471985,
+      "learning_rate": 0.0005935960591133005,
+      "loss": 0.3298,
+      "step": 1790
+    },
+    {
+      "epoch": 8.86,
+      "grad_norm": 0.7279284596443176,
+      "learning_rate": 0.0005689655172413793,
+      "loss": 0.3109,
+      "step": 1800
+    },
+    {
+      "epoch": 8.91,
+      "grad_norm": 1.2889859676361084,
+      "learning_rate": 0.0005443349753694581,
+      "loss": 0.4205,
+      "step": 1810
+    },
+    {
+      "epoch": 8.95,
+      "grad_norm": 0.8951707482337952,
+      "learning_rate": 0.000519704433497537,
+      "loss": 0.2971,
+      "step": 1820
+    },
+    {
+      "epoch": 9.0,
+      "eval_accuracy": 0.9862040133779264,
+      "eval_f1": 0.9872927641297526,
+      "eval_loss": 0.08386523276567459,
+      "eval_precision": 0.9884096313888006,
+      "eval_recall": 0.9865651085702777,
+      "eval_runtime": 12.5898,
+      "eval_samples_per_second": 189.995,
+      "eval_steps_per_second": 11.914,
+      "step": 1829
+    },
+    {
+      "epoch": 9.0,
+      "grad_norm": 0.6368119120597839,
+      "learning_rate": 0.0004950738916256157,
+      "loss": 0.3692,
+      "step": 1830
+    },
+    {
+      "epoch": 9.05,
+      "grad_norm": 0.5298008322715759,
+      "learning_rate": 0.0004704433497536946,
+      "loss": 0.3112,
+      "step": 1840
+    },
+    {
+      "epoch": 9.1,
+      "grad_norm": 0.5324183702468872,
+      "learning_rate": 0.0004458128078817734,
+      "loss": 0.3389,
+      "step": 1850
+    },
+    {
+      "epoch": 9.15,
+      "grad_norm": 0.6283653974533081,
+      "learning_rate": 0.0004211822660098522,
+      "loss": 0.3328,
+      "step": 1860
+    },
+    {
+      "epoch": 9.2,
+      "grad_norm": 0.9130664467811584,
+      "learning_rate": 0.0003965517241379311,
+      "loss": 0.2774,
+      "step": 1870
+    },
+    {
+      "epoch": 9.25,
+      "grad_norm": 0.569354772567749,
+      "learning_rate": 0.00037192118226600984,
+      "loss": 0.3055,
+      "step": 1880
+    },
+    {
+      "epoch": 9.3,
+      "grad_norm": 0.91834557056427,
+      "learning_rate": 0.0003472906403940887,
+      "loss": 0.3108,
+      "step": 1890
+    },
+    {
+      "epoch": 9.35,
+      "grad_norm": 1.2413830757141113,
+      "learning_rate": 0.0003226600985221675,
+      "loss": 0.2896,
+      "step": 1900
+    },
+    {
+      "epoch": 9.4,
+      "grad_norm": 0.7163951396942139,
+      "learning_rate": 0.0002980295566502463,
+      "loss": 0.2883,
+      "step": 1910
+    },
+    {
+      "epoch": 9.45,
+      "grad_norm": 1.121543288230896,
+      "learning_rate": 0.0002733990147783251,
+      "loss": 0.3061,
+      "step": 1920
+    },
+    {
+      "epoch": 9.5,
+      "grad_norm": 0.8933872580528259,
+      "learning_rate": 0.00024876847290640394,
+      "loss": 0.3087,
+      "step": 1930
+    },
+    {
+      "epoch": 9.54,
+      "grad_norm": 0.7040281295776367,
+      "learning_rate": 0.00022413793103448276,
+      "loss": 0.2723,
+      "step": 1940
+    },
+    {
+      "epoch": 9.59,
+      "grad_norm": 0.3641883432865143,
+      "learning_rate": 0.00019950738916256158,
+      "loss": 0.2397,
+      "step": 1950
+    },
+    {
+      "epoch": 9.64,
+      "grad_norm": 1.092432975769043,
+      "learning_rate": 0.0001748768472906404,
+      "loss": 0.2763,
+      "step": 1960
+    },
+    {
+      "epoch": 9.69,
+      "grad_norm": 0.6717754602432251,
+      "learning_rate": 0.00015024630541871922,
+      "loss": 0.2921,
+      "step": 1970
+    },
+    {
+      "epoch": 9.74,
+      "grad_norm": 0.8007289171218872,
+      "learning_rate": 0.00012561576354679804,
+      "loss": 0.3365,
+      "step": 1980
+    },
+    {
+      "epoch": 9.79,
+      "grad_norm": 0.9652357697486877,
+      "learning_rate": 0.00010098522167487686,
+      "loss": 0.2998,
+      "step": 1990
+    },
+    {
+      "epoch": 9.84,
+      "grad_norm": 0.6942909955978394,
+      "learning_rate": 7.635467980295568e-05,
+      "loss": 0.2917,
+      "step": 2000
+    },
+    {
+      "epoch": 9.89,
+      "grad_norm": 0.5785544514656067,
+      "learning_rate": 5.172413793103448e-05,
+      "loss": 0.2662,
+      "step": 2010
+    },
+    {
+      "epoch": 9.94,
+      "grad_norm": 0.7549653649330139,
+      "learning_rate": 2.70935960591133e-05,
+      "loss": 0.2989,
+      "step": 2020
+    },
+    {
+      "epoch": 9.99,
+      "grad_norm": 1.3010107278823853,
+      "learning_rate": 2.463054187192118e-06,
+      "loss": 0.3242,
+      "step": 2030
+    },
+    {
+      "epoch": 9.99,
+      "eval_accuracy": 0.9870401337792643,
+      "eval_f1": 0.9868360981525407,
+      "eval_loss": 0.07449871301651001,
+      "eval_precision": 0.9876698762890729,
+      "eval_recall": 0.9863397416476135,
+      "eval_runtime": 12.5861,
+      "eval_samples_per_second": 190.05,
+      "eval_steps_per_second": 11.918,
+      "step": 2030
+    },
+    {
+      "epoch": 9.99,
+      "step": 2030,
+      "total_flos": 1.0133154899356189e+19,
+      "train_loss": 0.5248493043072705,
+      "train_runtime": 1518.4893,
+      "train_samples_per_second": 85.611,
+      "train_steps_per_second": 1.337
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 2030,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "total_flos": 1.0133154899356189e+19,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}