Mariofm02 committed on
Commit b397482
1 Parent(s): a587258

Model save

README.md CHANGED
@@ -17,8 +17,8 @@ should probably proofread and complete it, then remove this comment. -->
 
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21K](https://huggingface.co/google/vit-base-patch16-224-in21K) on an unknown dataset.
  It achieves the following results on the evaluation set:
- - Loss: 0.9121
- - Accuracy: 0.7710
+ - Loss: 0.5081
+ - Accuracy: 0.8696
 
  ## Model description
 
@@ -43,29 +43,41 @@ The following hyperparameters were used during training:
  - seed: 42
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: linear
- - num_epochs: 4
+ - num_epochs: 7
  - mixed_precision_training: Native AMP
 
  ### Training results
 
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
- | 3.1307 | 0.24 | 100 | 3.0180 | 0.2093 |
- | 2.7372 | 0.48 | 200 | 2.5301 | 0.2762 |
- | 2.4969 | 0.73 | 300 | 2.1760 | 0.3439 |
- | 2.1973 | 0.97 | 400 | 2.0103 | 0.3756 |
- | 1.8847 | 1.21 | 500 | 1.8402 | 0.4108 |
- | 1.746 | 1.45 | 600 | 1.7051 | 0.4803 |
- | 1.8698 | 1.69 | 700 | 1.5985 | 0.4889 |
- | 1.7261 | 1.94 | 800 | 1.4312 | 0.5840 |
- | 1.7385 | 2.18 | 900 | 1.3585 | 0.6286 |
- | 1.5873 | 2.42 | 1000 | 1.2374 | 0.6758 |
- | 1.4775 | 2.66 | 1100 | 1.1352 | 0.7024 |
- | 1.2697 | 2.91 | 1200 | 1.1044 | 0.7093 |
- | 1.2137 | 3.15 | 1300 | 1.0006 | 0.7616 |
- | 1.423 | 3.39 | 1400 | 0.9589 | 0.7744 |
- | 1.0098 | 3.63 | 1500 | 0.9360 | 0.7684 |
- | 1.1325 | 3.87 | 1600 | 0.9121 | 0.7710 |
+ | 1.3563 | 0.24 | 100 | 1.1495 | 0.6750 |
+ | 1.3393 | 0.48 | 200 | 1.0388 | 0.7204 |
+ | 1.2033 | 0.73 | 300 | 0.9324 | 0.7547 |
+ | 0.9672 | 0.97 | 400 | 0.8558 | 0.7659 |
+ | 0.8674 | 1.21 | 500 | 0.8456 | 0.7616 |
+ | 0.8277 | 1.45 | 600 | 0.7563 | 0.7959 |
+ | 0.8703 | 1.69 | 700 | 0.8465 | 0.7539 |
+ | 0.893 | 1.94 | 800 | 0.6881 | 0.8002 |
+ | 0.9454 | 2.18 | 900 | 0.7211 | 0.8027 |
+ | 0.8109 | 2.42 | 1000 | 0.6369 | 0.8285 |
+ | 0.8762 | 2.66 | 1100 | 0.6336 | 0.8396 |
+ | 0.8034 | 2.91 | 1200 | 0.6580 | 0.8165 |
+ | 0.5833 | 3.15 | 1300 | 0.5828 | 0.8439 |
+ | 0.8811 | 3.39 | 1400 | 0.6564 | 0.8259 |
+ | 0.5639 | 3.63 | 1500 | 0.5737 | 0.8439 |
+ | 0.639 | 3.87 | 1600 | 0.5609 | 0.8379 |
+ | 0.6455 | 4.12 | 1700 | 0.5820 | 0.8370 |
+ | 0.5402 | 4.36 | 1800 | 0.5797 | 0.8345 |
+ | 0.5311 | 4.6 | 1900 | 0.5511 | 0.8456 |
+ | 0.5734 | 4.84 | 2000 | 0.5444 | 0.8508 |
+ | 0.5206 | 5.08 | 2100 | 0.5326 | 0.8636 |
+ | 0.6272 | 5.33 | 2200 | 0.5478 | 0.8525 |
+ | 0.5124 | 5.57 | 2300 | 0.5296 | 0.8688 |
+ | 0.5659 | 5.81 | 2400 | 0.5181 | 0.8705 |
+ | 0.4212 | 6.05 | 2500 | 0.5200 | 0.8611 |
+ | 0.4338 | 6.3 | 2600 | 0.5135 | 0.8731 |
+ | 0.3407 | 6.54 | 2700 | 0.5147 | 0.8722 |
+ | 0.4043 | 6.78 | 2800 | 0.5081 | 0.8696 |
 
 
  ### Framework versions
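The hyperparameter bullets in the card above map almost one-to-one onto Hugging Face `TrainingArguments`. Below is a minimal sketch of the configuration implied by this card; the learning rate, batch sizes, eval/save cadence, and best-model tracking are not listed in the README excerpt and are inferred from the `trainer_state.json` added in this same commit, so treat them as assumptions about the original run rather than the author's exact command.

```python
from transformers import TrainingArguments

# Sketch of the training setup described in the model card (assumptions flagged inline).
training_args = TrainingArguments(
    output_dir="finetuned-cards-blackjack",  # from best_model_checkpoint in trainer_state.json
    seed=42,                                 # seed: 42
    learning_rate=2e-4,                      # inferred from the logged linear lr schedule
    per_device_train_batch_size=16,          # "train_batch_size": 16 in trainer_state.json
    per_device_eval_batch_size=8,            # inferred from eval samples/steps per second
    num_train_epochs=7,                      # num_epochs: 7 (new value in this diff)
    lr_scheduler_type="linear",              # lr_scheduler_type: linear
    adam_beta1=0.9,                          # Adam betas=(0.9, 0.999)
    adam_beta2=0.999,
    adam_epsilon=1e-8,                       # epsilon=1e-08
    fp16=True,                               # mixed_precision_training: Native AMP
    evaluation_strategy="steps",
    eval_steps=100,                          # "eval_steps": 100
    save_steps=100,                          # "save_steps": 100
    logging_steps=10,                        # "logging_steps": 10
    load_best_model_at_end=True,             # implied by best_metric tracking; assumption
)
```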
all_results.json ADDED
@@ -0,0 +1,13 @@
+ {
+     "epoch": 4.0,
+     "eval_accuracy": 0.7710120068610634,
+     "eval_loss": 0.912144124507904,
+     "eval_runtime": 6.7842,
+     "eval_samples_per_second": 171.871,
+     "eval_steps_per_second": 21.521,
+     "total_flos": 2.047635634195759e+18,
+     "train_loss": 1.8191430680543978,
+     "train_runtime": 594.0822,
+     "train_samples_per_second": 44.458,
+     "train_steps_per_second": 2.781
+ }
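As a quick sanity check, the runtime and throughput fields in `all_results.json` are mutually consistent; the sketch below just multiplies them out. The implied eval-set size of roughly 1,166 images and train-split size of roughly 6,600 images are inferences from these numbers, not values stated anywhere in the repo.

```python
# Cross-checking the throughput fields reported in all_results.json (4-epoch run).
eval_runtime = 6.7842                  # seconds
eval_samples_per_second = 171.871
train_runtime = 594.0822               # seconds
train_samples_per_second = 44.458
train_steps_per_second = 2.781
epochs = 4.0

eval_samples = eval_runtime * eval_samples_per_second          # ~1166 images in the eval split
train_steps = train_runtime * train_steps_per_second           # ~1652 steps, matches trainer_state.json
train_samples_seen = train_runtime * train_samples_per_second  # ~26,400 images over 4 epochs
train_split_size = train_samples_seen / epochs                 # ~6,600 images per epoch

print(round(eval_samples), round(train_steps), round(train_split_size))
```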
eval_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "epoch": 4.0,
+     "eval_accuracy": 0.7710120068610634,
+     "eval_loss": 0.912144124507904,
+     "eval_runtime": 6.7842,
+     "eval_samples_per_second": 171.871,
+     "eval_steps_per_second": 21.521
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:808ea45f936d03a7fc517107e6638c9c74952f79ce7f1f2082bf4a417f12605c
+ oid sha256:eb9b70ee282c426c2a645c30197368de15a5908c06f2c905de4194ed11a5c4ed
  size 343377784
runs/Mar29_19-44-52_44990517b672/events.out.tfevents.1711742478.44990517b672.3784.2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3636374ba0e100e35ba985b7def150b74b2d498b9b67d3dfddd2dc95add95a23
+ size 411
runs/Mar29_20-03-14_44990517b672/events.out.tfevents.1711742611.44990517b672.3784.3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c50db23ca1c40405186375018a2238b4a607bb59484fe5484ecc67db25f05dc
+ size 77831
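The two files added under `runs/` are ordinary TensorBoard event logs for this training run. A minimal sketch of reading them back with the `tensorboard` Python API follows; the scalar tag name used below is an assumption, so list `acc.Tags()` first to see what the Trainer actually logged.

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Load the larger event file added in this commit (the ~77 KB one holds the scalar curves).
acc = EventAccumulator("runs/Mar29_20-03-14_44990517b672")
acc.Reload()

print(acc.Tags()["scalars"])                 # e.g. ["train/loss", "eval/accuracy", ...]; names vary
for event in acc.Scalars("eval/accuracy"):   # assumed tag; pick one printed above
    print(event.step, event.value)
```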
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "epoch": 4.0,
+     "total_flos": 2.047635634195759e+18,
+     "train_loss": 1.8191430680543978,
+     "train_runtime": 594.0822,
+     "train_samples_per_second": 44.458,
+     "train_steps_per_second": 2.781
+ }
trainer_state.json ADDED
@@ -0,0 +1,1329 @@
1
+ {
2
+ "best_metric": 0.912144124507904,
3
+ "best_model_checkpoint": "finetuned-cards-blackjack/checkpoint-1600",
4
+ "epoch": 4.0,
5
+ "eval_steps": 100,
6
+ "global_step": 1652,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02,
13
+ "grad_norm": 1.466597557067871,
14
+ "learning_rate": 0.0001987893462469734,
15
+ "loss": 3.9543,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.05,
20
+ "grad_norm": 1.9476360082626343,
21
+ "learning_rate": 0.00019757869249394675,
22
+ "loss": 3.8868,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.07,
27
+ "grad_norm": 1.6487232446670532,
28
+ "learning_rate": 0.0001963680387409201,
29
+ "loss": 3.8185,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.1,
34
+ "grad_norm": 1.8101606369018555,
35
+ "learning_rate": 0.00019515738498789345,
36
+ "loss": 3.6559,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.12,
41
+ "grad_norm": 1.7900973558425903,
42
+ "learning_rate": 0.00019394673123486684,
43
+ "loss": 3.559,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.15,
48
+ "grad_norm": 1.7922214269638062,
49
+ "learning_rate": 0.0001927360774818402,
50
+ "loss": 3.4135,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.17,
55
+ "grad_norm": 1.9818700551986694,
56
+ "learning_rate": 0.00019152542372881357,
57
+ "loss": 3.3906,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.19,
62
+ "grad_norm": 1.9315565824508667,
63
+ "learning_rate": 0.00019031476997578695,
64
+ "loss": 3.3191,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.22,
69
+ "grad_norm": 1.9850099086761475,
70
+ "learning_rate": 0.0001891041162227603,
71
+ "loss": 3.2122,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.24,
76
+ "grad_norm": 1.9584887027740479,
77
+ "learning_rate": 0.00018789346246973366,
78
+ "loss": 3.1307,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.24,
83
+ "eval_accuracy": 0.20926243567753003,
84
+ "eval_loss": 3.017998456954956,
85
+ "eval_runtime": 6.19,
86
+ "eval_samples_per_second": 188.367,
87
+ "eval_steps_per_second": 23.586,
88
+ "step": 100
89
+ },
90
+ {
91
+ "epoch": 0.27,
92
+ "grad_norm": 2.39744234085083,
93
+ "learning_rate": 0.00018668280871670702,
94
+ "loss": 3.0667,
95
+ "step": 110
96
+ },
97
+ {
98
+ "epoch": 0.29,
99
+ "grad_norm": 1.891518473625183,
100
+ "learning_rate": 0.0001854721549636804,
101
+ "loss": 2.984,
102
+ "step": 120
103
+ },
104
+ {
105
+ "epoch": 0.31,
106
+ "grad_norm": 1.9065883159637451,
107
+ "learning_rate": 0.00018426150121065375,
108
+ "loss": 2.8457,
109
+ "step": 130
110
+ },
111
+ {
112
+ "epoch": 0.34,
113
+ "grad_norm": 2.126429796218872,
114
+ "learning_rate": 0.00018305084745762714,
115
+ "loss": 2.9638,
116
+ "step": 140
117
+ },
118
+ {
119
+ "epoch": 0.36,
120
+ "grad_norm": 1.9387011528015137,
121
+ "learning_rate": 0.00018184019370460052,
122
+ "loss": 2.7348,
123
+ "step": 150
124
+ },
125
+ {
126
+ "epoch": 0.39,
127
+ "grad_norm": 1.923202633857727,
128
+ "learning_rate": 0.00018062953995157384,
129
+ "loss": 2.8489,
130
+ "step": 160
131
+ },
132
+ {
133
+ "epoch": 0.41,
134
+ "grad_norm": 2.581446409225464,
135
+ "learning_rate": 0.00017941888619854723,
136
+ "loss": 2.7744,
137
+ "step": 170
138
+ },
139
+ {
140
+ "epoch": 0.44,
141
+ "grad_norm": 1.6987770795822144,
142
+ "learning_rate": 0.00017820823244552058,
143
+ "loss": 2.6428,
144
+ "step": 180
145
+ },
146
+ {
147
+ "epoch": 0.46,
148
+ "grad_norm": 1.9667104482650757,
149
+ "learning_rate": 0.00017699757869249396,
150
+ "loss": 2.6952,
151
+ "step": 190
152
+ },
153
+ {
154
+ "epoch": 0.48,
155
+ "grad_norm": 3.4282023906707764,
156
+ "learning_rate": 0.00017578692493946732,
157
+ "loss": 2.7372,
158
+ "step": 200
159
+ },
160
+ {
161
+ "epoch": 0.48,
162
+ "eval_accuracy": 0.27615780445969124,
163
+ "eval_loss": 2.530054807662964,
164
+ "eval_runtime": 6.2338,
165
+ "eval_samples_per_second": 187.045,
166
+ "eval_steps_per_second": 23.421,
167
+ "step": 200
168
+ },
169
+ {
170
+ "epoch": 0.51,
171
+ "grad_norm": 1.9124583005905151,
172
+ "learning_rate": 0.0001745762711864407,
173
+ "loss": 2.6423,
174
+ "step": 210
175
+ },
176
+ {
177
+ "epoch": 0.53,
178
+ "grad_norm": 2.4269683361053467,
179
+ "learning_rate": 0.00017336561743341405,
180
+ "loss": 2.668,
181
+ "step": 220
182
+ },
183
+ {
184
+ "epoch": 0.56,
185
+ "grad_norm": 1.9838333129882812,
186
+ "learning_rate": 0.0001721549636803874,
187
+ "loss": 2.5786,
188
+ "step": 230
189
+ },
190
+ {
191
+ "epoch": 0.58,
192
+ "grad_norm": 3.200087070465088,
193
+ "learning_rate": 0.0001709443099273608,
194
+ "loss": 2.5938,
195
+ "step": 240
196
+ },
197
+ {
198
+ "epoch": 0.61,
199
+ "grad_norm": 2.93118953704834,
200
+ "learning_rate": 0.00016973365617433414,
201
+ "loss": 2.4526,
202
+ "step": 250
203
+ },
204
+ {
205
+ "epoch": 0.63,
206
+ "grad_norm": 2.555947780609131,
207
+ "learning_rate": 0.00016852300242130752,
208
+ "loss": 2.41,
209
+ "step": 260
210
+ },
211
+ {
212
+ "epoch": 0.65,
213
+ "grad_norm": 2.9446065425872803,
214
+ "learning_rate": 0.00016731234866828088,
215
+ "loss": 2.4537,
216
+ "step": 270
217
+ },
218
+ {
219
+ "epoch": 0.68,
220
+ "grad_norm": 3.393993377685547,
221
+ "learning_rate": 0.00016610169491525423,
222
+ "loss": 2.4256,
223
+ "step": 280
224
+ },
225
+ {
226
+ "epoch": 0.7,
227
+ "grad_norm": 2.721825122833252,
228
+ "learning_rate": 0.00016489104116222762,
229
+ "loss": 2.4719,
230
+ "step": 290
231
+ },
232
+ {
233
+ "epoch": 0.73,
234
+ "grad_norm": 3.2610368728637695,
235
+ "learning_rate": 0.00016368038740920097,
236
+ "loss": 2.4969,
237
+ "step": 300
238
+ },
239
+ {
240
+ "epoch": 0.73,
241
+ "eval_accuracy": 0.3439108061749571,
242
+ "eval_loss": 2.175961971282959,
243
+ "eval_runtime": 6.2327,
244
+ "eval_samples_per_second": 187.079,
245
+ "eval_steps_per_second": 23.425,
246
+ "step": 300
247
+ },
248
+ {
249
+ "epoch": 0.75,
250
+ "grad_norm": 3.067995309829712,
251
+ "learning_rate": 0.00016246973365617435,
252
+ "loss": 2.4904,
253
+ "step": 310
254
+ },
255
+ {
256
+ "epoch": 0.77,
257
+ "grad_norm": 2.7957141399383545,
258
+ "learning_rate": 0.0001612590799031477,
259
+ "loss": 2.3913,
260
+ "step": 320
261
+ },
262
+ {
263
+ "epoch": 0.8,
264
+ "grad_norm": 2.281586170196533,
265
+ "learning_rate": 0.0001600484261501211,
266
+ "loss": 2.1749,
267
+ "step": 330
268
+ },
269
+ {
270
+ "epoch": 0.82,
271
+ "grad_norm": 2.4833972454071045,
272
+ "learning_rate": 0.00015883777239709444,
273
+ "loss": 2.4058,
274
+ "step": 340
275
+ },
276
+ {
277
+ "epoch": 0.85,
278
+ "grad_norm": 2.5052073001861572,
279
+ "learning_rate": 0.0001576271186440678,
280
+ "loss": 2.3236,
281
+ "step": 350
282
+ },
283
+ {
284
+ "epoch": 0.87,
285
+ "grad_norm": 2.479684352874756,
286
+ "learning_rate": 0.00015641646489104115,
287
+ "loss": 2.373,
288
+ "step": 360
289
+ },
290
+ {
291
+ "epoch": 0.9,
292
+ "grad_norm": 3.6352992057800293,
293
+ "learning_rate": 0.00015520581113801453,
294
+ "loss": 2.3282,
295
+ "step": 370
296
+ },
297
+ {
298
+ "epoch": 0.92,
299
+ "grad_norm": 2.748934030532837,
300
+ "learning_rate": 0.00015399515738498791,
301
+ "loss": 2.2062,
302
+ "step": 380
303
+ },
304
+ {
305
+ "epoch": 0.94,
306
+ "grad_norm": 2.0645978450775146,
307
+ "learning_rate": 0.00015278450363196127,
308
+ "loss": 2.1253,
309
+ "step": 390
310
+ },
311
+ {
312
+ "epoch": 0.97,
313
+ "grad_norm": 2.2856009006500244,
314
+ "learning_rate": 0.00015157384987893465,
315
+ "loss": 2.1973,
316
+ "step": 400
317
+ },
318
+ {
319
+ "epoch": 0.97,
320
+ "eval_accuracy": 0.37564322469982847,
321
+ "eval_loss": 2.0102577209472656,
322
+ "eval_runtime": 5.9741,
323
+ "eval_samples_per_second": 195.175,
324
+ "eval_steps_per_second": 24.439,
325
+ "step": 400
326
+ },
327
+ {
328
+ "epoch": 0.99,
329
+ "grad_norm": 2.866960048675537,
330
+ "learning_rate": 0.00015036319612590798,
331
+ "loss": 2.214,
332
+ "step": 410
333
+ },
334
+ {
335
+ "epoch": 1.02,
336
+ "grad_norm": 3.171844482421875,
337
+ "learning_rate": 0.00014915254237288136,
338
+ "loss": 2.0948,
339
+ "step": 420
340
+ },
341
+ {
342
+ "epoch": 1.04,
343
+ "grad_norm": 3.6916253566741943,
344
+ "learning_rate": 0.00014794188861985471,
345
+ "loss": 2.0649,
346
+ "step": 430
347
+ },
348
+ {
349
+ "epoch": 1.07,
350
+ "grad_norm": 2.3281314373016357,
351
+ "learning_rate": 0.0001467312348668281,
352
+ "loss": 2.0633,
353
+ "step": 440
354
+ },
355
+ {
356
+ "epoch": 1.09,
357
+ "grad_norm": 3.370180368423462,
358
+ "learning_rate": 0.00014552058111380148,
359
+ "loss": 1.9949,
360
+ "step": 450
361
+ },
362
+ {
363
+ "epoch": 1.11,
364
+ "grad_norm": 2.5389626026153564,
365
+ "learning_rate": 0.00014430992736077483,
366
+ "loss": 2.086,
367
+ "step": 460
368
+ },
369
+ {
370
+ "epoch": 1.14,
371
+ "grad_norm": 2.47526216506958,
372
+ "learning_rate": 0.00014309927360774819,
373
+ "loss": 2.0443,
374
+ "step": 470
375
+ },
376
+ {
377
+ "epoch": 1.16,
378
+ "grad_norm": 2.821577548980713,
379
+ "learning_rate": 0.00014188861985472154,
380
+ "loss": 2.0808,
381
+ "step": 480
382
+ },
383
+ {
384
+ "epoch": 1.19,
385
+ "grad_norm": 2.978994369506836,
386
+ "learning_rate": 0.00014067796610169492,
387
+ "loss": 2.1278,
388
+ "step": 490
389
+ },
390
+ {
391
+ "epoch": 1.21,
392
+ "grad_norm": 3.1431379318237305,
393
+ "learning_rate": 0.00013946731234866828,
394
+ "loss": 1.8847,
395
+ "step": 500
396
+ },
397
+ {
398
+ "epoch": 1.21,
399
+ "eval_accuracy": 0.41080617495711835,
400
+ "eval_loss": 1.8402307033538818,
401
+ "eval_runtime": 6.2119,
402
+ "eval_samples_per_second": 187.705,
403
+ "eval_steps_per_second": 23.503,
404
+ "step": 500
405
+ },
406
+ {
407
+ "epoch": 1.23,
408
+ "grad_norm": 3.1350502967834473,
409
+ "learning_rate": 0.00013825665859564166,
410
+ "loss": 2.02,
411
+ "step": 510
412
+ },
413
+ {
414
+ "epoch": 1.26,
415
+ "grad_norm": 2.63952374458313,
416
+ "learning_rate": 0.00013704600484261504,
417
+ "loss": 2.1684,
418
+ "step": 520
419
+ },
420
+ {
421
+ "epoch": 1.28,
422
+ "grad_norm": 2.7914199829101562,
423
+ "learning_rate": 0.00013583535108958837,
424
+ "loss": 1.8532,
425
+ "step": 530
426
+ },
427
+ {
428
+ "epoch": 1.31,
429
+ "grad_norm": 4.124698638916016,
430
+ "learning_rate": 0.00013462469733656175,
431
+ "loss": 1.9593,
432
+ "step": 540
433
+ },
434
+ {
435
+ "epoch": 1.33,
436
+ "grad_norm": 3.0953214168548584,
437
+ "learning_rate": 0.0001334140435835351,
438
+ "loss": 2.0143,
439
+ "step": 550
440
+ },
441
+ {
442
+ "epoch": 1.36,
443
+ "grad_norm": 3.626241683959961,
444
+ "learning_rate": 0.00013220338983050849,
445
+ "loss": 2.0349,
446
+ "step": 560
447
+ },
448
+ {
449
+ "epoch": 1.38,
450
+ "grad_norm": 3.22306752204895,
451
+ "learning_rate": 0.00013099273607748184,
452
+ "loss": 1.9283,
453
+ "step": 570
454
+ },
455
+ {
456
+ "epoch": 1.4,
457
+ "grad_norm": 2.6860299110412598,
458
+ "learning_rate": 0.00012978208232445522,
459
+ "loss": 1.9022,
460
+ "step": 580
461
+ },
462
+ {
463
+ "epoch": 1.43,
464
+ "grad_norm": 3.2099533081054688,
465
+ "learning_rate": 0.00012857142857142858,
466
+ "loss": 1.9102,
467
+ "step": 590
468
+ },
469
+ {
470
+ "epoch": 1.45,
471
+ "grad_norm": 2.5889129638671875,
472
+ "learning_rate": 0.00012736077481840193,
473
+ "loss": 1.746,
474
+ "step": 600
475
+ },
476
+ {
477
+ "epoch": 1.45,
478
+ "eval_accuracy": 0.48027444253859347,
479
+ "eval_loss": 1.7051318883895874,
480
+ "eval_runtime": 6.0914,
481
+ "eval_samples_per_second": 191.418,
482
+ "eval_steps_per_second": 23.968,
483
+ "step": 600
484
+ },
485
+ {
486
+ "epoch": 1.48,
487
+ "grad_norm": 2.6496353149414062,
488
+ "learning_rate": 0.0001261501210653753,
489
+ "loss": 1.8087,
490
+ "step": 610
491
+ },
492
+ {
493
+ "epoch": 1.5,
494
+ "grad_norm": 2.2695322036743164,
495
+ "learning_rate": 0.00012493946731234867,
496
+ "loss": 1.9172,
497
+ "step": 620
498
+ },
499
+ {
500
+ "epoch": 1.53,
501
+ "grad_norm": 3.144073724746704,
502
+ "learning_rate": 0.00012372881355932205,
503
+ "loss": 1.8943,
504
+ "step": 630
505
+ },
506
+ {
507
+ "epoch": 1.55,
508
+ "grad_norm": 2.9001333713531494,
509
+ "learning_rate": 0.0001225181598062954,
510
+ "loss": 1.9463,
511
+ "step": 640
512
+ },
513
+ {
514
+ "epoch": 1.57,
515
+ "grad_norm": 2.5096278190612793,
516
+ "learning_rate": 0.00012130750605326877,
517
+ "loss": 1.8045,
518
+ "step": 650
519
+ },
520
+ {
521
+ "epoch": 1.6,
522
+ "grad_norm": 2.2238059043884277,
523
+ "learning_rate": 0.00012009685230024215,
524
+ "loss": 1.9322,
525
+ "step": 660
526
+ },
527
+ {
528
+ "epoch": 1.62,
529
+ "grad_norm": 2.7545368671417236,
530
+ "learning_rate": 0.00011888619854721549,
531
+ "loss": 1.7305,
532
+ "step": 670
533
+ },
534
+ {
535
+ "epoch": 1.65,
536
+ "grad_norm": 2.8309366703033447,
537
+ "learning_rate": 0.00011767554479418887,
538
+ "loss": 1.8587,
539
+ "step": 680
540
+ },
541
+ {
542
+ "epoch": 1.67,
543
+ "grad_norm": 5.093832492828369,
544
+ "learning_rate": 0.00011646489104116223,
545
+ "loss": 1.8362,
546
+ "step": 690
547
+ },
548
+ {
549
+ "epoch": 1.69,
550
+ "grad_norm": 2.4374847412109375,
551
+ "learning_rate": 0.0001152542372881356,
552
+ "loss": 1.8698,
553
+ "step": 700
554
+ },
555
+ {
556
+ "epoch": 1.69,
557
+ "eval_accuracy": 0.4888507718696398,
558
+ "eval_loss": 1.5985045433044434,
559
+ "eval_runtime": 6.4332,
560
+ "eval_samples_per_second": 181.249,
561
+ "eval_steps_per_second": 22.695,
562
+ "step": 700
563
+ },
564
+ {
565
+ "epoch": 1.72,
566
+ "grad_norm": 2.8519837856292725,
567
+ "learning_rate": 0.00011404358353510895,
568
+ "loss": 1.8736,
569
+ "step": 710
570
+ },
571
+ {
572
+ "epoch": 1.74,
573
+ "grad_norm": 2.8379719257354736,
574
+ "learning_rate": 0.00011283292978208233,
575
+ "loss": 1.6395,
576
+ "step": 720
577
+ },
578
+ {
579
+ "epoch": 1.77,
580
+ "grad_norm": 3.884648323059082,
581
+ "learning_rate": 0.00011174334140435836,
582
+ "loss": 1.7938,
583
+ "step": 730
584
+ },
585
+ {
586
+ "epoch": 1.79,
587
+ "grad_norm": 3.2592883110046387,
588
+ "learning_rate": 0.00011053268765133173,
589
+ "loss": 1.6813,
590
+ "step": 740
591
+ },
592
+ {
593
+ "epoch": 1.82,
594
+ "grad_norm": 5.118261337280273,
595
+ "learning_rate": 0.00010932203389830508,
596
+ "loss": 1.9414,
597
+ "step": 750
598
+ },
599
+ {
600
+ "epoch": 1.84,
601
+ "grad_norm": 2.822026491165161,
602
+ "learning_rate": 0.00010811138014527846,
603
+ "loss": 1.7598,
604
+ "step": 760
605
+ },
606
+ {
607
+ "epoch": 1.86,
608
+ "grad_norm": 2.8540070056915283,
609
+ "learning_rate": 0.00010690072639225182,
610
+ "loss": 1.7024,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 1.89,
615
+ "grad_norm": 4.354470252990723,
616
+ "learning_rate": 0.00010569007263922519,
617
+ "loss": 1.8987,
618
+ "step": 780
619
+ },
620
+ {
621
+ "epoch": 1.91,
622
+ "grad_norm": 3.528857707977295,
623
+ "learning_rate": 0.00010447941888619854,
624
+ "loss": 1.7933,
625
+ "step": 790
626
+ },
627
+ {
628
+ "epoch": 1.94,
629
+ "grad_norm": 2.76985764503479,
630
+ "learning_rate": 0.00010326876513317192,
631
+ "loss": 1.7261,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 1.94,
636
+ "eval_accuracy": 0.5840480274442539,
637
+ "eval_loss": 1.4311938285827637,
638
+ "eval_runtime": 6.2955,
639
+ "eval_samples_per_second": 185.213,
640
+ "eval_steps_per_second": 23.191,
641
+ "step": 800
642
+ },
643
+ {
644
+ "epoch": 1.96,
645
+ "grad_norm": 3.15104079246521,
646
+ "learning_rate": 0.00010205811138014529,
647
+ "loss": 1.8079,
648
+ "step": 810
649
+ },
650
+ {
651
+ "epoch": 1.99,
652
+ "grad_norm": 3.0211942195892334,
653
+ "learning_rate": 0.00010084745762711865,
654
+ "loss": 1.611,
655
+ "step": 820
656
+ },
657
+ {
658
+ "epoch": 2.01,
659
+ "grad_norm": 2.527198076248169,
660
+ "learning_rate": 9.963680387409201e-05,
661
+ "loss": 1.7344,
662
+ "step": 830
663
+ },
664
+ {
665
+ "epoch": 2.03,
666
+ "grad_norm": 3.654705762863159,
667
+ "learning_rate": 9.842615012106537e-05,
668
+ "loss": 1.5921,
669
+ "step": 840
670
+ },
671
+ {
672
+ "epoch": 2.06,
673
+ "grad_norm": 2.6901042461395264,
674
+ "learning_rate": 9.721549636803875e-05,
675
+ "loss": 1.5688,
676
+ "step": 850
677
+ },
678
+ {
679
+ "epoch": 2.08,
680
+ "grad_norm": 2.830200672149658,
681
+ "learning_rate": 9.600484261501212e-05,
682
+ "loss": 1.5546,
683
+ "step": 860
684
+ },
685
+ {
686
+ "epoch": 2.11,
687
+ "grad_norm": 3.2287344932556152,
688
+ "learning_rate": 9.479418886198547e-05,
689
+ "loss": 1.5714,
690
+ "step": 870
691
+ },
692
+ {
693
+ "epoch": 2.13,
694
+ "grad_norm": 3.661449432373047,
695
+ "learning_rate": 9.358353510895884e-05,
696
+ "loss": 1.4973,
697
+ "step": 880
698
+ },
699
+ {
700
+ "epoch": 2.15,
701
+ "grad_norm": 6.353243827819824,
702
+ "learning_rate": 9.237288135593221e-05,
703
+ "loss": 1.5294,
704
+ "step": 890
705
+ },
706
+ {
707
+ "epoch": 2.18,
708
+ "grad_norm": 3.703733444213867,
709
+ "learning_rate": 9.116222760290558e-05,
710
+ "loss": 1.7385,
711
+ "step": 900
712
+ },
713
+ {
714
+ "epoch": 2.18,
715
+ "eval_accuracy": 0.6286449399656947,
716
+ "eval_loss": 1.3585376739501953,
717
+ "eval_runtime": 5.9781,
718
+ "eval_samples_per_second": 195.046,
719
+ "eval_steps_per_second": 24.423,
720
+ "step": 900
721
+ },
722
+ {
723
+ "epoch": 2.2,
724
+ "grad_norm": 2.730365514755249,
725
+ "learning_rate": 8.995157384987893e-05,
726
+ "loss": 1.626,
727
+ "step": 910
728
+ },
729
+ {
730
+ "epoch": 2.23,
731
+ "grad_norm": 4.335669040679932,
732
+ "learning_rate": 8.874092009685231e-05,
733
+ "loss": 1.5823,
734
+ "step": 920
735
+ },
736
+ {
737
+ "epoch": 2.25,
738
+ "grad_norm": 2.272915840148926,
739
+ "learning_rate": 8.753026634382567e-05,
740
+ "loss": 1.47,
741
+ "step": 930
742
+ },
743
+ {
744
+ "epoch": 2.28,
745
+ "grad_norm": 3.335453510284424,
746
+ "learning_rate": 8.631961259079904e-05,
747
+ "loss": 1.4733,
748
+ "step": 940
749
+ },
750
+ {
751
+ "epoch": 2.3,
752
+ "grad_norm": 5.18184232711792,
753
+ "learning_rate": 8.51089588377724e-05,
754
+ "loss": 1.3798,
755
+ "step": 950
756
+ },
757
+ {
758
+ "epoch": 2.32,
759
+ "grad_norm": 3.79761004447937,
760
+ "learning_rate": 8.389830508474577e-05,
761
+ "loss": 1.5103,
762
+ "step": 960
763
+ },
764
+ {
765
+ "epoch": 2.35,
766
+ "grad_norm": 2.568056344985962,
767
+ "learning_rate": 8.268765133171913e-05,
768
+ "loss": 1.5016,
769
+ "step": 970
770
+ },
771
+ {
772
+ "epoch": 2.37,
773
+ "grad_norm": 4.231459140777588,
774
+ "learning_rate": 8.14769975786925e-05,
775
+ "loss": 1.4617,
776
+ "step": 980
777
+ },
778
+ {
779
+ "epoch": 2.4,
780
+ "grad_norm": 3.2914044857025146,
781
+ "learning_rate": 8.026634382566586e-05,
782
+ "loss": 1.5527,
783
+ "step": 990
784
+ },
785
+ {
786
+ "epoch": 2.42,
787
+ "grad_norm": 2.967702627182007,
788
+ "learning_rate": 7.905569007263923e-05,
789
+ "loss": 1.5873,
790
+ "step": 1000
791
+ },
792
+ {
793
+ "epoch": 2.42,
794
+ "eval_accuracy": 0.6758147512864494,
795
+ "eval_loss": 1.2374264001846313,
796
+ "eval_runtime": 6.2974,
797
+ "eval_samples_per_second": 185.155,
798
+ "eval_steps_per_second": 23.184,
799
+ "step": 1000
800
+ },
801
+ {
802
+ "epoch": 2.45,
803
+ "grad_norm": 2.7834739685058594,
804
+ "learning_rate": 7.78450363196126e-05,
805
+ "loss": 1.4255,
806
+ "step": 1010
807
+ },
808
+ {
809
+ "epoch": 2.47,
810
+ "grad_norm": 3.380810260772705,
811
+ "learning_rate": 7.663438256658597e-05,
812
+ "loss": 1.4528,
813
+ "step": 1020
814
+ },
815
+ {
816
+ "epoch": 2.49,
817
+ "grad_norm": 3.3973748683929443,
818
+ "learning_rate": 7.542372881355932e-05,
819
+ "loss": 1.5726,
820
+ "step": 1030
821
+ },
822
+ {
823
+ "epoch": 2.52,
824
+ "grad_norm": 2.9069502353668213,
825
+ "learning_rate": 7.421307506053269e-05,
826
+ "loss": 1.2987,
827
+ "step": 1040
828
+ },
829
+ {
830
+ "epoch": 2.54,
831
+ "grad_norm": 2.8832297325134277,
832
+ "learning_rate": 7.300242130750606e-05,
833
+ "loss": 1.437,
834
+ "step": 1050
835
+ },
836
+ {
837
+ "epoch": 2.57,
838
+ "grad_norm": 3.137310743331909,
839
+ "learning_rate": 7.179176755447942e-05,
840
+ "loss": 1.5,
841
+ "step": 1060
842
+ },
843
+ {
844
+ "epoch": 2.59,
845
+ "grad_norm": 3.156430244445801,
846
+ "learning_rate": 7.058111380145279e-05,
847
+ "loss": 1.341,
848
+ "step": 1070
849
+ },
850
+ {
851
+ "epoch": 2.62,
852
+ "grad_norm": 3.470303535461426,
853
+ "learning_rate": 6.937046004842616e-05,
854
+ "loss": 1.3986,
855
+ "step": 1080
856
+ },
857
+ {
858
+ "epoch": 2.64,
859
+ "grad_norm": 3.426010847091675,
860
+ "learning_rate": 6.815980629539952e-05,
861
+ "loss": 1.3874,
862
+ "step": 1090
863
+ },
864
+ {
865
+ "epoch": 2.66,
866
+ "grad_norm": 3.8181042671203613,
867
+ "learning_rate": 6.694915254237288e-05,
868
+ "loss": 1.4775,
869
+ "step": 1100
870
+ },
871
+ {
872
+ "epoch": 2.66,
873
+ "eval_accuracy": 0.7024013722126929,
874
+ "eval_loss": 1.1351556777954102,
875
+ "eval_runtime": 6.2887,
876
+ "eval_samples_per_second": 185.412,
877
+ "eval_steps_per_second": 23.216,
878
+ "step": 1100
879
+ },
880
+ {
881
+ "epoch": 2.69,
882
+ "grad_norm": 3.4228086471557617,
883
+ "learning_rate": 6.573849878934625e-05,
884
+ "loss": 1.4804,
885
+ "step": 1110
886
+ },
887
+ {
888
+ "epoch": 2.71,
889
+ "grad_norm": 4.945833206176758,
890
+ "learning_rate": 6.45278450363196e-05,
891
+ "loss": 1.2617,
892
+ "step": 1120
893
+ },
894
+ {
895
+ "epoch": 2.74,
896
+ "grad_norm": 2.712095022201538,
897
+ "learning_rate": 6.331719128329297e-05,
898
+ "loss": 1.4254,
899
+ "step": 1130
900
+ },
901
+ {
902
+ "epoch": 2.76,
903
+ "grad_norm": 3.2312748432159424,
904
+ "learning_rate": 6.210653753026636e-05,
905
+ "loss": 1.4141,
906
+ "step": 1140
907
+ },
908
+ {
909
+ "epoch": 2.78,
910
+ "grad_norm": 2.4630300998687744,
911
+ "learning_rate": 6.089588377723972e-05,
912
+ "loss": 1.3438,
913
+ "step": 1150
914
+ },
915
+ {
916
+ "epoch": 2.81,
917
+ "grad_norm": 2.9009976387023926,
918
+ "learning_rate": 5.968523002421308e-05,
919
+ "loss": 1.3625,
920
+ "step": 1160
921
+ },
922
+ {
923
+ "epoch": 2.83,
924
+ "grad_norm": 5.364362716674805,
925
+ "learning_rate": 5.8474576271186446e-05,
926
+ "loss": 1.4056,
927
+ "step": 1170
928
+ },
929
+ {
930
+ "epoch": 2.86,
931
+ "grad_norm": 3.0310747623443604,
932
+ "learning_rate": 5.726392251815981e-05,
933
+ "loss": 1.2943,
934
+ "step": 1180
935
+ },
936
+ {
937
+ "epoch": 2.88,
938
+ "grad_norm": 2.7472984790802,
939
+ "learning_rate": 5.605326876513317e-05,
940
+ "loss": 1.4934,
941
+ "step": 1190
942
+ },
943
+ {
944
+ "epoch": 2.91,
945
+ "grad_norm": 2.9528918266296387,
946
+ "learning_rate": 5.484261501210654e-05,
947
+ "loss": 1.2697,
948
+ "step": 1200
949
+ },
950
+ {
951
+ "epoch": 2.91,
952
+ "eval_accuracy": 0.70926243567753,
953
+ "eval_loss": 1.104396104812622,
954
+ "eval_runtime": 6.0071,
955
+ "eval_samples_per_second": 194.105,
956
+ "eval_steps_per_second": 24.305,
957
+ "step": 1200
958
+ },
959
+ {
960
+ "epoch": 2.93,
961
+ "grad_norm": 2.5816805362701416,
962
+ "learning_rate": 5.363196125907991e-05,
963
+ "loss": 1.3362,
964
+ "step": 1210
965
+ },
966
+ {
967
+ "epoch": 2.95,
968
+ "grad_norm": 3.5116188526153564,
969
+ "learning_rate": 5.242130750605327e-05,
970
+ "loss": 1.3128,
971
+ "step": 1220
972
+ },
973
+ {
974
+ "epoch": 2.98,
975
+ "grad_norm": 2.873042583465576,
976
+ "learning_rate": 5.121065375302664e-05,
977
+ "loss": 1.3257,
978
+ "step": 1230
979
+ },
980
+ {
981
+ "epoch": 3.0,
982
+ "grad_norm": 6.232132434844971,
983
+ "learning_rate": 5e-05,
984
+ "loss": 1.256,
985
+ "step": 1240
986
+ },
987
+ {
988
+ "epoch": 3.03,
989
+ "grad_norm": 2.3054957389831543,
990
+ "learning_rate": 4.8789346246973364e-05,
991
+ "loss": 1.1805,
992
+ "step": 1250
993
+ },
994
+ {
995
+ "epoch": 3.05,
996
+ "grad_norm": 3.0687952041625977,
997
+ "learning_rate": 4.757869249394674e-05,
998
+ "loss": 1.0767,
999
+ "step": 1260
1000
+ },
1001
+ {
1002
+ "epoch": 3.08,
1003
+ "grad_norm": 3.774822235107422,
1004
+ "learning_rate": 4.63680387409201e-05,
1005
+ "loss": 1.311,
1006
+ "step": 1270
1007
+ },
1008
+ {
1009
+ "epoch": 3.1,
1010
+ "grad_norm": 4.785544395446777,
1011
+ "learning_rate": 4.515738498789346e-05,
1012
+ "loss": 1.2997,
1013
+ "step": 1280
1014
+ },
1015
+ {
1016
+ "epoch": 3.12,
1017
+ "grad_norm": 3.4525294303894043,
1018
+ "learning_rate": 4.394673123486683e-05,
1019
+ "loss": 1.2039,
1020
+ "step": 1290
1021
+ },
1022
+ {
1023
+ "epoch": 3.15,
1024
+ "grad_norm": 3.312502861022949,
1025
+ "learning_rate": 4.27360774818402e-05,
1026
+ "loss": 1.2137,
1027
+ "step": 1300
1028
+ },
1029
+ {
1030
+ "epoch": 3.15,
1031
+ "eval_accuracy": 0.7615780445969125,
1032
+ "eval_loss": 1.0005759000778198,
1033
+ "eval_runtime": 6.3563,
1034
+ "eval_samples_per_second": 183.44,
1035
+ "eval_steps_per_second": 22.969,
1036
+ "step": 1300
1037
+ },
1038
+ {
1039
+ "epoch": 3.17,
1040
+ "grad_norm": 3.375433921813965,
1041
+ "learning_rate": 4.152542372881356e-05,
1042
+ "loss": 1.2714,
1043
+ "step": 1310
1044
+ },
1045
+ {
1046
+ "epoch": 3.2,
1047
+ "grad_norm": 2.5909006595611572,
1048
+ "learning_rate": 4.0314769975786926e-05,
1049
+ "loss": 1.3154,
1050
+ "step": 1320
1051
+ },
1052
+ {
1053
+ "epoch": 3.22,
1054
+ "grad_norm": 3.0990185737609863,
1055
+ "learning_rate": 3.910411622276029e-05,
1056
+ "loss": 1.144,
1057
+ "step": 1330
1058
+ },
1059
+ {
1060
+ "epoch": 3.24,
1061
+ "grad_norm": 1.911260962486267,
1062
+ "learning_rate": 3.789346246973366e-05,
1063
+ "loss": 1.0008,
1064
+ "step": 1340
1065
+ },
1066
+ {
1067
+ "epoch": 3.27,
1068
+ "grad_norm": 2.93192458152771,
1069
+ "learning_rate": 3.6682808716707024e-05,
1070
+ "loss": 1.0603,
1071
+ "step": 1350
1072
+ },
1073
+ {
1074
+ "epoch": 3.29,
1075
+ "grad_norm": 3.3576924800872803,
1076
+ "learning_rate": 3.5472154963680385e-05,
1077
+ "loss": 1.2791,
1078
+ "step": 1360
1079
+ },
1080
+ {
1081
+ "epoch": 3.32,
1082
+ "grad_norm": 2.8567686080932617,
1083
+ "learning_rate": 3.426150121065376e-05,
1084
+ "loss": 1.1223,
1085
+ "step": 1370
1086
+ },
1087
+ {
1088
+ "epoch": 3.34,
1089
+ "grad_norm": 2.735358953475952,
1090
+ "learning_rate": 3.305084745762712e-05,
1091
+ "loss": 1.2043,
1092
+ "step": 1380
1093
+ },
1094
+ {
1095
+ "epoch": 3.37,
1096
+ "grad_norm": 3.374582529067993,
1097
+ "learning_rate": 3.184019370460048e-05,
1098
+ "loss": 1.0495,
1099
+ "step": 1390
1100
+ },
1101
+ {
1102
+ "epoch": 3.39,
1103
+ "grad_norm": 4.9084792137146,
1104
+ "learning_rate": 3.062953995157385e-05,
1105
+ "loss": 1.423,
1106
+ "step": 1400
1107
+ },
1108
+ {
1109
+ "epoch": 3.39,
1110
+ "eval_accuracy": 0.774442538593482,
1111
+ "eval_loss": 0.9588848352432251,
1112
+ "eval_runtime": 6.2494,
1113
+ "eval_samples_per_second": 186.579,
1114
+ "eval_steps_per_second": 23.362,
1115
+ "step": 1400
1116
+ },
1117
+ {
1118
+ "epoch": 3.41,
1119
+ "grad_norm": 4.47416353225708,
1120
+ "learning_rate": 2.941888619854722e-05,
1121
+ "loss": 1.2965,
1122
+ "step": 1410
1123
+ },
1124
+ {
1125
+ "epoch": 3.44,
1126
+ "grad_norm": 2.692729949951172,
1127
+ "learning_rate": 2.8208232445520583e-05,
1128
+ "loss": 1.1812,
1129
+ "step": 1420
1130
+ },
1131
+ {
1132
+ "epoch": 3.46,
1133
+ "grad_norm": 3.5278244018554688,
1134
+ "learning_rate": 2.6997578692493948e-05,
1135
+ "loss": 1.2515,
1136
+ "step": 1430
1137
+ },
1138
+ {
1139
+ "epoch": 3.49,
1140
+ "grad_norm": 2.9056203365325928,
1141
+ "learning_rate": 2.5786924939467316e-05,
1142
+ "loss": 1.0617,
1143
+ "step": 1440
1144
+ },
1145
+ {
1146
+ "epoch": 3.51,
1147
+ "grad_norm": 2.6366896629333496,
1148
+ "learning_rate": 2.457627118644068e-05,
1149
+ "loss": 1.0449,
1150
+ "step": 1450
1151
+ },
1152
+ {
1153
+ "epoch": 3.54,
1154
+ "grad_norm": 3.593003034591675,
1155
+ "learning_rate": 2.3365617433414045e-05,
1156
+ "loss": 1.1273,
1157
+ "step": 1460
1158
+ },
1159
+ {
1160
+ "epoch": 3.56,
1161
+ "grad_norm": 3.5506863594055176,
1162
+ "learning_rate": 2.215496368038741e-05,
1163
+ "loss": 1.2625,
1164
+ "step": 1470
1165
+ },
1166
+ {
1167
+ "epoch": 3.58,
1168
+ "grad_norm": 4.686192989349365,
1169
+ "learning_rate": 2.0944309927360775e-05,
1170
+ "loss": 1.1439,
1171
+ "step": 1480
1172
+ },
1173
+ {
1174
+ "epoch": 3.61,
1175
+ "grad_norm": 3.072838068008423,
1176
+ "learning_rate": 1.9733656174334143e-05,
1177
+ "loss": 1.2008,
1178
+ "step": 1490
1179
+ },
1180
+ {
1181
+ "epoch": 3.63,
1182
+ "grad_norm": 4.130647659301758,
1183
+ "learning_rate": 1.8523002421307507e-05,
1184
+ "loss": 1.0098,
1185
+ "step": 1500
1186
+ },
1187
+ {
1188
+ "epoch": 3.63,
1189
+ "eval_accuracy": 0.7684391080617495,
1190
+ "eval_loss": 0.9360153675079346,
1191
+ "eval_runtime": 5.9954,
1192
+ "eval_samples_per_second": 194.481,
1193
+ "eval_steps_per_second": 24.352,
1194
+ "step": 1500
1195
+ },
1196
+ {
1197
+ "epoch": 3.66,
1198
+ "grad_norm": 2.432633638381958,
1199
+ "learning_rate": 1.7312348668280872e-05,
1200
+ "loss": 1.0802,
1201
+ "step": 1510
1202
+ },
1203
+ {
1204
+ "epoch": 3.68,
1205
+ "grad_norm": 3.6661131381988525,
1206
+ "learning_rate": 1.6101694915254237e-05,
1207
+ "loss": 1.0655,
1208
+ "step": 1520
1209
+ },
1210
+ {
1211
+ "epoch": 3.7,
1212
+ "grad_norm": 3.967733860015869,
1213
+ "learning_rate": 1.4891041162227603e-05,
1214
+ "loss": 1.1482,
1215
+ "step": 1530
1216
+ },
1217
+ {
1218
+ "epoch": 3.73,
1219
+ "grad_norm": 3.776456832885742,
1220
+ "learning_rate": 1.3680387409200971e-05,
1221
+ "loss": 1.2236,
1222
+ "step": 1540
1223
+ },
1224
+ {
1225
+ "epoch": 3.75,
1226
+ "grad_norm": 3.1570096015930176,
1227
+ "learning_rate": 1.2469733656174334e-05,
1228
+ "loss": 1.0433,
1229
+ "step": 1550
1230
+ },
1231
+ {
1232
+ "epoch": 3.78,
1233
+ "grad_norm": 3.3112399578094482,
1234
+ "learning_rate": 1.12590799031477e-05,
1235
+ "loss": 1.1766,
1236
+ "step": 1560
1237
+ },
1238
+ {
1239
+ "epoch": 3.8,
1240
+ "grad_norm": 3.405649185180664,
1241
+ "learning_rate": 1.0048426150121065e-05,
1242
+ "loss": 1.1755,
1243
+ "step": 1570
1244
+ },
1245
+ {
1246
+ "epoch": 3.83,
1247
+ "grad_norm": 2.6833651065826416,
1248
+ "learning_rate": 8.837772397094432e-06,
1249
+ "loss": 1.0593,
1250
+ "step": 1580
1251
+ },
1252
+ {
1253
+ "epoch": 3.85,
1254
+ "grad_norm": 3.3236443996429443,
1255
+ "learning_rate": 7.627118644067798e-06,
1256
+ "loss": 1.1001,
1257
+ "step": 1590
1258
+ },
1259
+ {
1260
+ "epoch": 3.87,
1261
+ "grad_norm": 3.5733933448791504,
1262
+ "learning_rate": 6.4164648910411625e-06,
1263
+ "loss": 1.1325,
1264
+ "step": 1600
1265
+ },
1266
+ {
1267
+ "epoch": 3.87,
1268
+ "eval_accuracy": 0.7710120068610634,
1269
+ "eval_loss": 0.912144124507904,
1270
+ "eval_runtime": 6.3188,
1271
+ "eval_samples_per_second": 184.528,
1272
+ "eval_steps_per_second": 23.106,
1273
+ "step": 1600
1274
+ },
1275
+ {
1276
+ "epoch": 3.9,
1277
+ "grad_norm": 3.3235766887664795,
1278
+ "learning_rate": 5.205811138014528e-06,
1279
+ "loss": 1.1434,
1280
+ "step": 1610
1281
+ },
1282
+ {
1283
+ "epoch": 3.92,
1284
+ "grad_norm": 5.47670841217041,
1285
+ "learning_rate": 3.9951573849878936e-06,
1286
+ "loss": 1.0415,
1287
+ "step": 1620
1288
+ },
1289
+ {
1290
+ "epoch": 3.95,
1291
+ "grad_norm": 2.83181095123291,
1292
+ "learning_rate": 2.784503631961259e-06,
1293
+ "loss": 1.0888,
1294
+ "step": 1630
1295
+ },
1296
+ {
1297
+ "epoch": 3.97,
1298
+ "grad_norm": 4.57571268081665,
1299
+ "learning_rate": 1.5738498789346248e-06,
1300
+ "loss": 1.0908,
1301
+ "step": 1640
1302
+ },
1303
+ {
1304
+ "epoch": 4.0,
1305
+ "grad_norm": 3.1416895389556885,
1306
+ "learning_rate": 3.6319612590799036e-07,
1307
+ "loss": 0.9855,
1308
+ "step": 1650
1309
+ },
1310
+ {
1311
+ "epoch": 4.0,
1312
+ "step": 1652,
1313
+ "total_flos": 2.047635634195759e+18,
1314
+ "train_loss": 1.8191430680543978,
1315
+ "train_runtime": 594.0822,
1316
+ "train_samples_per_second": 44.458,
1317
+ "train_steps_per_second": 2.781
1318
+ }
1319
+ ],
1320
+ "logging_steps": 10,
1321
+ "max_steps": 1652,
1322
+ "num_input_tokens_seen": 0,
1323
+ "num_train_epochs": 4,
1324
+ "save_steps": 100,
1325
+ "total_flos": 2.047635634195759e+18,
1326
+ "train_batch_size": 16,
1327
+ "trial_name": null,
1328
+ "trial_params": null
1329
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cbb4d0502cb6aa8e763c1e6b3bea2a272af2e0cd58d51af575190057bab553e7
+ oid sha256:52755d98ad2bd9ec55bf7137c74905f11d5f04d9a40c982b4c6e4d07c6bb986d
  size 4920
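Finally, a minimal sketch of loading the checkpoint saved by this commit for inference. The repo id below is an assumption pieced together from the committer name and the `finetuned-cards-blackjack` output directory seen in `trainer_state.json`; substitute the actual model id.

```python
from transformers import pipeline

# Image-classification inference with the fine-tuned ViT saved in this commit.
# NOTE: the model id is an assumption (committer name + output_dir), not confirmed by the diff.
classifier = pipeline(
    task="image-classification",
    model="Mariofm02/finetuned-cards-blackjack",
)

predictions = classifier("playing_card.jpg")   # any local image path or PIL.Image works
for p in predictions[:3]:
    print(p["label"], round(p["score"], 3))
```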