Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

model.safetensors +1 -1
optimizer.pt +1 -1
rng_state_0.pth +1 -1
rng_state_1.pth +1 -1
rng_state_2.pth +1 -1
rng_state_3.pth +1 -1
rng_state_4.pth +1 -1
rng_state_5.pth +1 -1
rng_state_6.pth +1 -1
rng_state_7.pth +1 -1
rng_state_8.pth +1 -1
rng_state_9.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +1447 -3

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4d2a42cc56e07c013f73e56708f9890aeba1ef6f2507446949476e63f34240fa
 size 185097216

 version https://git-lfs.github.com/spec/v1
+oid sha256:0b11b2ff539c63d3c6bac569f7766703b625a1ff96963c99b63259f1624298ab
 size 185097216

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9dd0b2a653496b95d07793491c50539b083d1eff242426d4d9cf66c7f12edaaa
 size 370329978

 version https://git-lfs.github.com/spec/v1
+oid sha256:aa9d2a0ec897c5663a4be748f0fc34398833a23f3d592a7fbdfecc9b082648a1
 size 370329978

rng_state_0.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:72aed7659a4568e39ea9a56bdc92196603df7d730a90c6411d24926b5d12ad03
 size 16433

 version https://git-lfs.github.com/spec/v1
+oid sha256:c4c1665affd4ed287cc307c033aa9b83f5129c6f23f546aeee5169ca1b8994af
 size 16433

rng_state_1.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8ccb7aad33c882b66b79a28ece740dc71664d087d1f12ad61b65b18df1beca55
 size 16433

 version https://git-lfs.github.com/spec/v1
+oid sha256:2abd31d46a5b8aba640a7054a1f1886e89e9f6b46ac0ecd782b859dc4214d256
 size 16433

rng_state_2.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bf3954ab65b1da5768888e4c50598bd4c761244f7027ce8da56d21582e829948
 size 16433

 version https://git-lfs.github.com/spec/v1
+oid sha256:b6356c5d3297eec0185fbb12716367c7cafc523337fc20be9926fc46325c429a
 size 16433

rng_state_3.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9373b1a76204778613e7dd6b7f31b34c6bb969dbb0b802f9f62ca911909492f2
 size 16433

 version https://git-lfs.github.com/spec/v1
+oid sha256:866ba9b91c55d95ef1da6feb07d2ffecceb23df8b5608e080221fc6dbac9ed2f
 size 16433

rng_state_4.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d1fc04b2ab887669d463a51ba4296461e18e6793bbf2221bc603bff1af03a7a6
 size 16433

 version https://git-lfs.github.com/spec/v1
+oid sha256:394d36bdefd8333a39a124a3477c46f23015ea9e02d59f4ba6dcce13ab46168f
 size 16433

rng_state_5.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:81879e4ef695491cab2ffd4ab75ee6208dd18e8e7339d7b4aff8a08580e999d0
 size 16433

 version https://git-lfs.github.com/spec/v1
+oid sha256:577c7045aadc9972169583d0b06cfc68d51e9cd2d016cd56617f0749af75cb31
 size 16433

rng_state_6.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0af269f614eb79e3ee82816f8d169e28a4fa1006f684260473e683e7ccea9d58
 size 16433

 version https://git-lfs.github.com/spec/v1
+oid sha256:5fd7094860928ca612494a904481849cce54a143aaf53d511c3fdafa80f4fefa
 size 16433

rng_state_7.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6e9f29cd3c33bc4028b8be6ed5e960831203bf34fb31356c09cdb29b4c851d3f
 size 16433

 version https://git-lfs.github.com/spec/v1
+oid sha256:b8e1cc2af93b92de277256d30ceb20997578b8263be191983b475c3ad46a0790
 size 16433

rng_state_8.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:62adbe536ff6dcfa7fb600a17a73fb0c108c33b017287d093401133eb29bb6d4
 size 16433

 version https://git-lfs.github.com/spec/v1
+oid sha256:fe4f2fce602372e0140548196138ac3dca07af36635f6b15963598f4bc25bf6b
 size 16433

rng_state_9.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5b7dd71e7a0ba4defb86d5997709ee80f19a9326e37319a19a3ae8c437ad97fa
 size 16433

 version https://git-lfs.github.com/spec/v1
+oid sha256:f4790382837b89dc610979dcc6f00fc79c7847cb5b0f57da5641fc8d16ea0a4a
 size 16433

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4729d5e48ae081b5107dc5941bf9dd080c7d24c9e46db7051ab1bfcf68e98eee
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:75fc12f97c880c2c5755340b9a95c8af52d0a5dc3399cb79b9acc0bfca303aae
 size 1064

trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.49979917224036463,
   "eval_steps": 1431,
-  "global_step": 7155,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -3624,6 +3624,1450 @@
       "eval_samples_per_second": 109.582,
       "eval_steps_per_second": 1.425,
       "step": 7155
     }
   ],
   "logging_steps": 14,
@@ -3643,7 +5087,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.906493869248217e+18,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.6997188411365105,
   "eval_steps": 1431,
+  "global_step": 10017,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 109.582,
       "eval_steps_per_second": 1.425,
       "step": 7155
+    },
+    {
+      "epoch": 0.5007072629795855,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.001,
+      "loss": 1.2987,
+      "step": 7168
+    },
+    {
+      "epoch": 0.5016852068525924,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.001,
+      "loss": 1.2759,
+      "step": 7182
+    },
+    {
+      "epoch": 0.5026631507255994,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.001,
+      "loss": 1.2814,
+      "step": 7196
+    },
+    {
+      "epoch": 0.5036410945986064,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.001,
+      "loss": 1.2701,
+      "step": 7210
+    },
+    {
+      "epoch": 0.5046190384716134,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.001,
+      "loss": 1.2857,
+      "step": 7224
+    },
+    {
+      "epoch": 0.5055969823446205,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.001,
+      "loss": 1.2707,
+      "step": 7238
+    },
+    {
+      "epoch": 0.5065749262176275,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.001,
+      "loss": 1.2851,
+      "step": 7252
+    },
+    {
+      "epoch": 0.5075528700906344,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.001,
+      "loss": 1.2722,
+      "step": 7266
+    },
+    {
+      "epoch": 0.5085308139636414,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.001,
+      "loss": 1.277,
+      "step": 7280
+    },
+    {
+      "epoch": 0.5095087578366484,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.001,
+      "loss": 1.3021,
+      "step": 7294
+    },
+    {
+      "epoch": 0.5104867017096555,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.001,
+      "loss": 1.2856,
+      "step": 7308
+    },
+    {
+      "epoch": 0.5114646455826625,
+      "grad_norm": 0.25,
+      "learning_rate": 0.001,
+      "loss": 1.2704,
+      "step": 7322
+    },
+    {
+      "epoch": 0.5124425894556695,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.001,
+      "loss": 1.2837,
+      "step": 7336
+    },
+    {
+      "epoch": 0.5134205333286764,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.001,
+      "loss": 1.277,
+      "step": 7350
+    },
+    {
+      "epoch": 0.5143984772016834,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.001,
+      "loss": 1.2838,
+      "step": 7364
+    },
+    {
+      "epoch": 0.5153764210746905,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.001,
+      "loss": 1.2762,
+      "step": 7378
+    },
+    {
+      "epoch": 0.5163543649476975,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.001,
+      "loss": 1.2749,
+      "step": 7392
+    },
+    {
+      "epoch": 0.5173323088207045,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.001,
+      "loss": 1.2791,
+      "step": 7406
+    },
+    {
+      "epoch": 0.5183102526937114,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.001,
+      "loss": 1.2708,
+      "step": 7420
+    },
+    {
+      "epoch": 0.5192881965667184,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.001,
+      "loss": 1.2773,
+      "step": 7434
+    },
+    {
+      "epoch": 0.5202661404397255,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.001,
+      "loss": 1.2783,
+      "step": 7448
+    },
+    {
+      "epoch": 0.5212440843127325,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.001,
+      "loss": 1.2944,
+      "step": 7462
+    },
+    {
+      "epoch": 0.5222220281857395,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.001,
+      "loss": 1.2714,
+      "step": 7476
+    },
+    {
+      "epoch": 0.5231999720587465,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.001,
+      "loss": 1.2711,
+      "step": 7490
+    },
+    {
+      "epoch": 0.5241779159317534,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.001,
+      "loss": 1.2808,
+      "step": 7504
+    },
+    {
+      "epoch": 0.5251558598047605,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.001,
+      "loss": 1.2765,
+      "step": 7518
+    },
+    {
+      "epoch": 0.5261338036777675,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.001,
+      "loss": 1.2702,
+      "step": 7532
+    },
+    {
+      "epoch": 0.5271117475507745,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.001,
+      "loss": 1.2802,
+      "step": 7546
+    },
+    {
+      "epoch": 0.5280896914237815,
+      "grad_norm": 0.57421875,
+      "learning_rate": 0.001,
+      "loss": 1.2733,
+      "step": 7560
+    },
+    {
+      "epoch": 0.5290676352967885,
+      "grad_norm": 0.494140625,
+      "learning_rate": 0.001,
+      "loss": 1.2575,
+      "step": 7574
+    },
+    {
+      "epoch": 0.5300455791697956,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.001,
+      "loss": 1.2863,
+      "step": 7588
+    },
+    {
+      "epoch": 0.5310235230428025,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.001,
+      "loss": 1.2815,
+      "step": 7602
+    },
+    {
+      "epoch": 0.5320014669158095,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.001,
+      "loss": 1.2745,
+      "step": 7616
+    },
+    {
+      "epoch": 0.5329794107888165,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.001,
+      "loss": 1.294,
+      "step": 7630
+    },
+    {
+      "epoch": 0.5339573546618235,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.001,
+      "loss": 1.2797,
+      "step": 7644
+    },
+    {
+      "epoch": 0.5349352985348306,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.001,
+      "loss": 1.2665,
+      "step": 7658
+    },
+    {
+      "epoch": 0.5359132424078376,
+      "grad_norm": 0.458984375,
+      "learning_rate": 0.001,
+      "loss": 1.28,
+      "step": 7672
+    },
+    {
+      "epoch": 0.5368911862808445,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.001,
+      "loss": 1.3057,
+      "step": 7686
+    },
+    {
+      "epoch": 0.5378691301538515,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.001,
+      "loss": 1.258,
+      "step": 7700
+    },
+    {
+      "epoch": 0.5388470740268585,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.001,
+      "loss": 1.2742,
+      "step": 7714
+    },
+    {
+      "epoch": 0.5398250178998655,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.001,
+      "loss": 1.277,
+      "step": 7728
+    },
+    {
+      "epoch": 0.5408029617728726,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.001,
+      "loss": 1.2819,
+      "step": 7742
+    },
+    {
+      "epoch": 0.5417809056458796,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.001,
+      "loss": 1.3018,
+      "step": 7756
+    },
+    {
+      "epoch": 0.5427588495188865,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.001,
+      "loss": 1.2619,
+      "step": 7770
+    },
+    {
+      "epoch": 0.5437367933918935,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.001,
+      "loss": 1.2513,
+      "step": 7784
+    },
+    {
+      "epoch": 0.5447147372649005,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.001,
+      "loss": 1.2456,
+      "step": 7798
+    },
+    {
+      "epoch": 0.5456926811379076,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.001,
+      "loss": 1.2768,
+      "step": 7812
+    },
+    {
+      "epoch": 0.5466706250109146,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.001,
+      "loss": 1.265,
+      "step": 7826
+    },
+    {
+      "epoch": 0.5476485688839215,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.001,
+      "loss": 1.2683,
+      "step": 7840
+    },
+    {
+      "epoch": 0.5486265127569285,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.001,
+      "loss": 1.2809,
+      "step": 7854
+    },
+    {
+      "epoch": 0.5496044566299355,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.001,
+      "loss": 1.2498,
+      "step": 7868
+    },
+    {
+      "epoch": 0.5505824005029426,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.001,
+      "loss": 1.2632,
+      "step": 7882
+    },
+    {
+      "epoch": 0.5515603443759496,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.001,
+      "loss": 1.2711,
+      "step": 7896
+    },
+    {
+      "epoch": 0.5525382882489566,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.001,
+      "loss": 1.2813,
+      "step": 7910
+    },
+    {
+      "epoch": 0.5535162321219635,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.001,
+      "loss": 1.2993,
+      "step": 7924
+    },
+    {
+      "epoch": 0.5544941759949705,
+      "grad_norm": 0.373046875,
+      "learning_rate": 0.001,
+      "loss": 1.3001,
+      "step": 7938
+    },
+    {
+      "epoch": 0.5554721198679776,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.001,
+      "loss": 1.2786,
+      "step": 7952
+    },
+    {
+      "epoch": 0.5564500637409846,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.001,
+      "loss": 1.2976,
+      "step": 7966
+    },
+    {
+      "epoch": 0.5574280076139916,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.001,
+      "loss": 1.286,
+      "step": 7980
+    },
+    {
+      "epoch": 0.5584059514869986,
+      "grad_norm": 0.59765625,
+      "learning_rate": 0.001,
+      "loss": 1.282,
+      "step": 7994
+    },
+    {
+      "epoch": 0.5593838953600055,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.001,
+      "loss": 1.2853,
+      "step": 8008
+    },
+    {
+      "epoch": 0.5603618392330126,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.001,
+      "loss": 1.2572,
+      "step": 8022
+    },
+    {
+      "epoch": 0.5613397831060196,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.001,
+      "loss": 1.2572,
+      "step": 8036
+    },
+    {
+      "epoch": 0.5623177269790266,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.001,
+      "loss": 1.315,
+      "step": 8050
+    },
+    {
+      "epoch": 0.5632956708520336,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.001,
+      "loss": 1.3007,
+      "step": 8064
+    },
+    {
+      "epoch": 0.5642736147250406,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.001,
+      "loss": 1.2737,
+      "step": 8078
+    },
+    {
+      "epoch": 0.5652515585980477,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.001,
+      "loss": 1.2766,
+      "step": 8092
+    },
+    {
+      "epoch": 0.5662295024710546,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.001,
+      "loss": 1.3102,
+      "step": 8106
+    },
+    {
+      "epoch": 0.5672074463440616,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.001,
+      "loss": 1.3044,
+      "step": 8120
+    },
+    {
+      "epoch": 0.5681853902170686,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.001,
+      "loss": 1.2612,
+      "step": 8134
+    },
+    {
+      "epoch": 0.5691633340900756,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.001,
+      "loss": 1.2701,
+      "step": 8148
+    },
+    {
+      "epoch": 0.5701412779630827,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.001,
+      "loss": 1.2649,
+      "step": 8162
+    },
+    {
+      "epoch": 0.5711192218360897,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.001,
+      "loss": 1.2761,
+      "step": 8176
+    },
+    {
+      "epoch": 0.5720971657090966,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.001,
+      "loss": 1.2668,
+      "step": 8190
+    },
+    {
+      "epoch": 0.5730751095821036,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.001,
+      "loss": 1.2847,
+      "step": 8204
+    },
+    {
+      "epoch": 0.5740530534551106,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.001,
+      "loss": 1.2722,
+      "step": 8218
+    },
+    {
+      "epoch": 0.5750309973281177,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.001,
+      "loss": 1.253,
+      "step": 8232
+    },
+    {
+      "epoch": 0.5760089412011247,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.001,
+      "loss": 1.2454,
+      "step": 8246
+    },
+    {
+      "epoch": 0.5769868850741317,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.001,
+      "loss": 1.2558,
+      "step": 8260
+    },
+    {
+      "epoch": 0.5779648289471386,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.001,
+      "loss": 1.2765,
+      "step": 8274
+    },
+    {
+      "epoch": 0.5789427728201456,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.001,
+      "loss": 1.289,
+      "step": 8288
+    },
+    {
+      "epoch": 0.5799207166931526,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.001,
+      "loss": 1.2724,
+      "step": 8302
+    },
+    {
+      "epoch": 0.5808986605661597,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.001,
+      "loss": 1.2753,
+      "step": 8316
+    },
+    {
+      "epoch": 0.5818766044391667,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.001,
+      "loss": 1.2558,
+      "step": 8330
+    },
+    {
+      "epoch": 0.5828545483121736,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.001,
+      "loss": 1.2697,
+      "step": 8344
+    },
+    {
+      "epoch": 0.5838324921851806,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.001,
+      "loss": 1.2685,
+      "step": 8358
+    },
+    {
+      "epoch": 0.5848104360581876,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.001,
+      "loss": 1.2724,
+      "step": 8372
+    },
+    {
+      "epoch": 0.5857883799311947,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.001,
+      "loss": 1.2287,
+      "step": 8386
+    },
+    {
+      "epoch": 0.5867663238042017,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.001,
+      "loss": 1.2363,
+      "step": 8400
+    },
+    {
+      "epoch": 0.5877442676772087,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.001,
+      "loss": 1.2648,
+      "step": 8414
+    },
+    {
+      "epoch": 0.5887222115502156,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.001,
+      "loss": 1.2949,
+      "step": 8428
+    },
+    {
+      "epoch": 0.5897001554232226,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.001,
+      "loss": 1.2571,
+      "step": 8442
+    },
+    {
+      "epoch": 0.5906780992962297,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.001,
+      "loss": 1.2831,
+      "step": 8456
+    },
+    {
+      "epoch": 0.5916560431692367,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.001,
+      "loss": 1.2965,
+      "step": 8470
+    },
+    {
+      "epoch": 0.5926339870422437,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.001,
+      "loss": 1.2685,
+      "step": 8484
+    },
+    {
+      "epoch": 0.5936119309152507,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.001,
+      "loss": 1.2637,
+      "step": 8498
+    },
+    {
+      "epoch": 0.5945898747882576,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.001,
+      "loss": 1.28,
+      "step": 8512
+    },
+    {
+      "epoch": 0.5955678186612647,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.001,
+      "loss": 1.2636,
+      "step": 8526
+    },
+    {
+      "epoch": 0.5965457625342717,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.001,
+      "loss": 1.251,
+      "step": 8540
+    },
+    {
+      "epoch": 0.5975237064072787,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.001,
+      "loss": 1.262,
+      "step": 8554
+    },
+    {
+      "epoch": 0.5985016502802857,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.001,
+      "loss": 1.2696,
+      "step": 8568
+    },
+    {
+      "epoch": 0.5994795941532927,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.001,
+      "loss": 1.2872,
+      "step": 8582
+    },
+    {
+      "epoch": 0.5997590066884375,
+      "eval_loss": 1.661841869354248,
+      "eval_runtime": 9.1193,
+      "eval_samples_per_second": 109.657,
+      "eval_steps_per_second": 1.426,
+      "step": 8586
+    },
+    {
+      "epoch": 0.6004575380262998,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.001,
+      "loss": 1.2767,
+      "step": 8596
+    },
+    {
+      "epoch": 0.6014354818993067,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.001,
+      "loss": 1.2623,
+      "step": 8610
+    },
+    {
+      "epoch": 0.6024134257723137,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.001,
+      "loss": 1.2617,
+      "step": 8624
+    },
+    {
+      "epoch": 0.6033913696453207,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.001,
+      "loss": 1.2514,
+      "step": 8638
+    },
+    {
+      "epoch": 0.6043693135183277,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.001,
+      "loss": 1.2664,
+      "step": 8652
+    },
+    {
+      "epoch": 0.6053472573913348,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.001,
+      "loss": 1.2421,
+      "step": 8666
+    },
+    {
+      "epoch": 0.6063252012643418,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.001,
+      "loss": 1.2386,
+      "step": 8680
+    },
+    {
+      "epoch": 0.6073031451373487,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.001,
+      "loss": 1.2601,
+      "step": 8694
+    },
+    {
+      "epoch": 0.6082810890103557,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.001,
+      "loss": 1.2715,
+      "step": 8708
+    },
+    {
+      "epoch": 0.6092590328833627,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.001,
+      "loss": 1.2848,
+      "step": 8722
+    },
+    {
+      "epoch": 0.6102369767563698,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.001,
+      "loss": 1.2632,
+      "step": 8736
+    },
+    {
+      "epoch": 0.6112149206293768,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.001,
+      "loss": 1.2912,
+      "step": 8750
+    },
+    {
+      "epoch": 0.6121928645023837,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.001,
+      "loss": 1.2613,
+      "step": 8764
+    },
+    {
+      "epoch": 0.6131708083753907,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.001,
+      "loss": 1.2357,
+      "step": 8778
+    },
+    {
+      "epoch": 0.6141487522483977,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.001,
+      "loss": 1.2541,
+      "step": 8792
+    },
+    {
+      "epoch": 0.6151266961214047,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.001,
+      "loss": 1.2746,
+      "step": 8806
+    },
+    {
+      "epoch": 0.6161046399944118,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.001,
+      "loss": 1.2445,
+      "step": 8820
+    },
+    {
+      "epoch": 0.6170825838674188,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.001,
+      "loss": 1.2854,
+      "step": 8834
+    },
+    {
+      "epoch": 0.6180605277404257,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.001,
+      "loss": 1.2746,
+      "step": 8848
+    },
+    {
+      "epoch": 0.6190384716134327,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.001,
+      "loss": 1.2837,
+      "step": 8862
+    },
+    {
+      "epoch": 0.6200164154864397,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.001,
+      "loss": 1.2682,
+      "step": 8876
+    },
+    {
+      "epoch": 0.6209943593594468,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.001,
+      "loss": 1.2751,
+      "step": 8890
+    },
+    {
+      "epoch": 0.6219723032324538,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.001,
+      "loss": 1.2666,
+      "step": 8904
+    },
+    {
+      "epoch": 0.6229502471054608,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.001,
+      "loss": 1.2618,
+      "step": 8918
+    },
+    {
+      "epoch": 0.6239281909784677,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.001,
+      "loss": 1.2721,
+      "step": 8932
+    },
+    {
+      "epoch": 0.6249061348514747,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.001,
+      "loss": 1.2528,
+      "step": 8946
+    },
+    {
+      "epoch": 0.6258840787244818,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.001,
+      "loss": 1.2771,
+      "step": 8960
+    },
+    {
+      "epoch": 0.6268620225974888,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.001,
+      "loss": 1.2751,
+      "step": 8974
+    },
+    {
+      "epoch": 0.6278399664704958,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.001,
+      "loss": 1.2749,
+      "step": 8988
+    },
+    {
+      "epoch": 0.6288179103435028,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.001,
+      "loss": 1.2851,
+      "step": 9002
+    },
+    {
+      "epoch": 0.6297958542165097,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.001,
+      "loss": 1.2529,
+      "step": 9016
+    },
+    {
+      "epoch": 0.6307737980895168,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.001,
+      "loss": 1.2673,
+      "step": 9030
+    },
+    {
+      "epoch": 0.6317517419625238,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.001,
+      "loss": 1.2746,
+      "step": 9044
+    },
+    {
+      "epoch": 0.6327296858355308,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.001,
+      "loss": 1.2649,
+      "step": 9058
+    },
+    {
+      "epoch": 0.6337076297085378,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.001,
+      "loss": 1.2849,
+      "step": 9072
+    },
+    {
+      "epoch": 0.6346855735815448,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.001,
+      "loss": 1.243,
+      "step": 9086
+    },
+    {
+      "epoch": 0.6356635174545519,
+      "grad_norm": 0.375,
+      "learning_rate": 0.001,
+      "loss": 1.2641,
+      "step": 9100
+    },
+    {
+      "epoch": 0.6366414613275588,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.001,
+      "loss": 1.2554,
+      "step": 9114
+    },
+    {
+      "epoch": 0.6376194052005658,
+      "grad_norm": 0.396484375,
+      "learning_rate": 0.001,
+      "loss": 1.2396,
+      "step": 9128
+    },
+    {
+      "epoch": 0.6385973490735728,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.001,
+      "loss": 1.2508,
+      "step": 9142
+    },
+    {
+      "epoch": 0.6395752929465798,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.001,
+      "loss": 1.2772,
+      "step": 9156
+    },
+    {
+      "epoch": 0.6405532368195869,
+      "grad_norm": 0.53515625,
+      "learning_rate": 0.001,
+      "loss": 1.2453,
+      "step": 9170
+    },
+    {
+      "epoch": 0.6415311806925938,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.001,
+      "loss": 1.2764,
+      "step": 9184
+    },
+    {
+      "epoch": 0.6425091245656008,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.001,
+      "loss": 1.251,
+      "step": 9198
+    },
+    {
+      "epoch": 0.6434870684386078,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.001,
+      "loss": 1.2855,
+      "step": 9212
+    },
+    {
+      "epoch": 0.6444650123116148,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.001,
+      "loss": 1.3198,
+      "step": 9226
+    },
+    {
+      "epoch": 0.6454429561846219,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.001,
+      "loss": 1.2773,
+      "step": 9240
+    },
+    {
+      "epoch": 0.6464209000576289,
+      "grad_norm": 0.5703125,
+      "learning_rate": 0.001,
+      "loss": 1.2786,
+      "step": 9254
+    },
+    {
+      "epoch": 0.6473988439306358,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.001,
+      "loss": 1.2389,
+      "step": 9268
+    },
+    {
+      "epoch": 0.6483767878036428,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.001,
+      "loss": 1.2587,
+      "step": 9282
+    },
+    {
+      "epoch": 0.6493547316766498,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.001,
+      "loss": 1.2806,
+      "step": 9296
+    },
+    {
+      "epoch": 0.6503326755496568,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.001,
+      "loss": 1.2307,
+      "step": 9310
+    },
+    {
+      "epoch": 0.6513106194226639,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.001,
+      "loss": 1.2657,
+      "step": 9324
+    },
+    {
+      "epoch": 0.6522885632956709,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.001,
+      "loss": 1.2605,
+      "step": 9338
+    },
+    {
+      "epoch": 0.6532665071686778,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.001,
+      "loss": 1.2538,
+      "step": 9352
+    },
+    {
+      "epoch": 0.6542444510416848,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.001,
+      "loss": 1.2633,
+      "step": 9366
+    },
+    {
+      "epoch": 0.6552223949146918,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.001,
+      "loss": 1.2582,
+      "step": 9380
+    },
+    {
+      "epoch": 0.6562003387876989,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.001,
+      "loss": 1.2515,
+      "step": 9394
+    },
+    {
+      "epoch": 0.6571782826607059,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.001,
+      "loss": 1.2679,
+      "step": 9408
+    },
+    {
+      "epoch": 0.6581562265337129,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.001,
+      "loss": 1.2539,
+      "step": 9422
+    },
+    {
+      "epoch": 0.6591341704067198,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.001,
+      "loss": 1.2632,
+      "step": 9436
+    },
+    {
+      "epoch": 0.6601121142797268,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.001,
+      "loss": 1.2946,
+      "step": 9450
+    },
+    {
+      "epoch": 0.6610900581527339,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.001,
+      "loss": 1.2691,
+      "step": 9464
+    },
+    {
+      "epoch": 0.6620680020257409,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.001,
+      "loss": 1.246,
+      "step": 9478
+    },
+    {
+      "epoch": 0.6630459458987479,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.001,
+      "loss": 1.2606,
+      "step": 9492
+    },
+    {
+      "epoch": 0.6640238897717549,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.001,
+      "loss": 1.2782,
+      "step": 9506
+    },
+    {
+      "epoch": 0.6650018336447618,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.001,
+      "loss": 1.2687,
+      "step": 9520
+    },
+    {
+      "epoch": 0.6659797775177689,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.001,
+      "loss": 1.2778,
+      "step": 9534
+    },
+    {
+      "epoch": 0.6669577213907759,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.001,
+      "loss": 1.2544,
+      "step": 9548
+    },
+    {
+      "epoch": 0.6679356652637829,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.001,
+      "loss": 1.246,
+      "step": 9562
+    },
+    {
+      "epoch": 0.6689136091367899,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.001,
+      "loss": 1.2577,
+      "step": 9576
+    },
+    {
+      "epoch": 0.6698915530097969,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.001,
+      "loss": 1.2639,
+      "step": 9590
+    },
+    {
+      "epoch": 0.670869496882804,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.001,
+      "loss": 1.2493,
+      "step": 9604
+    },
+    {
+      "epoch": 0.6718474407558109,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.001,
+      "loss": 1.2586,
+      "step": 9618
+    },
+    {
+      "epoch": 0.6728253846288179,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.001,
+      "loss": 1.2719,
+      "step": 9632
+    },
+    {
+      "epoch": 0.6738033285018249,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.001,
+      "loss": 1.2272,
+      "step": 9646
+    },
+    {
+      "epoch": 0.6747812723748319,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.001,
+      "loss": 1.2667,
+      "step": 9660
+    },
+    {
+      "epoch": 0.675759216247839,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.001,
+      "loss": 1.2567,
+      "step": 9674
+    },
+    {
+      "epoch": 0.676737160120846,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.001,
+      "loss": 1.2403,
+      "step": 9688
+    },
+    {
+      "epoch": 0.6777151039938529,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.001,
+      "loss": 1.2642,
+      "step": 9702
+    },
+    {
+      "epoch": 0.6786930478668599,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.001,
+      "loss": 1.2123,
+      "step": 9716
+    },
+    {
+      "epoch": 0.6796709917398669,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.001,
+      "loss": 1.2413,
+      "step": 9730
+    },
+    {
+      "epoch": 0.680648935612874,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.001,
+      "loss": 1.2442,
+      "step": 9744
+    },
+    {
+      "epoch": 0.681626879485881,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.001,
+      "loss": 1.2298,
+      "step": 9758
+    },
+    {
+      "epoch": 0.682604823358888,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.001,
+      "loss": 1.2421,
+      "step": 9772
+    },
+    {
+      "epoch": 0.6835827672318949,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.001,
+      "loss": 1.2725,
+      "step": 9786
+    },
+    {
+      "epoch": 0.6845607111049019,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.001,
+      "loss": 1.2417,
+      "step": 9800
+    },
+    {
+      "epoch": 0.685538654977909,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.001,
+      "loss": 1.2526,
+      "step": 9814
+    },
+    {
+      "epoch": 0.686516598850916,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.001,
+      "loss": 1.2352,
+      "step": 9828
+    },
+    {
+      "epoch": 0.687494542723923,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.001,
+      "loss": 1.2653,
+      "step": 9842
+    },
+    {
+      "epoch": 0.68847248659693,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.001,
+      "loss": 1.2569,
+      "step": 9856
+    },
+    {
+      "epoch": 0.6894504304699369,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.001,
+      "loss": 1.248,
+      "step": 9870
+    },
+    {
+      "epoch": 0.6904283743429439,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.001,
+      "loss": 1.2864,
+      "step": 9884
+    },
+    {
+      "epoch": 0.691406318215951,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.001,
+      "loss": 1.2663,
+      "step": 9898
+    },
+    {
+      "epoch": 0.692384262088958,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.001,
+      "loss": 1.2534,
+      "step": 9912
+    },
+    {
+      "epoch": 0.693362205961965,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.001,
+      "loss": 1.2439,
+      "step": 9926
+    },
+    {
+      "epoch": 0.6943401498349719,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.001,
+      "loss": 1.2592,
+      "step": 9940
+    },
+    {
+      "epoch": 0.6953180937079789,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.001,
+      "loss": 1.2655,
+      "step": 9954
+    },
+    {
+      "epoch": 0.696296037580986,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.001,
+      "loss": 1.2569,
+      "step": 9968
+    },
+    {
+      "epoch": 0.697273981453993,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.001,
+      "loss": 1.2663,
+      "step": 9982
+    },
+    {
+      "epoch": 0.698251925327,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.001,
+      "loss": 1.2419,
+      "step": 9996
+    },
+    {
+      "epoch": 0.699229869200007,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.001,
+      "loss": 1.2406,
+      "step": 10010
+    },
+    {
+      "epoch": 0.6997188411365105,
+      "eval_loss": 1.6404287815093994,
+      "eval_runtime": 9.1224,
+      "eval_samples_per_second": 109.621,
+      "eval_steps_per_second": 1.425,
+      "step": 10017
     }
   ],
   "logging_steps": 14,
       "attributes": {}
     }
   },
+  "total_flos": 2.66909141699448e+18,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null