Upload 8 files

Browse files

Files changed (7) hide show

README.md +10 -10
all_results.json +12 -12
eval_results.json +6 -6
model.safetensors +1 -1
train_results.json +6 -6
trainer_state.json +2068 -360
training_args.bin +1 -1

README.md CHANGED Viewed

@@ -4,19 +4,19 @@ base_model: agentlans/multilingual-e5-small-aligned
 tags:
 - generated_from_trainer
 model-index:
-- name: multilingual-e5-small-aligned-sentiment
   results: []
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
-# multilingual-e5-small-aligned-sentiment
 This model is a fine-tuned version of [agentlans/multilingual-e5-small-aligned](https://huggingface.co/agentlans/multilingual-e5-small-aligned) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.2188
-- Mse: 0.2188
 ## Model description
@@ -36,7 +36,7 @@ More information needed
 The following hyperparameters were used during training:
 - learning_rate: 5e-05
-- train_batch_size: 64
 - eval_batch_size: 8
 - seed: 42
 - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
@@ -45,11 +45,11 @@ The following hyperparameters were used during training:
 ### Training results
-| Training Loss | Epoch | Step  | Validation Loss | Mse    |
-|:-------------:|:-----:|:-----:|:---------------:|:------:|
-| 0.2635        | 1.0   | 13548 | 0.2526          | 0.2526 |
-| 0.1944        | 2.0   | 27096 | 0.2277          | 0.2277 |
-| 0.1489        | 3.0   | 40644 | 0.2188          | 0.2188 |
 ### Framework versions

 tags:
 - generated_from_trainer
 model-index:
+- name: multilingual-e5-small-aligned-transformed-sentiment
   results: []
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
+# multilingual-e5-small-aligned-transformed-sentiment
 This model is a fine-tuned version of [agentlans/multilingual-e5-small-aligned](https://huggingface.co/agentlans/multilingual-e5-small-aligned) on an unknown dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.2082
+- Mse: 0.2082
 ## Model description
 The following hyperparameters were used during training:
 - learning_rate: 5e-05
+- train_batch_size: 32
 - eval_batch_size: 8
 - seed: 42
 - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 ### Training results
+| Training Loss | Epoch | Step   | Validation Loss | Mse    |
+|:-------------:|:-----:|:------:|:---------------:|:------:|
+| 0.1898        | 1.0   | 54191  | 0.2322          | 0.2322 |
+| 0.1186        | 2.0   | 108382 | 0.2139          | 0.2139 |
+| 0.0861        | 3.0   | 162573 | 0.2082          | 0.2082 |
 ### Framework versions

all_results.json CHANGED Viewed

@@ -1,15 +1,15 @@
 {
     "epoch": 3.0,
-    "eval_loss": 0.21882201731204987,
-    "eval_mse": 0.21882200171115507,
-    "eval_runtime": 50.6658,
-    "eval_samples": 96338,
-    "eval_samples_per_second": 1901.439,
-    "eval_steps_per_second": 237.695,
-    "total_flos": 4.283504864539085e+16,
-    "train_loss": 0.21721052708159796,
-    "train_runtime": 3074.1411,
-    "train_samples": 867042,
-    "train_samples_per_second": 846.131,
-    "train_steps_per_second": 13.221
 }

 {
     "epoch": 3.0,
+    "eval_loss": 0.20824576914310455,
+    "eval_mse": 0.20824573578672098,
+    "eval_runtime": 118.1347,
+    "eval_samples": 192676,
+    "eval_samples_per_second": 1630.985,
+    "eval_steps_per_second": 203.877,
+    "total_flos": 8.56700972907817e+16,
+    "train_loss": 0.16141534491764947,
+    "train_runtime": 8977.4486,
+    "train_samples": 1734084,
+    "train_samples_per_second": 579.48,
+    "train_steps_per_second": 18.109
 }

eval_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
     "epoch": 3.0,
-    "eval_loss": 0.21882201731204987,
-    "eval_mse": 0.21882200171115507,
-    "eval_runtime": 50.6658,
-    "eval_samples": 96338,
-    "eval_samples_per_second": 1901.439,
-    "eval_steps_per_second": 237.695
 }

 {
     "epoch": 3.0,
+    "eval_loss": 0.20824576914310455,
+    "eval_mse": 0.20824573578672098,
+    "eval_runtime": 118.1347,
+    "eval_samples": 192676,
+    "eval_samples_per_second": 1630.985,
+    "eval_steps_per_second": 203.877
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5333c9df0d67cb5904f8530167efd77528f3f24bd07b181bbd3c10ec9946baeb
 size 470640124

 version https://git-lfs.github.com/spec/v1
+oid sha256:1625359b708464b43c87eea957f8f6c642c0ed136ac047d2d480e5e37858bab4
 size 470640124

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
     "epoch": 3.0,
-    "total_flos": 4.283504864539085e+16,
-    "train_loss": 0.21721052708159796,
-    "train_runtime": 3074.1411,
-    "train_samples": 867042,
-    "train_samples_per_second": 846.131,
-    "train_steps_per_second": 13.221
 }

 {
     "epoch": 3.0,
+    "total_flos": 8.56700972907817e+16,
+    "train_loss": 0.16141534491764947,
+    "train_runtime": 8977.4486,
+    "train_samples": 1734084,
+    "train_samples_per_second": 579.48,
+    "train_steps_per_second": 18.109
 }

trainer_state.json CHANGED Viewed

@@ -1,619 +1,2327 @@
 {
-  "best_metric": 0.21882201731204987,
-  "best_model_checkpoint": "multilingual-e5-small-aligned-sentiment/checkpoint-40644",
   "epoch": 3.0,
   "eval_steps": 500,
-  "global_step": 40644,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.03690581635665781,
-      "grad_norm": 6.3122382164001465,
-      "learning_rate": 4.938490306072237e-05,
-      "loss": 0.4391,
       "step": 500
     },
     {
-      "epoch": 0.07381163271331562,
-      "grad_norm": 2.3589205741882324,
-      "learning_rate": 4.876980612144474e-05,
-      "loss": 0.3671,
       "step": 1000
     },
     {
-      "epoch": 0.11071744906997343,
-      "grad_norm": 5.466468811035156,
-      "learning_rate": 4.815470918216711e-05,
-      "loss": 0.3484,
       "step": 1500
     },
     {
-      "epoch": 0.14762326542663123,
-      "grad_norm": 3.5365800857543945,
-      "learning_rate": 4.7539612242889484e-05,
-      "loss": 0.3363,
       "step": 2000
     },
     {
-      "epoch": 0.18452908178328906,
-      "grad_norm": 4.123874187469482,
-      "learning_rate": 4.692451530361185e-05,
-      "loss": 0.339,
       "step": 2500
     },
     {
-      "epoch": 0.22143489813994685,
-      "grad_norm": 3.2261433601379395,
-      "learning_rate": 4.6309418364334224e-05,
-      "loss": 0.3275,
       "step": 3000
     },
     {
-      "epoch": 0.2583407144966047,
-      "grad_norm": 4.198851108551025,
-      "learning_rate": 4.5694321425056594e-05,
-      "loss": 0.3209,
       "step": 3500
     },
     {
-      "epoch": 0.29524653085326247,
-      "grad_norm": 4.112880706787109,
-      "learning_rate": 4.507922448577896e-05,
-      "loss": 0.3161,
       "step": 4000
     },
     {
-      "epoch": 0.33215234720992026,
-      "grad_norm": 2.5054564476013184,
-      "learning_rate": 4.4464127546501335e-05,
-      "loss": 0.3075,
       "step": 4500
     },
     {
-      "epoch": 0.3690581635665781,
-      "grad_norm": 2.706176280975342,
-      "learning_rate": 4.38490306072237e-05,
-      "loss": 0.3003,
       "step": 5000
     },
     {
-      "epoch": 0.4059639799232359,
-      "grad_norm": 4.4395432472229,
-      "learning_rate": 4.323393366794607e-05,
-      "loss": 0.3049,
       "step": 5500
     },
     {
-      "epoch": 0.4428697962798937,
-      "grad_norm": 2.091357946395874,
-      "learning_rate": 4.261883672866844e-05,
-      "loss": 0.3025,
       "step": 6000
     },
     {
-      "epoch": 0.4797756126365515,
-      "grad_norm": 3.8511946201324463,
-      "learning_rate": 4.200373978939081e-05,
-      "loss": 0.2954,
       "step": 6500
     },
     {
-      "epoch": 0.5166814289932093,
-      "grad_norm": 5.480827331542969,
-      "learning_rate": 4.138864285011318e-05,
-      "loss": 0.2906,
       "step": 7000
     },
     {
-      "epoch": 0.5535872453498671,
-      "grad_norm": 2.709707260131836,
-      "learning_rate": 4.077354591083555e-05,
-      "loss": 0.2802,
       "step": 7500
     },
     {
-      "epoch": 0.5904930617065249,
-      "grad_norm": 2.0577099323272705,
-      "learning_rate": 4.015844897155792e-05,
-      "loss": 0.283,
       "step": 8000
     },
     {
-      "epoch": 0.6273988780631827,
-      "grad_norm": 2.7647361755371094,
-      "learning_rate": 3.954335203228029e-05,
-      "loss": 0.2812,
       "step": 8500
     },
     {
-      "epoch": 0.6643046944198405,
-      "grad_norm": 2.9902350902557373,
-      "learning_rate": 3.892825509300266e-05,
-      "loss": 0.2762,
       "step": 9000
     },
     {
-      "epoch": 0.7012105107764984,
-      "grad_norm": 2.2046756744384766,
-      "learning_rate": 3.8313158153725024e-05,
-      "loss": 0.2736,
       "step": 9500
     },
     {
-      "epoch": 0.7381163271331562,
-      "grad_norm": 3.422405958175659,
-      "learning_rate": 3.76980612144474e-05,
-      "loss": 0.2758,
       "step": 10000
     },
     {
-      "epoch": 0.775022143489814,
-      "grad_norm": 5.36775016784668,
-      "learning_rate": 3.708296427516977e-05,
-      "loss": 0.2695,
       "step": 10500
     },
     {
-      "epoch": 0.8119279598464718,
-      "grad_norm": 2.30532169342041,
-      "learning_rate": 3.6467867335892135e-05,
-      "loss": 0.2685,
       "step": 11000
     },
     {
-      "epoch": 0.8488337762031296,
-      "grad_norm": 3.8830413818359375,
-      "learning_rate": 3.585277039661451e-05,
-      "loss": 0.2715,
       "step": 11500
     },
     {
-      "epoch": 0.8857395925597874,
-      "grad_norm": 2.014604091644287,
-      "learning_rate": 3.5237673457336876e-05,
-      "loss": 0.2608,
       "step": 12000
     },
     {
-      "epoch": 0.9226454089164452,
-      "grad_norm": 2.6041476726531982,
-      "learning_rate": 3.4622576518059246e-05,
-      "loss": 0.2632,
       "step": 12500
     },
     {
-      "epoch": 0.959551225273103,
-      "grad_norm": 3.3516054153442383,
-      "learning_rate": 3.400747957878162e-05,
-      "loss": 0.2621,
       "step": 13000
     },
     {
-      "epoch": 0.9964570416297609,
-      "grad_norm": 2.724219799041748,
-      "learning_rate": 3.3392382639503986e-05,
-      "loss": 0.2635,
       "step": 13500
     },
     {
-      "epoch": 1.0,
-      "eval_loss": 0.2525743246078491,
-      "eval_mse": 0.2525743009952109,
-      "eval_runtime": 50.6282,
-      "eval_samples_per_second": 1902.852,
-      "eval_steps_per_second": 237.871,
-      "step": 13548
-    },
-    {
-      "epoch": 1.0333628579864187,
-      "grad_norm": 2.4586572647094727,
-      "learning_rate": 3.277728570022636e-05,
-      "loss": 0.2128,
       "step": 14000
     },
     {
-      "epoch": 1.0702686743430765,
-      "grad_norm": 2.403661012649536,
-      "learning_rate": 3.216218876094873e-05,
-      "loss": 0.2062,
       "step": 14500
     },
     {
-      "epoch": 1.1071744906997343,
-      "grad_norm": 1.8662785291671753,
-      "learning_rate": 3.15470918216711e-05,
-      "loss": 0.2088,
       "step": 15000
     },
     {
-      "epoch": 1.144080307056392,
-      "grad_norm": 4.150296688079834,
-      "learning_rate": 3.093199488239347e-05,
-      "loss": 0.203,
       "step": 15500
     },
     {
-      "epoch": 1.1809861234130499,
-      "grad_norm": 2.604682207107544,
-      "learning_rate": 3.0316897943115834e-05,
-      "loss": 0.2002,
       "step": 16000
     },
     {
-      "epoch": 1.2178919397697077,
-      "grad_norm": 2.8031857013702393,
-      "learning_rate": 2.9701801003838208e-05,
-      "loss": 0.2056,
       "step": 16500
     },
     {
-      "epoch": 1.2547977561263655,
-      "grad_norm": 4.056972503662109,
-      "learning_rate": 2.9086704064560578e-05,
-      "loss": 0.2067,
       "step": 17000
     },
     {
-      "epoch": 1.2917035724830233,
-      "grad_norm": 2.9248251914978027,
-      "learning_rate": 2.8471607125282945e-05,
-      "loss": 0.2049,
       "step": 17500
     },
     {
-      "epoch": 1.328609388839681,
-      "grad_norm": 1.4066252708435059,
-      "learning_rate": 2.7856510186005312e-05,
-      "loss": 0.201,
       "step": 18000
     },
     {
-      "epoch": 1.3655152051963388,
-      "grad_norm": 1.4685883522033691,
-      "learning_rate": 2.7241413246727686e-05,
-      "loss": 0.1983,
       "step": 18500
     },
     {
-      "epoch": 1.4024210215529966,
-      "grad_norm": 1.9358257055282593,
-      "learning_rate": 2.6626316307450056e-05,
-      "loss": 0.1983,
       "step": 19000
     },
     {
-      "epoch": 1.4393268379096544,
-      "grad_norm": 1.5204322338104248,
-      "learning_rate": 2.6011219368172423e-05,
-      "loss": 0.2032,
       "step": 19500
     },
     {
-      "epoch": 1.4762326542663124,
-      "grad_norm": 2.7880804538726807,
-      "learning_rate": 2.5396122428894797e-05,
-      "loss": 0.2045,
       "step": 20000
     },
     {
-      "epoch": 1.51313847062297,
-      "grad_norm": 2.1745991706848145,
-      "learning_rate": 2.4781025489617167e-05,
-      "loss": 0.1979,
       "step": 20500
     },
     {
-      "epoch": 1.550044286979628,
-      "grad_norm": 1.6532700061798096,
-      "learning_rate": 2.4165928550339534e-05,
-      "loss": 0.1979,
       "step": 21000
     },
     {
-      "epoch": 1.5869501033362858,
-      "grad_norm": 2.7065317630767822,
-      "learning_rate": 2.3550831611061904e-05,
-      "loss": 0.1958,
       "step": 21500
     },
     {
-      "epoch": 1.6238559196929436,
-      "grad_norm": 2.1913399696350098,
-      "learning_rate": 2.2935734671784274e-05,
-      "loss": 0.1958,
       "step": 22000
     },
     {
-      "epoch": 1.6607617360496014,
-      "grad_norm": 2.5118260383605957,
-      "learning_rate": 2.2320637732506645e-05,
-      "loss": 0.2016,
       "step": 22500
     },
     {
-      "epoch": 1.6976675524062592,
-      "grad_norm": 1.4727787971496582,
-      "learning_rate": 2.1705540793229015e-05,
-      "loss": 0.1965,
       "step": 23000
     },
     {
-      "epoch": 1.734573368762917,
-      "grad_norm": 2.5935685634613037,
-      "learning_rate": 2.1090443853951382e-05,
-      "loss": 0.1936,
       "step": 23500
     },
     {
-      "epoch": 1.7714791851195748,
-      "grad_norm": 1.701431155204773,
-      "learning_rate": 2.0475346914673755e-05,
-      "loss": 0.1982,
       "step": 24000
     },
     {
-      "epoch": 1.8083850014762326,
-      "grad_norm": 2.9000027179718018,
-      "learning_rate": 1.9860249975396122e-05,
-      "loss": 0.1955,
       "step": 24500
     },
     {
-      "epoch": 1.8452908178328906,
-      "grad_norm": 3.60319447517395,
-      "learning_rate": 1.9245153036118493e-05,
-      "loss": 0.1962,
       "step": 25000
     },
     {
-      "epoch": 1.8821966341895484,
-      "grad_norm": 2.8174662590026855,
-      "learning_rate": 1.8630056096840863e-05,
-      "loss": 0.1918,
       "step": 25500
     },
     {
-      "epoch": 1.9191024505462062,
-      "grad_norm": 3.1348931789398193,
-      "learning_rate": 1.8014959157563233e-05,
-      "loss": 0.1951,
       "step": 26000
     },
     {
-      "epoch": 1.956008266902864,
-      "grad_norm": 2.737175941467285,
-      "learning_rate": 1.7399862218285603e-05,
-      "loss": 0.1885,
       "step": 26500
     },
     {
-      "epoch": 1.9929140832595218,
-      "grad_norm": 3.2441189289093018,
-      "learning_rate": 1.678476527900797e-05,
-      "loss": 0.1944,
       "step": 27000
     },
     {
-      "epoch": 2.0,
-      "eval_loss": 0.2276565432548523,
-      "eval_mse": 0.22765655234639334,
-      "eval_runtime": 50.6433,
-      "eval_samples_per_second": 1902.284,
-      "eval_steps_per_second": 237.8,
-      "step": 27096
-    },
-    {
-      "epoch": 2.0298198996161796,
-      "grad_norm": 2.415947198867798,
-      "learning_rate": 1.6169668339730344e-05,
-      "loss": 0.1606,
       "step": 27500
     },
     {
-      "epoch": 2.0667257159728374,
-      "grad_norm": 2.8117918968200684,
-      "learning_rate": 1.555457140045271e-05,
-      "loss": 0.1566,
       "step": 28000
     },
     {
-      "epoch": 2.103631532329495,
-      "grad_norm": 2.8294386863708496,
-      "learning_rate": 1.4939474461175081e-05,
-      "loss": 0.1549,
       "step": 28500
     },
     {
-      "epoch": 2.140537348686153,
-      "grad_norm": 2.073002576828003,
-      "learning_rate": 1.4324377521897453e-05,
-      "loss": 0.151,
       "step": 29000
     },
     {
-      "epoch": 2.1774431650428108,
-      "grad_norm": 2.204664707183838,
-      "learning_rate": 1.3709280582619822e-05,
-      "loss": 0.1558,
       "step": 29500
     },
     {
-      "epoch": 2.2143489813994686,
-      "grad_norm": 2.2928786277770996,
-      "learning_rate": 1.3094183643342192e-05,
-      "loss": 0.1531,
       "step": 30000
     },
     {
-      "epoch": 2.2512547977561264,
-      "grad_norm": 4.089919567108154,
-      "learning_rate": 1.2479086704064562e-05,
-      "loss": 0.1578,
       "step": 30500
     },
     {
-      "epoch": 2.288160614112784,
-      "grad_norm": 3.0547707080841064,
-      "learning_rate": 1.186398976478693e-05,
-      "loss": 0.1551,
       "step": 31000
     },
     {
-      "epoch": 2.325066430469442,
-      "grad_norm": 1.791717767715454,
-      "learning_rate": 1.1248892825509301e-05,
-      "loss": 0.1538,
       "step": 31500
     },
     {
-      "epoch": 2.3619722468260997,
-      "grad_norm": 1.498639702796936,
-      "learning_rate": 1.0633795886231671e-05,
-      "loss": 0.1533,
       "step": 32000
     },
     {
-      "epoch": 2.3988780631827575,
-      "grad_norm": 1.8389638662338257,
-      "learning_rate": 1.001869894695404e-05,
-      "loss": 0.1536,
       "step": 32500
     },
     {
-      "epoch": 2.4357838795394153,
-      "grad_norm": 2.8968076705932617,
-      "learning_rate": 9.40360200767641e-06,
-      "loss": 0.1518,
       "step": 33000
     },
     {
-      "epoch": 2.472689695896073,
-      "grad_norm": 1.8149243593215942,
-      "learning_rate": 8.78850506839878e-06,
-      "loss": 0.1508,
       "step": 33500
     },
     {
-      "epoch": 2.509595512252731,
-      "grad_norm": 2.4595253467559814,
-      "learning_rate": 8.17340812912115e-06,
-      "loss": 0.1509,
       "step": 34000
     },
     {
-      "epoch": 2.5465013286093887,
-      "grad_norm": 2.2790329456329346,
-      "learning_rate": 7.55831118984352e-06,
-      "loss": 0.1495,
       "step": 34500
     },
     {
-      "epoch": 2.5834071449660465,
-      "grad_norm": 2.1698362827301025,
-      "learning_rate": 6.94321425056589e-06,
-      "loss": 0.1507,
       "step": 35000
     },
     {
-      "epoch": 2.6203129613227043,
-      "grad_norm": 1.564191460609436,
-      "learning_rate": 6.328117311288259e-06,
-      "loss": 0.1512,
       "step": 35500
     },
     {
-      "epoch": 2.657218777679362,
-      "grad_norm": 1.279205322265625,
-      "learning_rate": 5.713020372010629e-06,
-      "loss": 0.1479,
       "step": 36000
     },
     {
-      "epoch": 2.69412459403602,
-      "grad_norm": 2.1811535358428955,
-      "learning_rate": 5.097923432732999e-06,
-      "loss": 0.1508,
       "step": 36500
     },
     {
-      "epoch": 2.7310304103926777,
-      "grad_norm": 2.391449451446533,
-      "learning_rate": 4.482826493455368e-06,
-      "loss": 0.1444,
       "step": 37000
     },
     {
-      "epoch": 2.7679362267493355,
-      "grad_norm": 1.848325490951538,
-      "learning_rate": 3.8677295541777385e-06,
-      "loss": 0.1487,
       "step": 37500
     },
     {
-      "epoch": 2.8048420431059933,
-      "grad_norm": 2.8446269035339355,
-      "learning_rate": 3.2526326149001084e-06,
-      "loss": 0.1497,
       "step": 38000
     },
     {
-      "epoch": 2.841747859462651,
-      "grad_norm": 2.272193670272827,
-      "learning_rate": 2.6375356756224782e-06,
-      "loss": 0.1506,
       "step": 38500
     },
     {
-      "epoch": 2.878653675819309,
-      "grad_norm": 2.2728445529937744,
-      "learning_rate": 2.022438736344848e-06,
-      "loss": 0.1497,
       "step": 39000
     },
     {
-      "epoch": 2.9155594921759667,
-      "grad_norm": 1.8776350021362305,
-      "learning_rate": 1.4073417970672177e-06,
-      "loss": 0.1465,
       "step": 39500
     },
     {
-      "epoch": 2.952465308532625,
-      "grad_norm": 1.9717949628829956,
-      "learning_rate": 7.922448577895876e-07,
-      "loss": 0.1493,
       "step": 40000
     },
     {
-      "epoch": 2.9893711248892827,
-      "grad_norm": 2.8036680221557617,
-      "learning_rate": 1.771479185119575e-07,
-      "loss": 0.1489,
       "step": 40500
     },
     {
       "epoch": 3.0,
-      "eval_loss": 0.21882201731204987,
-      "eval_mse": 0.21882200171115507,
-      "eval_runtime": 50.9689,
-      "eval_samples_per_second": 1890.132,
-      "eval_steps_per_second": 236.281,
-      "step": 40644
     },
     {
       "epoch": 3.0,
-      "step": 40644,
-      "total_flos": 4.283504864539085e+16,
-      "train_loss": 0.21721052708159796,
-      "train_runtime": 3074.1411,
-      "train_samples_per_second": 846.131,
-      "train_steps_per_second": 13.221
     }
   ],
   "logging_steps": 500,
-  "max_steps": 40644,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 500,
@@ -629,8 +2337,8 @@
       "attributes": {}
     }
   },
-  "total_flos": 4.283504864539085e+16,
-  "train_batch_size": 64,
   "trial_name": null,
   "trial_params": null
 }

 {
+  "best_metric": 0.20824576914310455,
+  "best_model_checkpoint": "multilingual-e5-small-aligned-transformed-sentiment/checkpoint-162573",
   "epoch": 3.0,
   "eval_steps": 500,
+  "global_step": 162573,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.009226624347216328,
+      "grad_norm": 4.861542224884033,
+      "learning_rate": 4.98462229275464e-05,
+      "loss": 0.4705,
       "step": 500
     },
     {
+      "epoch": 0.018453248694432656,
+      "grad_norm": 4.163719177246094,
+      "learning_rate": 4.969244585509279e-05,
+      "loss": 0.3878,
       "step": 1000
     },
     {
+      "epoch": 0.027679873041648984,
+      "grad_norm": 5.356781482696533,
+      "learning_rate": 4.9538668782639185e-05,
+      "loss": 0.3727,
       "step": 1500
     },
     {
+      "epoch": 0.03690649738886531,
+      "grad_norm": 4.754086017608643,
+      "learning_rate": 4.9384891710185583e-05,
+      "loss": 0.3647,
       "step": 2000
     },
     {
+      "epoch": 0.04613312173608164,
+      "grad_norm": 4.055091857910156,
+      "learning_rate": 4.9231114637731975e-05,
+      "loss": 0.3508,
       "step": 2500
     },
     {
+      "epoch": 0.05535974608329797,
+      "grad_norm": 4.6239824295043945,
+      "learning_rate": 4.907733756527837e-05,
+      "loss": 0.3418,
       "step": 3000
     },
     {
+      "epoch": 0.0645863704305143,
+      "grad_norm": 3.2537200450897217,
+      "learning_rate": 4.8923560492824766e-05,
+      "loss": 0.3471,
       "step": 3500
     },
     {
+      "epoch": 0.07381299477773062,
+      "grad_norm": 8.821883201599121,
+      "learning_rate": 4.876978342037116e-05,
+      "loss": 0.3448,
       "step": 4000
     },
     {
+      "epoch": 0.08303961912494695,
+      "grad_norm": 4.157027244567871,
+      "learning_rate": 4.8616006347917556e-05,
+      "loss": 0.3304,
       "step": 4500
     },
     {
+      "epoch": 0.09226624347216328,
+      "grad_norm": 5.966025352478027,
+      "learning_rate": 4.846222927546395e-05,
+      "loss": 0.3288,
       "step": 5000
     },
     {
+      "epoch": 0.1014928678193796,
+      "grad_norm": 2.689772367477417,
+      "learning_rate": 4.830845220301034e-05,
+      "loss": 0.315,
       "step": 5500
     },
     {
+      "epoch": 0.11071949216659593,
+      "grad_norm": 4.417109966278076,
+      "learning_rate": 4.815467513055674e-05,
+      "loss": 0.3171,
       "step": 6000
     },
     {
+      "epoch": 0.11994611651381226,
+      "grad_norm": 2.441032886505127,
+      "learning_rate": 4.800089805810313e-05,
+      "loss": 0.3111,
       "step": 6500
     },
     {
+      "epoch": 0.1291727408610286,
+      "grad_norm": 5.741962432861328,
+      "learning_rate": 4.784712098564952e-05,
+      "loss": 0.3161,
       "step": 7000
     },
     {
+      "epoch": 0.13839936520824492,
+      "grad_norm": 3.6779587268829346,
+      "learning_rate": 4.769334391319592e-05,
+      "loss": 0.3172,
       "step": 7500
     },
     {
+      "epoch": 0.14762598955546125,
+      "grad_norm": 3.389577627182007,
+      "learning_rate": 4.753956684074232e-05,
+      "loss": 0.3089,
       "step": 8000
     },
     {
+      "epoch": 0.15685261390267757,
+      "grad_norm": 6.343785285949707,
+      "learning_rate": 4.7385789768288705e-05,
+      "loss": 0.3038,
       "step": 8500
     },
     {
+      "epoch": 0.1660792382498939,
+      "grad_norm": 3.811483383178711,
+      "learning_rate": 4.7232012695835104e-05,
+      "loss": 0.2977,
       "step": 9000
     },
     {
+      "epoch": 0.17530586259711023,
+      "grad_norm": 3.0193912982940674,
+      "learning_rate": 4.70782356233815e-05,
+      "loss": 0.302,
       "step": 9500
     },
     {
+      "epoch": 0.18453248694432656,
+      "grad_norm": 5.484386444091797,
+      "learning_rate": 4.6924458550927894e-05,
+      "loss": 0.3041,
       "step": 10000
     },
     {
+      "epoch": 0.19375911129154288,
+      "grad_norm": 4.629725933074951,
+      "learning_rate": 4.6770681478474286e-05,
+      "loss": 0.3036,
       "step": 10500
     },
     {
+      "epoch": 0.2029857356387592,
+      "grad_norm": 9.174530982971191,
+      "learning_rate": 4.6616904406020685e-05,
+      "loss": 0.2884,
       "step": 11000
     },
     {
+      "epoch": 0.21221235998597554,
+      "grad_norm": 11.994110107421875,
+      "learning_rate": 4.6463127333567077e-05,
+      "loss": 0.2913,
       "step": 11500
     },
     {
+      "epoch": 0.22143898433319187,
+      "grad_norm": 4.78723669052124,
+      "learning_rate": 4.630935026111347e-05,
+      "loss": 0.2903,
       "step": 12000
     },
     {
+      "epoch": 0.2306656086804082,
+      "grad_norm": 4.056216239929199,
+      "learning_rate": 4.615557318865987e-05,
+      "loss": 0.2885,
       "step": 12500
     },
     {
+      "epoch": 0.23989223302762452,
+      "grad_norm": 2.5596282482147217,
+      "learning_rate": 4.600179611620626e-05,
+      "loss": 0.2865,
       "step": 13000
     },
     {
+      "epoch": 0.24911885737484085,
+      "grad_norm": 3.354088544845581,
+      "learning_rate": 4.584801904375266e-05,
+      "loss": 0.2801,
       "step": 13500
     },
     {
+      "epoch": 0.2583454817220572,
+      "grad_norm": 2.7451272010803223,
+      "learning_rate": 4.569424197129905e-05,
+      "loss": 0.2804,
       "step": 14000
     },
     {
+      "epoch": 0.2675721060692735,
+      "grad_norm": 2.0492589473724365,
+      "learning_rate": 4.554046489884544e-05,
+      "loss": 0.2805,
       "step": 14500
     },
     {
+      "epoch": 0.27679873041648984,
+      "grad_norm": 2.7824437618255615,
+      "learning_rate": 4.538668782639184e-05,
+      "loss": 0.276,
       "step": 15000
     },
     {
+      "epoch": 0.28602535476370616,
+      "grad_norm": 2.753225326538086,
+      "learning_rate": 4.523291075393823e-05,
+      "loss": 0.2779,
       "step": 15500
     },
     {
+      "epoch": 0.2952519791109225,
+      "grad_norm": 4.156832218170166,
+      "learning_rate": 4.5079133681484624e-05,
+      "loss": 0.2776,
       "step": 16000
     },
     {
+      "epoch": 0.3044786034581388,
+      "grad_norm": 1.4764610528945923,
+      "learning_rate": 4.492535660903102e-05,
+      "loss": 0.2777,
       "step": 16500
     },
     {
+      "epoch": 0.31370522780535515,
+      "grad_norm": 2.732165813446045,
+      "learning_rate": 4.477157953657742e-05,
+      "loss": 0.2759,
       "step": 17000
     },
     {
+      "epoch": 0.3229318521525715,
+      "grad_norm": 3.756098508834839,
+      "learning_rate": 4.4617802464123806e-05,
+      "loss": 0.2707,
       "step": 17500
     },
     {
+      "epoch": 0.3321584764997878,
+      "grad_norm": 3.7828195095062256,
+      "learning_rate": 4.4464025391670205e-05,
+      "loss": 0.2692,
       "step": 18000
     },
     {
+      "epoch": 0.34138510084700413,
+      "grad_norm": 6.942204475402832,
+      "learning_rate": 4.4310248319216603e-05,
+      "loss": 0.2722,
       "step": 18500
     },
     {
+      "epoch": 0.35061172519422046,
+      "grad_norm": 2.0811824798583984,
+      "learning_rate": 4.4156471246762995e-05,
+      "loss": 0.2624,
       "step": 19000
     },
     {
+      "epoch": 0.3598383495414368,
+      "grad_norm": 2.2063019275665283,
+      "learning_rate": 4.400269417430939e-05,
+      "loss": 0.2567,
       "step": 19500
     },
     {
+      "epoch": 0.3690649738886531,
+      "grad_norm": 3.381683826446533,
+      "learning_rate": 4.3848917101855786e-05,
+      "loss": 0.2623,
       "step": 20000
     },
     {
+      "epoch": 0.37829159823586944,
+      "grad_norm": 2.4916694164276123,
+      "learning_rate": 4.369514002940218e-05,
+      "loss": 0.2665,
       "step": 20500
     },
     {
+      "epoch": 0.38751822258308577,
+      "grad_norm": 3.138047695159912,
+      "learning_rate": 4.354136295694857e-05,
+      "loss": 0.251,
       "step": 21000
     },
     {
+      "epoch": 0.3967448469303021,
+      "grad_norm": 4.300042152404785,
+      "learning_rate": 4.338758588449497e-05,
+      "loss": 0.2542,
       "step": 21500
     },
     {
+      "epoch": 0.4059714712775184,
+      "grad_norm": 4.118566513061523,
+      "learning_rate": 4.323380881204136e-05,
+      "loss": 0.2545,
       "step": 22000
     },
     {
+      "epoch": 0.41519809562473475,
+      "grad_norm": 3.6837940216064453,
+      "learning_rate": 4.308003173958776e-05,
+      "loss": 0.2506,
       "step": 22500
     },
     {
+      "epoch": 0.4244247199719511,
+      "grad_norm": 3.9393532276153564,
+      "learning_rate": 4.292625466713415e-05,
+      "loss": 0.248,
       "step": 23000
     },
     {
+      "epoch": 0.4336513443191674,
+      "grad_norm": 4.186630725860596,
+      "learning_rate": 4.277247759468054e-05,
+      "loss": 0.2574,
       "step": 23500
     },
     {
+      "epoch": 0.44287796866638374,
+      "grad_norm": 2.1121768951416016,
+      "learning_rate": 4.261870052222694e-05,
+      "loss": 0.2552,
       "step": 24000
     },
     {
+      "epoch": 0.45210459301360006,
+      "grad_norm": 3.940450429916382,
+      "learning_rate": 4.246492344977333e-05,
+      "loss": 0.2449,
       "step": 24500
     },
     {
+      "epoch": 0.4613312173608164,
+      "grad_norm": 3.8467142581939697,
+      "learning_rate": 4.2311146377319725e-05,
+      "loss": 0.2497,
       "step": 25000
     },
     {
+      "epoch": 0.4705578417080327,
+      "grad_norm": 4.122659683227539,
+      "learning_rate": 4.2157369304866124e-05,
+      "loss": 0.2502,
       "step": 25500
     },
     {
+      "epoch": 0.47978446605524905,
+      "grad_norm": 4.005275249481201,
+      "learning_rate": 4.200359223241252e-05,
+      "loss": 0.239,
       "step": 26000
     },
     {
+      "epoch": 0.4890110904024654,
+      "grad_norm": 3.944265365600586,
+      "learning_rate": 4.184981515995891e-05,
+      "loss": 0.2495,
       "step": 26500
     },
     {
+      "epoch": 0.4982377147496817,
+      "grad_norm": 5.242092609405518,
+      "learning_rate": 4.1696038087505306e-05,
+      "loss": 0.2504,
       "step": 27000
     },
     {
+      "epoch": 0.507464339096898,
+      "grad_norm": 3.0890393257141113,
+      "learning_rate": 4.1542261015051705e-05,
+      "loss": 0.2424,
       "step": 27500
     },
     {
+      "epoch": 0.5166909634441144,
+      "grad_norm": 2.5902299880981445,
+      "learning_rate": 4.1388483942598097e-05,
+      "loss": 0.2432,
       "step": 28000
     },
     {
+      "epoch": 0.5259175877913307,
+      "grad_norm": 2.1534225940704346,
+      "learning_rate": 4.123470687014449e-05,
+      "loss": 0.2423,
       "step": 28500
     },
     {
+      "epoch": 0.535144212138547,
+      "grad_norm": 4.093803405761719,
+      "learning_rate": 4.108092979769089e-05,
+      "loss": 0.2393,
       "step": 29000
     },
     {
+      "epoch": 0.5443708364857633,
+      "grad_norm": 5.0820722579956055,
+      "learning_rate": 4.092715272523728e-05,
+      "loss": 0.2355,
       "step": 29500
     },
     {
+      "epoch": 0.5535974608329797,
+      "grad_norm": 3.2006969451904297,
+      "learning_rate": 4.077337565278367e-05,
+      "loss": 0.2378,
       "step": 30000
     },
     {
+      "epoch": 0.562824085180196,
+      "grad_norm": 2.7393364906311035,
+      "learning_rate": 4.061959858033007e-05,
+      "loss": 0.2391,
       "step": 30500
     },
     {
+      "epoch": 0.5720507095274123,
+      "grad_norm": 5.7313361167907715,
+      "learning_rate": 4.046582150787646e-05,
+      "loss": 0.2378,
       "step": 31000
     },
     {
+      "epoch": 0.5812773338746287,
+      "grad_norm": 3.5704684257507324,
+      "learning_rate": 4.031204443542286e-05,
+      "loss": 0.2416,
       "step": 31500
     },
     {
+      "epoch": 0.590503958221845,
+      "grad_norm": 3.010260820388794,
+      "learning_rate": 4.015826736296925e-05,
+      "loss": 0.2315,
       "step": 32000
     },
     {
+      "epoch": 0.5997305825690613,
+      "grad_norm": 6.030303001403809,
+      "learning_rate": 4.0004490290515644e-05,
+      "loss": 0.2346,
       "step": 32500
     },
     {
+      "epoch": 0.6089572069162776,
+      "grad_norm": 2.6332879066467285,
+      "learning_rate": 3.985071321806204e-05,
+      "loss": 0.2387,
       "step": 33000
     },
     {
+      "epoch": 0.618183831263494,
+      "grad_norm": 3.79506254196167,
+      "learning_rate": 3.9696936145608434e-05,
+      "loss": 0.2314,
       "step": 33500
     },
     {
+      "epoch": 0.6274104556107103,
+      "grad_norm": 3.9026734828948975,
+      "learning_rate": 3.9543159073154826e-05,
+      "loss": 0.2265,
       "step": 34000
     },
     {
+      "epoch": 0.6366370799579266,
+      "grad_norm": 7.885356426239014,
+      "learning_rate": 3.9389382000701225e-05,
+      "loss": 0.2288,
       "step": 34500
     },
     {
+      "epoch": 0.645863704305143,
+      "grad_norm": 3.634693145751953,
+      "learning_rate": 3.9235604928247623e-05,
+      "loss": 0.2269,
       "step": 35000
     },
     {
+      "epoch": 0.6550903286523593,
+      "grad_norm": 4.571321487426758,
+      "learning_rate": 3.9081827855794015e-05,
+      "loss": 0.226,
       "step": 35500
     },
     {
+      "epoch": 0.6643169529995756,
+      "grad_norm": 4.4402337074279785,
+      "learning_rate": 3.892805078334041e-05,
+      "loss": 0.2227,
       "step": 36000
     },
     {
+      "epoch": 0.6735435773467919,
+      "grad_norm": 2.3273956775665283,
+      "learning_rate": 3.8774273710886806e-05,
+      "loss": 0.2328,
       "step": 36500
     },
     {
+      "epoch": 0.6827702016940083,
+      "grad_norm": 7.7202372550964355,
+      "learning_rate": 3.86204966384332e-05,
+      "loss": 0.2242,
       "step": 37000
     },
     {
+      "epoch": 0.6919968260412246,
+      "grad_norm": 3.037423849105835,
+      "learning_rate": 3.846671956597959e-05,
+      "loss": 0.2219,
       "step": 37500
     },
     {
+      "epoch": 0.7012234503884409,
+      "grad_norm": 3.3124380111694336,
+      "learning_rate": 3.831294249352599e-05,
+      "loss": 0.2217,
       "step": 38000
     },
     {
+      "epoch": 0.7104500747356572,
+      "grad_norm": 1.5552330017089844,
+      "learning_rate": 3.815916542107238e-05,
+      "loss": 0.2237,
       "step": 38500
     },
     {
+      "epoch": 0.7196766990828736,
+      "grad_norm": 3.6003737449645996,
+      "learning_rate": 3.800538834861878e-05,
+      "loss": 0.2212,
       "step": 39000
     },
     {
+      "epoch": 0.7289033234300899,
+      "grad_norm": 2.323984146118164,
+      "learning_rate": 3.785161127616517e-05,
+      "loss": 0.2217,
       "step": 39500
     },
     {
+      "epoch": 0.7381299477773062,
+      "grad_norm": 4.002011775970459,
+      "learning_rate": 3.769783420371156e-05,
+      "loss": 0.2178,
       "step": 40000
     },
     {
+      "epoch": 0.7473565721245226,
+      "grad_norm": 9.153217315673828,
+      "learning_rate": 3.754405713125796e-05,
+      "loss": 0.2156,
       "step": 40500
     },
+    {
+      "epoch": 0.7565831964717389,
+      "grad_norm": 4.3000712394714355,
+      "learning_rate": 3.739028005880435e-05,
+      "loss": 0.2202,
+      "step": 41000
+    },
+    {
+      "epoch": 0.7658098208189552,
+      "grad_norm": 5.20850944519043,
+      "learning_rate": 3.7236502986350745e-05,
+      "loss": 0.2156,
+      "step": 41500
+    },
+    {
+      "epoch": 0.7750364451661715,
+      "grad_norm": 3.736025810241699,
+      "learning_rate": 3.7082725913897144e-05,
+      "loss": 0.2106,
+      "step": 42000
+    },
+    {
+      "epoch": 0.7842630695133879,
+      "grad_norm": 4.413645267486572,
+      "learning_rate": 3.692894884144354e-05,
+      "loss": 0.2154,
+      "step": 42500
+    },
+    {
+      "epoch": 0.7934896938606042,
+      "grad_norm": 3.298003911972046,
+      "learning_rate": 3.677517176898993e-05,
+      "loss": 0.2106,
+      "step": 43000
+    },
+    {
+      "epoch": 0.8027163182078205,
+      "grad_norm": 2.9312047958374023,
+      "learning_rate": 3.6621394696536326e-05,
+      "loss": 0.2043,
+      "step": 43500
+    },
+    {
+      "epoch": 0.8119429425550369,
+      "grad_norm": 4.253361701965332,
+      "learning_rate": 3.6467617624082725e-05,
+      "loss": 0.2131,
+      "step": 44000
+    },
+    {
+      "epoch": 0.8211695669022532,
+      "grad_norm": 2.0434412956237793,
+      "learning_rate": 3.6313840551629117e-05,
+      "loss": 0.2144,
+      "step": 44500
+    },
+    {
+      "epoch": 0.8303961912494695,
+      "grad_norm": 3.0040202140808105,
+      "learning_rate": 3.616006347917551e-05,
+      "loss": 0.2124,
+      "step": 45000
+    },
+    {
+      "epoch": 0.8396228155966858,
+      "grad_norm": 3.3966643810272217,
+      "learning_rate": 3.600628640672191e-05,
+      "loss": 0.2077,
+      "step": 45500
+    },
+    {
+      "epoch": 0.8488494399439022,
+      "grad_norm": 2.4415907859802246,
+      "learning_rate": 3.58525093342683e-05,
+      "loss": 0.2049,
+      "step": 46000
+    },
+    {
+      "epoch": 0.8580760642911185,
+      "grad_norm": 3.1614882946014404,
+      "learning_rate": 3.569873226181469e-05,
+      "loss": 0.2073,
+      "step": 46500
+    },
+    {
+      "epoch": 0.8673026886383348,
+      "grad_norm": 4.641379356384277,
+      "learning_rate": 3.554495518936109e-05,
+      "loss": 0.2025,
+      "step": 47000
+    },
+    {
+      "epoch": 0.8765293129855511,
+      "grad_norm": 3.275320529937744,
+      "learning_rate": 3.539117811690748e-05,
+      "loss": 0.206,
+      "step": 47500
+    },
+    {
+      "epoch": 0.8857559373327675,
+      "grad_norm": 2.602555274963379,
+      "learning_rate": 3.523740104445388e-05,
+      "loss": 0.2053,
+      "step": 48000
+    },
+    {
+      "epoch": 0.8949825616799838,
+      "grad_norm": 3.3625969886779785,
+      "learning_rate": 3.508362397200027e-05,
+      "loss": 0.2031,
+      "step": 48500
+    },
+    {
+      "epoch": 0.9042091860272001,
+      "grad_norm": 2.0234267711639404,
+      "learning_rate": 3.4929846899546664e-05,
+      "loss": 0.1981,
+      "step": 49000
+    },
+    {
+      "epoch": 0.9134358103744165,
+      "grad_norm": 2.6035192012786865,
+      "learning_rate": 3.477606982709306e-05,
+      "loss": 0.2013,
+      "step": 49500
+    },
+    {
+      "epoch": 0.9226624347216328,
+      "grad_norm": 5.516040802001953,
+      "learning_rate": 3.4622292754639454e-05,
+      "loss": 0.2063,
+      "step": 50000
+    },
+    {
+      "epoch": 0.9318890590688491,
+      "grad_norm": 4.573687553405762,
+      "learning_rate": 3.4468515682185846e-05,
+      "loss": 0.2044,
+      "step": 50500
+    },
+    {
+      "epoch": 0.9411156834160654,
+      "grad_norm": 3.124086856842041,
+      "learning_rate": 3.4314738609732245e-05,
+      "loss": 0.1937,
+      "step": 51000
+    },
+    {
+      "epoch": 0.9503423077632818,
+      "grad_norm": 4.916173458099365,
+      "learning_rate": 3.4160961537278643e-05,
+      "loss": 0.1959,
+      "step": 51500
+    },
+    {
+      "epoch": 0.9595689321104981,
+      "grad_norm": 3.445047378540039,
+      "learning_rate": 3.400718446482503e-05,
+      "loss": 0.1999,
+      "step": 52000
+    },
+    {
+      "epoch": 0.9687955564577144,
+      "grad_norm": 2.2390198707580566,
+      "learning_rate": 3.385340739237143e-05,
+      "loss": 0.1887,
+      "step": 52500
+    },
+    {
+      "epoch": 0.9780221808049308,
+      "grad_norm": 6.404945373535156,
+      "learning_rate": 3.3699630319917826e-05,
+      "loss": 0.1963,
+      "step": 53000
+    },
+    {
+      "epoch": 0.9872488051521471,
+      "grad_norm": 3.268970251083374,
+      "learning_rate": 3.354585324746422e-05,
+      "loss": 0.1958,
+      "step": 53500
+    },
+    {
+      "epoch": 0.9964754294993634,
+      "grad_norm": 2.5354039669036865,
+      "learning_rate": 3.339207617501061e-05,
+      "loss": 0.1898,
+      "step": 54000
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.23220877349376678,
+      "eval_mse": 0.2322087733155601,
+      "eval_runtime": 114.203,
+      "eval_samples_per_second": 1687.136,
+      "eval_steps_per_second": 210.896,
+      "step": 54191
+    },
+    {
+      "epoch": 1.0057020538465797,
+      "grad_norm": 2.1543655395507812,
+      "learning_rate": 3.323829910255701e-05,
+      "loss": 0.1712,
+      "step": 54500
+    },
+    {
+      "epoch": 1.014928678193796,
+      "grad_norm": 2.780333995819092,
+      "learning_rate": 3.30845220301034e-05,
+      "loss": 0.1574,
+      "step": 55000
+    },
+    {
+      "epoch": 1.0241553025410124,
+      "grad_norm": 5.817172527313232,
+      "learning_rate": 3.293074495764979e-05,
+      "loss": 0.1505,
+      "step": 55500
+    },
+    {
+      "epoch": 1.0333819268882287,
+      "grad_norm": 5.431843280792236,
+      "learning_rate": 3.277696788519619e-05,
+      "loss": 0.1551,
+      "step": 56000
+    },
+    {
+      "epoch": 1.042608551235445,
+      "grad_norm": 2.024513006210327,
+      "learning_rate": 3.262319081274258e-05,
+      "loss": 0.1541,
+      "step": 56500
+    },
+    {
+      "epoch": 1.0518351755826614,
+      "grad_norm": 5.155509948730469,
+      "learning_rate": 3.246941374028898e-05,
+      "loss": 0.1538,
+      "step": 57000
+    },
+    {
+      "epoch": 1.0610617999298777,
+      "grad_norm": 1.8281043767929077,
+      "learning_rate": 3.231563666783537e-05,
+      "loss": 0.1503,
+      "step": 57500
+    },
+    {
+      "epoch": 1.070288424277094,
+      "grad_norm": 3.030827283859253,
+      "learning_rate": 3.2161859595381765e-05,
+      "loss": 0.1535,
+      "step": 58000
+    },
+    {
+      "epoch": 1.0795150486243104,
+      "grad_norm": 3.2830984592437744,
+      "learning_rate": 3.2008082522928164e-05,
+      "loss": 0.1567,
+      "step": 58500
+    },
+    {
+      "epoch": 1.0887416729715267,
+      "grad_norm": 2.756232500076294,
+      "learning_rate": 3.1854305450474555e-05,
+      "loss": 0.1576,
+      "step": 59000
+    },
+    {
+      "epoch": 1.097968297318743,
+      "grad_norm": 2.0984957218170166,
+      "learning_rate": 3.170052837802095e-05,
+      "loss": 0.161,
+      "step": 59500
+    },
+    {
+      "epoch": 1.1071949216659593,
+      "grad_norm": 2.4525437355041504,
+      "learning_rate": 3.1546751305567346e-05,
+      "loss": 0.1542,
+      "step": 60000
+    },
+    {
+      "epoch": 1.1164215460131757,
+      "grad_norm": 2.31719970703125,
+      "learning_rate": 3.1392974233113745e-05,
+      "loss": 0.1528,
+      "step": 60500
+    },
+    {
+      "epoch": 1.125648170360392,
+      "grad_norm": 3.3912220001220703,
+      "learning_rate": 3.123919716066013e-05,
+      "loss": 0.1551,
+      "step": 61000
+    },
+    {
+      "epoch": 1.1348747947076083,
+      "grad_norm": 3.2458841800689697,
+      "learning_rate": 3.108542008820653e-05,
+      "loss": 0.1508,
+      "step": 61500
+    },
+    {
+      "epoch": 1.1441014190548247,
+      "grad_norm": 3.3046302795410156,
+      "learning_rate": 3.093164301575293e-05,
+      "loss": 0.1465,
+      "step": 62000
+    },
+    {
+      "epoch": 1.153328043402041,
+      "grad_norm": 4.0332183837890625,
+      "learning_rate": 3.077786594329932e-05,
+      "loss": 0.1535,
+      "step": 62500
+    },
+    {
+      "epoch": 1.1625546677492573,
+      "grad_norm": 2.0470728874206543,
+      "learning_rate": 3.062408887084571e-05,
+      "loss": 0.1501,
+      "step": 63000
+    },
+    {
+      "epoch": 1.1717812920964736,
+      "grad_norm": 4.00844669342041,
+      "learning_rate": 3.047031179839211e-05,
+      "loss": 0.1556,
+      "step": 63500
+    },
+    {
+      "epoch": 1.18100791644369,
+      "grad_norm": 2.260006904602051,
+      "learning_rate": 3.03165347259385e-05,
+      "loss": 0.1514,
+      "step": 64000
+    },
+    {
+      "epoch": 1.1902345407909063,
+      "grad_norm": 1.3348864316940308,
+      "learning_rate": 3.0162757653484897e-05,
+      "loss": 0.1436,
+      "step": 64500
+    },
+    {
+      "epoch": 1.1994611651381226,
+      "grad_norm": 5.925819396972656,
+      "learning_rate": 3.0008980581031292e-05,
+      "loss": 0.1521,
+      "step": 65000
+    },
+    {
+      "epoch": 1.208687789485339,
+      "grad_norm": 4.659446716308594,
+      "learning_rate": 2.9855203508577684e-05,
+      "loss": 0.1434,
+      "step": 65500
+    },
+    {
+      "epoch": 1.2179144138325553,
+      "grad_norm": 4.0146164894104,
+      "learning_rate": 2.970142643612408e-05,
+      "loss": 0.1503,
+      "step": 66000
+    },
+    {
+      "epoch": 1.2271410381797716,
+      "grad_norm": 1.715017557144165,
+      "learning_rate": 2.9547649363670478e-05,
+      "loss": 0.1499,
+      "step": 66500
+    },
+    {
+      "epoch": 1.236367662526988,
+      "grad_norm": 4.178813457489014,
+      "learning_rate": 2.9393872291216866e-05,
+      "loss": 0.1504,
+      "step": 67000
+    },
+    {
+      "epoch": 1.2455942868742043,
+      "grad_norm": 2.155510663986206,
+      "learning_rate": 2.9240095218763265e-05,
+      "loss": 0.1423,
+      "step": 67500
+    },
+    {
+      "epoch": 1.2548209112214206,
+      "grad_norm": 1.8401468992233276,
+      "learning_rate": 2.908631814630966e-05,
+      "loss": 0.1534,
+      "step": 68000
+    },
+    {
+      "epoch": 1.264047535568637,
+      "grad_norm": 3.5961029529571533,
+      "learning_rate": 2.8932541073856055e-05,
+      "loss": 0.1422,
+      "step": 68500
+    },
+    {
+      "epoch": 1.2732741599158532,
+      "grad_norm": 2.855060338973999,
+      "learning_rate": 2.8778764001402447e-05,
+      "loss": 0.1497,
+      "step": 69000
+    },
+    {
+      "epoch": 1.2825007842630696,
+      "grad_norm": 2.705552816390991,
+      "learning_rate": 2.8624986928948842e-05,
+      "loss": 0.1482,
+      "step": 69500
+    },
+    {
+      "epoch": 1.291727408610286,
+      "grad_norm": 3.748999834060669,
+      "learning_rate": 2.847120985649524e-05,
+      "loss": 0.1516,
+      "step": 70000
+    },
+    {
+      "epoch": 1.3009540329575022,
+      "grad_norm": 2.6836044788360596,
+      "learning_rate": 2.831743278404163e-05,
+      "loss": 0.1476,
+      "step": 70500
+    },
+    {
+      "epoch": 1.3101806573047186,
+      "grad_norm": 1.9708038568496704,
+      "learning_rate": 2.8163655711588028e-05,
+      "loss": 0.1469,
+      "step": 71000
+    },
+    {
+      "epoch": 1.3194072816519349,
+      "grad_norm": 2.0082767009735107,
+      "learning_rate": 2.8009878639134424e-05,
+      "loss": 0.1473,
+      "step": 71500
+    },
+    {
+      "epoch": 1.3286339059991512,
+      "grad_norm": 5.9193830490112305,
+      "learning_rate": 2.7856101566680815e-05,
+      "loss": 0.148,
+      "step": 72000
+    },
+    {
+      "epoch": 1.3378605303463675,
+      "grad_norm": 2.226789951324463,
+      "learning_rate": 2.770232449422721e-05,
+      "loss": 0.1479,
+      "step": 72500
+    },
+    {
+      "epoch": 1.3470871546935839,
+      "grad_norm": 2.320139169692993,
+      "learning_rate": 2.7548547421773606e-05,
+      "loss": 0.141,
+      "step": 73000
+    },
+    {
+      "epoch": 1.3563137790408002,
+      "grad_norm": 1.762904405593872,
+      "learning_rate": 2.7394770349319998e-05,
+      "loss": 0.143,
+      "step": 73500
+    },
+    {
+      "epoch": 1.3655404033880165,
+      "grad_norm": 1.4634217023849487,
+      "learning_rate": 2.7240993276866393e-05,
+      "loss": 0.1417,
+      "step": 74000
+    },
+    {
+      "epoch": 1.3747670277352328,
+      "grad_norm": 1.4410927295684814,
+      "learning_rate": 2.7087216204412792e-05,
+      "loss": 0.1417,
+      "step": 74500
+    },
+    {
+      "epoch": 1.3839936520824492,
+      "grad_norm": 2.7735280990600586,
+      "learning_rate": 2.693343913195918e-05,
+      "loss": 0.1439,
+      "step": 75000
+    },
+    {
+      "epoch": 1.3932202764296655,
+      "grad_norm": 2.384705066680908,
+      "learning_rate": 2.677966205950558e-05,
+      "loss": 0.1437,
+      "step": 75500
+    },
+    {
+      "epoch": 1.4024469007768818,
+      "grad_norm": 3.4809861183166504,
+      "learning_rate": 2.6625884987051974e-05,
+      "loss": 0.1408,
+      "step": 76000
+    },
+    {
+      "epoch": 1.4116735251240982,
+      "grad_norm": 2.29471492767334,
+      "learning_rate": 2.6472107914598366e-05,
+      "loss": 0.1459,
+      "step": 76500
+    },
+    {
+      "epoch": 1.4209001494713145,
+      "grad_norm": 3.0202510356903076,
+      "learning_rate": 2.631833084214476e-05,
+      "loss": 0.1402,
+      "step": 77000
+    },
+    {
+      "epoch": 1.4301267738185308,
+      "grad_norm": 2.7061448097229004,
+      "learning_rate": 2.6164553769691157e-05,
+      "loss": 0.1408,
+      "step": 77500
+    },
+    {
+      "epoch": 1.4393533981657471,
+      "grad_norm": 1.499624252319336,
+      "learning_rate": 2.601077669723755e-05,
+      "loss": 0.1443,
+      "step": 78000
+    },
+    {
+      "epoch": 1.4485800225129635,
+      "grad_norm": 8.131513595581055,
+      "learning_rate": 2.5856999624783944e-05,
+      "loss": 0.1374,
+      "step": 78500
+    },
+    {
+      "epoch": 1.4578066468601798,
+      "grad_norm": 1.652654767036438,
+      "learning_rate": 2.5703222552330342e-05,
+      "loss": 0.1401,
+      "step": 79000
+    },
+    {
+      "epoch": 1.4670332712073961,
+      "grad_norm": 2.2545433044433594,
+      "learning_rate": 2.554944547987673e-05,
+      "loss": 0.1388,
+      "step": 79500
+    },
+    {
+      "epoch": 1.4762598955546125,
+      "grad_norm": 2.1318209171295166,
+      "learning_rate": 2.539566840742313e-05,
+      "loss": 0.1434,
+      "step": 80000
+    },
+    {
+      "epoch": 1.4854865199018288,
+      "grad_norm": 1.8352861404418945,
+      "learning_rate": 2.5241891334969525e-05,
+      "loss": 0.142,
+      "step": 80500
+    },
+    {
+      "epoch": 1.4947131442490451,
+      "grad_norm": 2.1764025688171387,
+      "learning_rate": 2.5088114262515917e-05,
+      "loss": 0.1366,
+      "step": 81000
+    },
+    {
+      "epoch": 1.5039397685962612,
+      "grad_norm": 2.425063371658325,
+      "learning_rate": 2.4934337190062312e-05,
+      "loss": 0.1435,
+      "step": 81500
+    },
+    {
+      "epoch": 1.5131663929434778,
+      "grad_norm": 1.579362154006958,
+      "learning_rate": 2.4780560117608707e-05,
+      "loss": 0.1383,
+      "step": 82000
+    },
+    {
+      "epoch": 1.5223930172906939,
+      "grad_norm": 1.9185165166854858,
+      "learning_rate": 2.46267830451551e-05,
+      "loss": 0.1357,
+      "step": 82500
+    },
+    {
+      "epoch": 1.5316196416379104,
+      "grad_norm": 1.506785273551941,
+      "learning_rate": 2.4473005972701498e-05,
+      "loss": 0.1366,
+      "step": 83000
+    },
+    {
+      "epoch": 1.5408462659851265,
+      "grad_norm": 2.999217987060547,
+      "learning_rate": 2.431922890024789e-05,
+      "loss": 0.1374,
+      "step": 83500
+    },
+    {
+      "epoch": 1.550072890332343,
+      "grad_norm": 1.4639360904693604,
+      "learning_rate": 2.4165451827794285e-05,
+      "loss": 0.1339,
+      "step": 84000
+    },
+    {
+      "epoch": 1.5592995146795592,
+      "grad_norm": 3.4754111766815186,
+      "learning_rate": 2.401167475534068e-05,
+      "loss": 0.1288,
+      "step": 84500
+    },
+    {
+      "epoch": 1.5685261390267757,
+      "grad_norm": 2.0212953090667725,
+      "learning_rate": 2.3857897682887072e-05,
+      "loss": 0.1379,
+      "step": 85000
+    },
+    {
+      "epoch": 1.5777527633739918,
+      "grad_norm": 14.00969409942627,
+      "learning_rate": 2.3704120610433467e-05,
+      "loss": 0.135,
+      "step": 85500
+    },
+    {
+      "epoch": 1.5869793877212084,
+      "grad_norm": 2.084036111831665,
+      "learning_rate": 2.3550343537979862e-05,
+      "loss": 0.1406,
+      "step": 86000
+    },
+    {
+      "epoch": 1.5962060120684245,
+      "grad_norm": 1.8672277927398682,
+      "learning_rate": 2.3396566465526258e-05,
+      "loss": 0.131,
+      "step": 86500
+    },
+    {
+      "epoch": 1.605432636415641,
+      "grad_norm": 1.3933255672454834,
+      "learning_rate": 2.324278939307265e-05,
+      "loss": 0.1346,
+      "step": 87000
+    },
+    {
+      "epoch": 1.6146592607628572,
+      "grad_norm": 4.199204921722412,
+      "learning_rate": 2.3089012320619048e-05,
+      "loss": 0.1345,
+      "step": 87500
+    },
+    {
+      "epoch": 1.6238858851100737,
+      "grad_norm": 2.914705276489258,
+      "learning_rate": 2.293523524816544e-05,
+      "loss": 0.1331,
+      "step": 88000
+    },
+    {
+      "epoch": 1.6331125094572898,
+      "grad_norm": 2.8266611099243164,
+      "learning_rate": 2.2781458175711835e-05,
+      "loss": 0.1331,
+      "step": 88500
+    },
+    {
+      "epoch": 1.6423391338045064,
+      "grad_norm": 2.148892402648926,
+      "learning_rate": 2.262768110325823e-05,
+      "loss": 0.1353,
+      "step": 89000
+    },
+    {
+      "epoch": 1.6515657581517225,
+      "grad_norm": 3.0781641006469727,
+      "learning_rate": 2.2473904030804623e-05,
+      "loss": 0.1312,
+      "step": 89500
+    },
+    {
+      "epoch": 1.660792382498939,
+      "grad_norm": 1.3129165172576904,
+      "learning_rate": 2.2320126958351018e-05,
+      "loss": 0.1287,
+      "step": 90000
+    },
+    {
+      "epoch": 1.6700190068461551,
+      "grad_norm": 2.6767327785491943,
+      "learning_rate": 2.2166349885897413e-05,
+      "loss": 0.1307,
+      "step": 90500
+    },
+    {
+      "epoch": 1.6792456311933717,
+      "grad_norm": 2.783486843109131,
+      "learning_rate": 2.201257281344381e-05,
+      "loss": 0.1307,
+      "step": 91000
+    },
+    {
+      "epoch": 1.6884722555405878,
+      "grad_norm": 4.483890056610107,
+      "learning_rate": 2.18587957409902e-05,
+      "loss": 0.1311,
+      "step": 91500
+    },
+    {
+      "epoch": 1.6976988798878043,
+      "grad_norm": 2.766557216644287,
+      "learning_rate": 2.17050186685366e-05,
+      "loss": 0.1327,
+      "step": 92000
+    },
+    {
+      "epoch": 1.7069255042350204,
+      "grad_norm": 3.863123893737793,
+      "learning_rate": 2.155124159608299e-05,
+      "loss": 0.1358,
+      "step": 92500
+    },
+    {
+      "epoch": 1.716152128582237,
+      "grad_norm": 3.8993873596191406,
+      "learning_rate": 2.1397464523629386e-05,
+      "loss": 0.1306,
+      "step": 93000
+    },
+    {
+      "epoch": 1.725378752929453,
+      "grad_norm": 3.616542100906372,
+      "learning_rate": 2.124368745117578e-05,
+      "loss": 0.1306,
+      "step": 93500
+    },
+    {
+      "epoch": 1.7346053772766696,
+      "grad_norm": 2.784503698348999,
+      "learning_rate": 2.1089910378722173e-05,
+      "loss": 0.1316,
+      "step": 94000
+    },
+    {
+      "epoch": 1.7438320016238857,
+      "grad_norm": 2.199709415435791,
+      "learning_rate": 2.093613330626857e-05,
+      "loss": 0.13,
+      "step": 94500
+    },
+    {
+      "epoch": 1.7530586259711023,
+      "grad_norm": 1.9818087816238403,
+      "learning_rate": 2.0782356233814964e-05,
+      "loss": 0.1308,
+      "step": 95000
+    },
+    {
+      "epoch": 1.7622852503183184,
+      "grad_norm": 1.3748022317886353,
+      "learning_rate": 2.062857916136136e-05,
+      "loss": 0.1279,
+      "step": 95500
+    },
+    {
+      "epoch": 1.771511874665535,
+      "grad_norm": 2.4911797046661377,
+      "learning_rate": 2.047480208890775e-05,
+      "loss": 0.1287,
+      "step": 96000
+    },
+    {
+      "epoch": 1.780738499012751,
+      "grad_norm": 2.5785412788391113,
+      "learning_rate": 2.032102501645415e-05,
+      "loss": 0.1293,
+      "step": 96500
+    },
+    {
+      "epoch": 1.7899651233599676,
+      "grad_norm": 3.9389474391937256,
+      "learning_rate": 2.016724794400054e-05,
+      "loss": 0.1276,
+      "step": 97000
+    },
+    {
+      "epoch": 1.7991917477071837,
+      "grad_norm": 3.8254497051239014,
+      "learning_rate": 2.0013470871546937e-05,
+      "loss": 0.1259,
+      "step": 97500
+    },
+    {
+      "epoch": 1.8084183720544003,
+      "grad_norm": 2.5958099365234375,
+      "learning_rate": 1.9859693799093332e-05,
+      "loss": 0.1264,
+      "step": 98000
+    },
+    {
+      "epoch": 1.8176449964016164,
+      "grad_norm": 5.190915107727051,
+      "learning_rate": 1.9705916726639727e-05,
+      "loss": 0.1288,
+      "step": 98500
+    },
+    {
+      "epoch": 1.826871620748833,
+      "grad_norm": 1.9603300094604492,
+      "learning_rate": 1.955213965418612e-05,
+      "loss": 0.1271,
+      "step": 99000
+    },
+    {
+      "epoch": 1.836098245096049,
+      "grad_norm": 2.722358226776123,
+      "learning_rate": 1.9398362581732514e-05,
+      "loss": 0.1278,
+      "step": 99500
+    },
+    {
+      "epoch": 1.8453248694432656,
+      "grad_norm": 1.6586706638336182,
+      "learning_rate": 1.924458550927891e-05,
+      "loss": 0.1248,
+      "step": 100000
+    },
+    {
+      "epoch": 1.8545514937904817,
+      "grad_norm": 2.985854148864746,
+      "learning_rate": 1.90908084368253e-05,
+      "loss": 0.1266,
+      "step": 100500
+    },
+    {
+      "epoch": 1.8637781181376982,
+      "grad_norm": 2.7211902141571045,
+      "learning_rate": 1.89370313643717e-05,
+      "loss": 0.1267,
+      "step": 101000
+    },
+    {
+      "epoch": 1.8730047424849143,
+      "grad_norm": 2.373112678527832,
+      "learning_rate": 1.8783254291918092e-05,
+      "loss": 0.1239,
+      "step": 101500
+    },
+    {
+      "epoch": 1.8822313668321309,
+      "grad_norm": 2.1845340728759766,
+      "learning_rate": 1.862947721946449e-05,
+      "loss": 0.1259,
+      "step": 102000
+    },
+    {
+      "epoch": 1.891457991179347,
+      "grad_norm": 2.6702089309692383,
+      "learning_rate": 1.8475700147010882e-05,
+      "loss": 0.1267,
+      "step": 102500
+    },
+    {
+      "epoch": 1.9006846155265635,
+      "grad_norm": 1.2957886457443237,
+      "learning_rate": 1.8321923074557278e-05,
+      "loss": 0.1231,
+      "step": 103000
+    },
+    {
+      "epoch": 1.9099112398737796,
+      "grad_norm": 2.2960615158081055,
+      "learning_rate": 1.8168146002103673e-05,
+      "loss": 0.1242,
+      "step": 103500
+    },
+    {
+      "epoch": 1.9191378642209962,
+      "grad_norm": 1.4060367345809937,
+      "learning_rate": 1.8014368929650065e-05,
+      "loss": 0.1217,
+      "step": 104000
+    },
+    {
+      "epoch": 1.9283644885682123,
+      "grad_norm": 1.8247722387313843,
+      "learning_rate": 1.786059185719646e-05,
+      "loss": 0.1247,
+      "step": 104500
+    },
+    {
+      "epoch": 1.9375911129154288,
+      "grad_norm": 4.583653450012207,
+      "learning_rate": 1.7706814784742855e-05,
+      "loss": 0.1224,
+      "step": 105000
+    },
+    {
+      "epoch": 1.946817737262645,
+      "grad_norm": 1.7650556564331055,
+      "learning_rate": 1.755303771228925e-05,
+      "loss": 0.1235,
+      "step": 105500
+    },
+    {
+      "epoch": 1.9560443616098615,
+      "grad_norm": 2.088684320449829,
+      "learning_rate": 1.7399260639835643e-05,
+      "loss": 0.1203,
+      "step": 106000
+    },
+    {
+      "epoch": 1.9652709859570776,
+      "grad_norm": 2.448063850402832,
+      "learning_rate": 1.724548356738204e-05,
+      "loss": 0.1209,
+      "step": 106500
+    },
+    {
+      "epoch": 1.9744976103042942,
+      "grad_norm": 4.1177778244018555,
+      "learning_rate": 1.7091706494928433e-05,
+      "loss": 0.1188,
+      "step": 107000
+    },
+    {
+      "epoch": 1.9837242346515103,
+      "grad_norm": 4.088508129119873,
+      "learning_rate": 1.693792942247483e-05,
+      "loss": 0.1206,
+      "step": 107500
+    },
+    {
+      "epoch": 1.9929508589987268,
+      "grad_norm": 2.3093175888061523,
+      "learning_rate": 1.6784152350021224e-05,
+      "loss": 0.1186,
+      "step": 108000
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.2138589769601822,
+      "eval_mse": 0.21385898989204177,
+      "eval_runtime": 125.4291,
+      "eval_samples_per_second": 1536.134,
+      "eval_steps_per_second": 192.021,
+      "step": 108382
+    },
+    {
+      "epoch": 2.002177483345943,
+      "grad_norm": 2.2715179920196533,
+      "learning_rate": 1.6630375277567615e-05,
+      "loss": 0.114,
+      "step": 108500
+    },
+    {
+      "epoch": 2.0114041076931595,
+      "grad_norm": 2.236180543899536,
+      "learning_rate": 1.647659820511401e-05,
+      "loss": 0.0994,
+      "step": 109000
+    },
+    {
+      "epoch": 2.0206307320403756,
+      "grad_norm": 2.335440158843994,
+      "learning_rate": 1.6322821132660406e-05,
+      "loss": 0.0975,
+      "step": 109500
+    },
+    {
+      "epoch": 2.029857356387592,
+      "grad_norm": 4.5400519371032715,
+      "learning_rate": 1.61690440602068e-05,
+      "loss": 0.0972,
+      "step": 110000
+    },
+    {
+      "epoch": 2.0390839807348082,
+      "grad_norm": 2.633301258087158,
+      "learning_rate": 1.6015266987753193e-05,
+      "loss": 0.0982,
+      "step": 110500
+    },
+    {
+      "epoch": 2.048310605082025,
+      "grad_norm": 1.150661826133728,
+      "learning_rate": 1.5861489915299592e-05,
+      "loss": 0.0999,
+      "step": 111000
+    },
+    {
+      "epoch": 2.057537229429241,
+      "grad_norm": 1.9149357080459595,
+      "learning_rate": 1.5707712842845984e-05,
+      "loss": 0.0979,
+      "step": 111500
+    },
+    {
+      "epoch": 2.0667638537764574,
+      "grad_norm": 1.996846079826355,
+      "learning_rate": 1.555393577039238e-05,
+      "loss": 0.0996,
+      "step": 112000
+    },
+    {
+      "epoch": 2.0759904781236735,
+      "grad_norm": 1.5708836317062378,
+      "learning_rate": 1.5400158697938774e-05,
+      "loss": 0.0969,
+      "step": 112500
+    },
+    {
+      "epoch": 2.08521710247089,
+      "grad_norm": 1.5404409170150757,
+      "learning_rate": 1.5246381625485168e-05,
+      "loss": 0.0963,
+      "step": 113000
+    },
+    {
+      "epoch": 2.094443726818106,
+      "grad_norm": 1.6409614086151123,
+      "learning_rate": 1.5092604553031561e-05,
+      "loss": 0.0977,
+      "step": 113500
+    },
+    {
+      "epoch": 2.1036703511653227,
+      "grad_norm": 1.7960460186004639,
+      "learning_rate": 1.4938827480577958e-05,
+      "loss": 0.0964,
+      "step": 114000
+    },
+    {
+      "epoch": 2.112896975512539,
+      "grad_norm": 1.685120701789856,
+      "learning_rate": 1.4785050408124352e-05,
+      "loss": 0.0989,
+      "step": 114500
+    },
+    {
+      "epoch": 2.1221235998597554,
+      "grad_norm": 3.500861644744873,
+      "learning_rate": 1.4631273335670745e-05,
+      "loss": 0.0943,
+      "step": 115000
+    },
+    {
+      "epoch": 2.1313502242069715,
+      "grad_norm": 2.3654606342315674,
+      "learning_rate": 1.447749626321714e-05,
+      "loss": 0.0963,
+      "step": 115500
+    },
+    {
+      "epoch": 2.140576848554188,
+      "grad_norm": 3.000051975250244,
+      "learning_rate": 1.4323719190763534e-05,
+      "loss": 0.0922,
+      "step": 116000
+    },
+    {
+      "epoch": 2.149803472901404,
+      "grad_norm": 2.384732961654663,
+      "learning_rate": 1.4169942118309928e-05,
+      "loss": 0.0971,
+      "step": 116500
+    },
+    {
+      "epoch": 2.1590300972486207,
+      "grad_norm": 1.3965630531311035,
+      "learning_rate": 1.4016165045856325e-05,
+      "loss": 0.0971,
+      "step": 117000
+    },
+    {
+      "epoch": 2.168256721595837,
+      "grad_norm": 1.745569109916687,
+      "learning_rate": 1.3862387973402718e-05,
+      "loss": 0.0977,
+      "step": 117500
+    },
+    {
+      "epoch": 2.1774833459430534,
+      "grad_norm": 2.326707363128662,
+      "learning_rate": 1.3708610900949112e-05,
+      "loss": 0.0937,
+      "step": 118000
+    },
+    {
+      "epoch": 2.1867099702902695,
+      "grad_norm": 1.6542750597000122,
+      "learning_rate": 1.3554833828495509e-05,
+      "loss": 0.0943,
+      "step": 118500
+    },
+    {
+      "epoch": 2.195936594637486,
+      "grad_norm": 1.1322625875473022,
+      "learning_rate": 1.3401056756041902e-05,
+      "loss": 0.0937,
+      "step": 119000
+    },
+    {
+      "epoch": 2.205163218984702,
+      "grad_norm": 1.815834641456604,
+      "learning_rate": 1.3247279683588296e-05,
+      "loss": 0.0938,
+      "step": 119500
+    },
+    {
+      "epoch": 2.2143898433319187,
+      "grad_norm": 4.64595890045166,
+      "learning_rate": 1.3093502611134691e-05,
+      "loss": 0.0939,
+      "step": 120000
+    },
+    {
+      "epoch": 2.223616467679135,
+      "grad_norm": 2.1671462059020996,
+      "learning_rate": 1.2939725538681085e-05,
+      "loss": 0.093,
+      "step": 120500
+    },
+    {
+      "epoch": 2.2328430920263513,
+      "grad_norm": 1.636570692062378,
+      "learning_rate": 1.2785948466227478e-05,
+      "loss": 0.0928,
+      "step": 121000
+    },
+    {
+      "epoch": 2.2420697163735674,
+      "grad_norm": 3.4394800662994385,
+      "learning_rate": 1.2632171393773875e-05,
+      "loss": 0.0936,
+      "step": 121500
+    },
+    {
+      "epoch": 2.251296340720784,
+      "grad_norm": 2.013307571411133,
+      "learning_rate": 1.2478394321320269e-05,
+      "loss": 0.0954,
+      "step": 122000
+    },
+    {
+      "epoch": 2.260522965068,
+      "grad_norm": 3.2544264793395996,
+      "learning_rate": 1.2324617248866664e-05,
+      "loss": 0.0987,
+      "step": 122500
+    },
+    {
+      "epoch": 2.2697495894152167,
+      "grad_norm": 2.9892079830169678,
+      "learning_rate": 1.2170840176413058e-05,
+      "loss": 0.0931,
+      "step": 123000
+    },
+    {
+      "epoch": 2.2789762137624328,
+      "grad_norm": 3.113938331604004,
+      "learning_rate": 1.2017063103959453e-05,
+      "loss": 0.0945,
+      "step": 123500
+    },
+    {
+      "epoch": 2.2882028381096493,
+      "grad_norm": 1.7884827852249146,
+      "learning_rate": 1.1863286031505848e-05,
+      "loss": 0.0935,
+      "step": 124000
+    },
+    {
+      "epoch": 2.2974294624568654,
+      "grad_norm": 2.059272527694702,
+      "learning_rate": 1.1709508959052242e-05,
+      "loss": 0.0962,
+      "step": 124500
+    },
+    {
+      "epoch": 2.306656086804082,
+      "grad_norm": 1.7323048114776611,
+      "learning_rate": 1.1555731886598637e-05,
+      "loss": 0.0928,
+      "step": 125000
+    },
+    {
+      "epoch": 2.315882711151298,
+      "grad_norm": 1.6812376976013184,
+      "learning_rate": 1.140195481414503e-05,
+      "loss": 0.0918,
+      "step": 125500
+    },
+    {
+      "epoch": 2.3251093354985146,
+      "grad_norm": 1.550013780593872,
+      "learning_rate": 1.1248177741691424e-05,
+      "loss": 0.0944,
+      "step": 126000
+    },
+    {
+      "epoch": 2.3343359598457307,
+      "grad_norm": 2.913409948348999,
+      "learning_rate": 1.109440066923782e-05,
+      "loss": 0.0957,
+      "step": 126500
+    },
+    {
+      "epoch": 2.3435625841929473,
+      "grad_norm": 1.515856146812439,
+      "learning_rate": 1.0940623596784215e-05,
+      "loss": 0.0929,
+      "step": 127000
+    },
+    {
+      "epoch": 2.3527892085401634,
+      "grad_norm": 1.571866512298584,
+      "learning_rate": 1.0786846524330608e-05,
+      "loss": 0.0925,
+      "step": 127500
+    },
+    {
+      "epoch": 2.36201583288738,
+      "grad_norm": 2.379932403564453,
+      "learning_rate": 1.0633069451877004e-05,
+      "loss": 0.0927,
+      "step": 128000
+    },
+    {
+      "epoch": 2.371242457234596,
+      "grad_norm": 3.373950958251953,
+      "learning_rate": 1.0479292379423399e-05,
+      "loss": 0.0908,
+      "step": 128500
+    },
+    {
+      "epoch": 2.3804690815818126,
+      "grad_norm": 2.3678219318389893,
+      "learning_rate": 1.0325515306969792e-05,
+      "loss": 0.0898,
+      "step": 129000
+    },
+    {
+      "epoch": 2.3896957059290287,
+      "grad_norm": 2.636244058609009,
+      "learning_rate": 1.0171738234516188e-05,
+      "loss": 0.0892,
+      "step": 129500
+    },
+    {
+      "epoch": 2.3989223302762452,
+      "grad_norm": 2.6495730876922607,
+      "learning_rate": 1.0017961162062581e-05,
+      "loss": 0.0886,
+      "step": 130000
+    },
+    {
+      "epoch": 2.4081489546234613,
+      "grad_norm": 2.9955618381500244,
+      "learning_rate": 9.864184089608975e-06,
+      "loss": 0.0897,
+      "step": 130500
+    },
+    {
+      "epoch": 2.417375578970678,
+      "grad_norm": 3.0076186656951904,
+      "learning_rate": 9.71040701715537e-06,
+      "loss": 0.0895,
+      "step": 131000
+    },
+    {
+      "epoch": 2.426602203317894,
+      "grad_norm": 2.0592894554138184,
+      "learning_rate": 9.556629944701765e-06,
+      "loss": 0.0905,
+      "step": 131500
+    },
+    {
+      "epoch": 2.4358288276651106,
+      "grad_norm": 1.5429611206054688,
+      "learning_rate": 9.402852872248159e-06,
+      "loss": 0.0937,
+      "step": 132000
+    },
+    {
+      "epoch": 2.4450554520123267,
+      "grad_norm": 2.048470973968506,
+      "learning_rate": 9.249075799794554e-06,
+      "loss": 0.0903,
+      "step": 132500
+    },
+    {
+      "epoch": 2.454282076359543,
+      "grad_norm": 1.8051766157150269,
+      "learning_rate": 9.09529872734095e-06,
+      "loss": 0.091,
+      "step": 133000
+    },
+    {
+      "epoch": 2.4635087007067593,
+      "grad_norm": 1.5680794715881348,
+      "learning_rate": 8.941521654887343e-06,
+      "loss": 0.0892,
+      "step": 133500
+    },
+    {
+      "epoch": 2.472735325053976,
+      "grad_norm": 1.979874610900879,
+      "learning_rate": 8.787744582433738e-06,
+      "loss": 0.0877,
+      "step": 134000
+    },
+    {
+      "epoch": 2.481961949401192,
+      "grad_norm": 2.7211787700653076,
+      "learning_rate": 8.633967509980134e-06,
+      "loss": 0.0925,
+      "step": 134500
+    },
+    {
+      "epoch": 2.4911885737484085,
+      "grad_norm": 1.0742968320846558,
+      "learning_rate": 8.480190437526527e-06,
+      "loss": 0.0881,
+      "step": 135000
+    },
+    {
+      "epoch": 2.5004151980956246,
+      "grad_norm": 2.0518765449523926,
+      "learning_rate": 8.32641336507292e-06,
+      "loss": 0.0943,
+      "step": 135500
+    },
+    {
+      "epoch": 2.509641822442841,
+      "grad_norm": 1.9672821760177612,
+      "learning_rate": 8.172636292619316e-06,
+      "loss": 0.0898,
+      "step": 136000
+    },
+    {
+      "epoch": 2.5188684467900573,
+      "grad_norm": 1.2716307640075684,
+      "learning_rate": 8.01885922016571e-06,
+      "loss": 0.0875,
+      "step": 136500
+    },
+    {
+      "epoch": 2.528095071137274,
+      "grad_norm": 2.617617607116699,
+      "learning_rate": 7.865082147712105e-06,
+      "loss": 0.0889,
+      "step": 137000
+    },
+    {
+      "epoch": 2.53732169548449,
+      "grad_norm": 0.8945909738540649,
+      "learning_rate": 7.7113050752585e-06,
+      "loss": 0.0909,
+      "step": 137500
+    },
+    {
+      "epoch": 2.5465483198317065,
+      "grad_norm": 1.661537766456604,
+      "learning_rate": 7.557528002804894e-06,
+      "loss": 0.0878,
+      "step": 138000
+    },
+    {
+      "epoch": 2.5557749441789226,
+      "grad_norm": 3.6078097820281982,
+      "learning_rate": 7.403750930351289e-06,
+      "loss": 0.0903,
+      "step": 138500
+    },
+    {
+      "epoch": 2.565001568526139,
+      "grad_norm": 1.483906626701355,
+      "learning_rate": 7.249973857897683e-06,
+      "loss": 0.0862,
+      "step": 139000
+    },
+    {
+      "epoch": 2.5742281928733552,
+      "grad_norm": 1.867789626121521,
+      "learning_rate": 7.096196785444077e-06,
+      "loss": 0.0891,
+      "step": 139500
+    },
+    {
+      "epoch": 2.583454817220572,
+      "grad_norm": 2.8336305618286133,
+      "learning_rate": 6.942419712990472e-06,
+      "loss": 0.0901,
+      "step": 140000
+    },
+    {
+      "epoch": 2.592681441567788,
+      "grad_norm": 1.5188074111938477,
+      "learning_rate": 6.7886426405368675e-06,
+      "loss": 0.0903,
+      "step": 140500
+    },
+    {
+      "epoch": 2.6019080659150045,
+      "grad_norm": 2.809237480163574,
+      "learning_rate": 6.634865568083261e-06,
+      "loss": 0.0892,
+      "step": 141000
+    },
+    {
+      "epoch": 2.6111346902622206,
+      "grad_norm": 1.773245096206665,
+      "learning_rate": 6.4810884956296555e-06,
+      "loss": 0.0909,
+      "step": 141500
+    },
+    {
+      "epoch": 2.620361314609437,
+      "grad_norm": 1.85002863407135,
+      "learning_rate": 6.327311423176051e-06,
+      "loss": 0.092,
+      "step": 142000
+    },
+    {
+      "epoch": 2.629587938956653,
+      "grad_norm": 0.9777950048446655,
+      "learning_rate": 6.173534350722445e-06,
+      "loss": 0.0888,
+      "step": 142500
+    },
+    {
+      "epoch": 2.6388145633038698,
+      "grad_norm": 2.261619806289673,
+      "learning_rate": 6.0197572782688396e-06,
+      "loss": 0.0879,
+      "step": 143000
+    },
+    {
+      "epoch": 2.648041187651086,
+      "grad_norm": 2.093942642211914,
+      "learning_rate": 5.865980205815234e-06,
+      "loss": 0.0866,
+      "step": 143500
+    },
+    {
+      "epoch": 2.6572678119983024,
+      "grad_norm": 3.01939058303833,
+      "learning_rate": 5.712203133361628e-06,
+      "loss": 0.0882,
+      "step": 144000
+    },
+    {
+      "epoch": 2.6664944363455185,
+      "grad_norm": 2.6572530269622803,
+      "learning_rate": 5.558426060908023e-06,
+      "loss": 0.0889,
+      "step": 144500
+    },
+    {
+      "epoch": 2.675721060692735,
+      "grad_norm": 1.3037127256393433,
+      "learning_rate": 5.404648988454418e-06,
+      "loss": 0.0907,
+      "step": 145000
+    },
+    {
+      "epoch": 2.684947685039951,
+      "grad_norm": 1.2352185249328613,
+      "learning_rate": 5.2508719160008125e-06,
+      "loss": 0.0888,
+      "step": 145500
+    },
+    {
+      "epoch": 2.6941743093871677,
+      "grad_norm": 1.6539493799209595,
+      "learning_rate": 5.097094843547207e-06,
+      "loss": 0.0886,
+      "step": 146000
+    },
+    {
+      "epoch": 2.703400933734384,
+      "grad_norm": 1.900009036064148,
+      "learning_rate": 4.943317771093601e-06,
+      "loss": 0.0896,
+      "step": 146500
+    },
+    {
+      "epoch": 2.7126275580816004,
+      "grad_norm": 1.3108474016189575,
+      "learning_rate": 4.789540698639996e-06,
+      "loss": 0.0874,
+      "step": 147000
+    },
+    {
+      "epoch": 2.7218541824288165,
+      "grad_norm": 0.9704590439796448,
+      "learning_rate": 4.63576362618639e-06,
+      "loss": 0.087,
+      "step": 147500
+    },
+    {
+      "epoch": 2.731080806776033,
+      "grad_norm": 1.0830601453781128,
+      "learning_rate": 4.481986553732785e-06,
+      "loss": 0.0881,
+      "step": 148000
+    },
+    {
+      "epoch": 2.740307431123249,
+      "grad_norm": 1.5252071619033813,
+      "learning_rate": 4.32820948127918e-06,
+      "loss": 0.0892,
+      "step": 148500
+    },
+    {
+      "epoch": 2.7495340554704657,
+      "grad_norm": 1.7691118717193604,
+      "learning_rate": 4.174432408825573e-06,
+      "loss": 0.0859,
+      "step": 149000
+    },
+    {
+      "epoch": 2.758760679817682,
+      "grad_norm": 7.3577494621276855,
+      "learning_rate": 4.020655336371969e-06,
+      "loss": 0.0856,
+      "step": 149500
+    },
+    {
+      "epoch": 2.7679873041648984,
+      "grad_norm": 1.1883918046951294,
+      "learning_rate": 3.866878263918363e-06,
+      "loss": 0.086,
+      "step": 150000
+    },
+    {
+      "epoch": 2.7772139285121145,
+      "grad_norm": 2.320882797241211,
+      "learning_rate": 3.7131011914647575e-06,
+      "loss": 0.0868,
+      "step": 150500
+    },
+    {
+      "epoch": 2.786440552859331,
+      "grad_norm": 2.119135618209839,
+      "learning_rate": 3.5593241190111523e-06,
+      "loss": 0.0837,
+      "step": 151000
+    },
+    {
+      "epoch": 2.795667177206547,
+      "grad_norm": 2.0826363563537598,
+      "learning_rate": 3.4055470465575468e-06,
+      "loss": 0.0871,
+      "step": 151500
+    },
+    {
+      "epoch": 2.8048938015537637,
+      "grad_norm": 1.4801201820373535,
+      "learning_rate": 3.2517699741039408e-06,
+      "loss": 0.0855,
+      "step": 152000
+    },
+    {
+      "epoch": 2.8141204259009798,
+      "grad_norm": 3.352520227432251,
+      "learning_rate": 3.0979929016503356e-06,
+      "loss": 0.0907,
+      "step": 152500
+    },
+    {
+      "epoch": 2.8233470502481963,
+      "grad_norm": 3.1500301361083984,
+      "learning_rate": 2.94421582919673e-06,
+      "loss": 0.0858,
+      "step": 153000
+    },
+    {
+      "epoch": 2.8325736745954124,
+      "grad_norm": 1.9149506092071533,
+      "learning_rate": 2.790438756743125e-06,
+      "loss": 0.0856,
+      "step": 153500
+    },
+    {
+      "epoch": 2.841800298942629,
+      "grad_norm": 2.150416612625122,
+      "learning_rate": 2.6366616842895193e-06,
+      "loss": 0.0849,
+      "step": 154000
+    },
+    {
+      "epoch": 2.851026923289845,
+      "grad_norm": 1.613443374633789,
+      "learning_rate": 2.4828846118359137e-06,
+      "loss": 0.084,
+      "step": 154500
+    },
+    {
+      "epoch": 2.8602535476370616,
+      "grad_norm": 4.109127998352051,
+      "learning_rate": 2.3291075393823085e-06,
+      "loss": 0.0859,
+      "step": 155000
+    },
+    {
+      "epoch": 2.8694801719842777,
+      "grad_norm": 2.9541776180267334,
+      "learning_rate": 2.175330466928703e-06,
+      "loss": 0.0869,
+      "step": 155500
+    },
+    {
+      "epoch": 2.8787067963314943,
+      "grad_norm": 2.9944493770599365,
+      "learning_rate": 2.0215533944750974e-06,
+      "loss": 0.0861,
+      "step": 156000
+    },
+    {
+      "epoch": 2.8879334206787104,
+      "grad_norm": 2.072777271270752,
+      "learning_rate": 1.867776322021492e-06,
+      "loss": 0.084,
+      "step": 156500
+    },
+    {
+      "epoch": 2.897160045025927,
+      "grad_norm": 2.4962828159332275,
+      "learning_rate": 1.7139992495678866e-06,
+      "loss": 0.0826,
+      "step": 157000
+    },
+    {
+      "epoch": 2.906386669373143,
+      "grad_norm": 1.9871286153793335,
+      "learning_rate": 1.560222177114281e-06,
+      "loss": 0.086,
+      "step": 157500
+    },
+    {
+      "epoch": 2.9156132937203596,
+      "grad_norm": 1.9906572103500366,
+      "learning_rate": 1.4064451046606757e-06,
+      "loss": 0.0838,
+      "step": 158000
+    },
+    {
+      "epoch": 2.9248399180675757,
+      "grad_norm": 2.1322317123413086,
+      "learning_rate": 1.25266803220707e-06,
+      "loss": 0.0844,
+      "step": 158500
+    },
+    {
+      "epoch": 2.9340665424147923,
+      "grad_norm": 2.26415753364563,
+      "learning_rate": 1.0988909597534647e-06,
+      "loss": 0.0839,
+      "step": 159000
+    },
+    {
+      "epoch": 2.9432931667620084,
+      "grad_norm": 1.9201428890228271,
+      "learning_rate": 9.451138872998592e-07,
+      "loss": 0.0861,
+      "step": 159500
+    },
+    {
+      "epoch": 2.952519791109225,
+      "grad_norm": 1.4225293397903442,
+      "learning_rate": 7.913368148462537e-07,
+      "loss": 0.0867,
+      "step": 160000
+    },
+    {
+      "epoch": 2.961746415456441,
+      "grad_norm": 4.265661239624023,
+      "learning_rate": 6.375597423926483e-07,
+      "loss": 0.0864,
+      "step": 160500
+    },
+    {
+      "epoch": 2.9709730398036576,
+      "grad_norm": 3.4527368545532227,
+      "learning_rate": 4.837826699390428e-07,
+      "loss": 0.0853,
+      "step": 161000
+    },
+    {
+      "epoch": 2.9801996641508737,
+      "grad_norm": 1.618511438369751,
+      "learning_rate": 3.300055974854373e-07,
+      "loss": 0.0862,
+      "step": 161500
+    },
+    {
+      "epoch": 2.9894262884980902,
+      "grad_norm": 1.6494286060333252,
+      "learning_rate": 1.7622852503183185e-07,
+      "loss": 0.0827,
+      "step": 162000
+    },
+    {
+      "epoch": 2.9986529128453063,
+      "grad_norm": 1.5462067127227783,
+      "learning_rate": 2.24514525782264e-08,
+      "loss": 0.0861,
+      "step": 162500
+    },
     {
       "epoch": 3.0,
+      "eval_loss": 0.20824576914310455,
+      "eval_mse": 0.20824573578672098,
+      "eval_runtime": 124.5011,
+      "eval_samples_per_second": 1547.584,
+      "eval_steps_per_second": 193.452,
+      "step": 162573
     },
     {
       "epoch": 3.0,
+      "step": 162573,
+      "total_flos": 8.56700972907817e+16,
+      "train_loss": 0.16141534491764947,
+      "train_runtime": 8977.4486,
+      "train_samples_per_second": 579.48,
+      "train_steps_per_second": 18.109
     }
   ],
   "logging_steps": 500,
+  "max_steps": 162573,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 500,
       "attributes": {}
     }
   },
+  "total_flos": 8.56700972907817e+16,
+  "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null
 }

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3ebe7b7b05490c20b53fbc39ffdeec0c1f0c552cf79f29de5652b1d14465d395
 size 5368

 version https://git-lfs.github.com/spec/v1
+oid sha256:2ab0472f329dda31a741344576f9001dc9064737abfb75a6baa4c9a1bdeb39ed
 size 5368