maesneako committed
Commit ff69ed6
Parent: f576a03

Training in progress, step 4000

last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:047d719f73cd36ac9f58a2203fa5e0c98b8b1d4370508542a1a633fb075d3753
+ oid sha256:c18ec9ae19fb67dde669cb881dc949405131436f9cc1761e33a79f52d52e874a
 size 995604017
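These checkpoint binaries are tracked with Git LFS, so the diffs above and below only touch the pointer files: the `oid sha256:` line changes while the `size` stays the same, because the new optimizer state has the same byte length as the old one. As a minimal sketch, assuming a local working copy of the file, the pointer fields could be recomputed like this (the path is just the repo-relative name used above):

```python
import hashlib
import os

def lfs_pointer(path: str) -> str:
    """Recompute the Git LFS pointer fields (oid + size) for a local file."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)
    return (
        "version https://git-lfs.github.com/spec/v1\n"
        f"oid sha256:{h.hexdigest()}\n"
        f"size {os.path.getsize(path)}\n"
    )

# Should print the new pointer shown above when run against the updated file.
print(lfs_pointer("last-checkpoint/optimizer.pt"))
```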
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:9d90bbdc010d82ccf4c49b054ebb3c8c9600f1f85cdd8548e8b1cf667068e70a
+ oid sha256:f42fe0c2ec41d656453cec73abd968b21f89daa58209662a04075026845f4c5d
 size 510396521
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:a7d28830c94d92b6d19326a0184936f2862087877e8af1a427622e66aefb96fa
+ oid sha256:430f1248620a7a9f128f21dc3d873f638e60a994e72fbe17888d55a4b3b61863
 size 14503
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:360bfe5d7b0b6d589270f3932ba49712e88c4c1d12dc82eb69c2d144a105fa45
+ oid sha256:b94329f00f4cdc9834d8c689e9e9178c26b5c51fa127b64c75940601b3f9f205
 size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
 {
-   "best_metric": 4.06580114364624,
-   "best_model_checkpoint": "./ES_corlec/checkpoint-2000",
-   "epoch": 0.3965107057890563,
-   "global_step": 2000,
+   "best_metric": 4.000585079193115,
+   "best_model_checkpoint": "./ES_corlec/checkpoint-4000",
+   "epoch": 0.7930214115781126,
+   "global_step": 4000,
    "is_hyper_param_search": false,
    "is_local_process_zero": true,
    "is_world_process_zero": true,
@@ -38,11 +38,43 @@
      "eval_samples_per_second": 42.075,
      "eval_steps_per_second": 2.631,
      "step": 2000
+    },
+    {
+      "epoch": 0.5,
+      "learning_rate": 9.344878660134443e-07,
+      "loss": 4.0586,
+      "step": 2500
+    },
+    {
+      "epoch": 0.59,
+      "learning_rate": 9.202460977554973e-07,
+      "loss": 4.022,
+      "step": 3000
+    },
+    {
+      "epoch": 0.69,
+      "learning_rate": 9.060043294975503e-07,
+      "loss": 4.0038,
+      "step": 3500
+    },
+    {
+      "epoch": 0.79,
+      "learning_rate": 8.917625612396035e-07,
+      "loss": 3.9852,
+      "step": 4000
+    },
+    {
+      "epoch": 0.79,
+      "eval_loss": 4.000585079193115,
+      "eval_runtime": 962.3175,
+      "eval_samples_per_second": 42.066,
+      "eval_steps_per_second": 2.63,
+      "step": 4000
     }
   ],
   "max_steps": 35308,
   "num_train_epochs": 7,
-  "total_flos": 2449612800000000.0,
+  "total_flos": 4899225600000000.0,
   "trial_name": null,
   "trial_params": null
 }
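The new trainer_state.json values are internally consistent, which is worth a quick check: with max_steps 35308 over 7 epochs there are 5044 optimizer steps per epoch, so step 4000 lands at epoch ≈ 0.793; total_flos at step 4000 is exactly twice the value recorded at step 2000; and the logged learning rates drop by a constant amount per step, i.e. a linear decay. A small throwaway script (not part of the repo) that verifies this:

```python
# Consistency check of the values shown in trainer_state.json above.
max_steps, num_epochs, global_step = 35308, 7, 4000

steps_per_epoch = max_steps / num_epochs           # 5044.0
print(global_step / steps_per_epoch)               # 0.7930214... matches "epoch"

# total_flos grows linearly with steps: twice the value recorded at step 2000.
print(2 * 2449612800000000.0 == 4899225600000000.0)  # True

# Logged learning rates fall by a constant amount per step -> linear decay.
lrs = {2500: 9.344878660134443e-07, 3000: 9.202460977554973e-07,
       3500: 9.060043294975503e-07, 4000: 8.917625612396035e-07}
per_step = (lrs[2500] - lrs[4000]) / 1500
print(per_step)  # ~2.85e-11, i.e. a ~1e-6 peak LR spread over ~35k steps
```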
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:9d90bbdc010d82ccf4c49b054ebb3c8c9600f1f85cdd8548e8b1cf667068e70a
+ oid sha256:f42fe0c2ec41d656453cec73abd968b21f89daa58209662a04075026845f4c5d
 size 510396521
stderr.slurm CHANGED
The diff for this file is too large to render. See raw diff.
stdout.slurm CHANGED
@@ -2,3 +2,8 @@
 {'loss': 4.1911, 'learning_rate': 9.77213170787285e-07, 'epoch': 0.2}
 {'loss': 4.1368, 'learning_rate': 9.62971402529338e-07, 'epoch': 0.3}
 {'loss': 4.0957, 'learning_rate': 9.487296342713911e-07, 'epoch': 0.4}
+ {'eval_loss': 4.06580114364624, 'eval_runtime': 962.1135, 'eval_samples_per_second': 42.075, 'eval_steps_per_second': 2.631, 'epoch': 0.4}
+ {'loss': 4.0586, 'learning_rate': 9.344878660134443e-07, 'epoch': 0.5}
+ {'loss': 4.022, 'learning_rate': 9.202460977554973e-07, 'epoch': 0.59}
+ {'loss': 4.0038, 'learning_rate': 9.060043294975503e-07, 'epoch': 0.69}
+ {'loss': 3.9852, 'learning_rate': 8.917625612396035e-07, 'epoch': 0.79}
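For orientation only: the stdout cadence above (a loss line every 500 steps, an eval line every 2000 steps, checkpoints at ./ES_corlec/checkpoint-2000 and checkpoint-4000) is what a Hugging Face Trainer produces with settings along the following lines. None of these arguments are recorded in this commit, so every value below is an inferred assumption, not the run's actual configuration.

```python
from transformers import TrainingArguments

# Inferred from the logs above -- an assumption, not the run's actual config.
args = TrainingArguments(
    output_dir="./ES_corlec",        # matches best_model_checkpoint path
    num_train_epochs=7,              # matches num_train_epochs in trainer_state.json
    learning_rate=1e-6,              # peak LR implied by the linear decay of logged LRs
    warmup_steps=200,                # rough fit to the decay slope; unconfirmed
    logging_steps=500,               # loss lines every 500 steps
    evaluation_strategy="steps",
    eval_steps=2000,                 # eval_loss lines at steps 2000 and 4000
    save_steps=2000,                 # checkpoint-2000, checkpoint-4000
    load_best_model_at_end=True,     # best_metric / best_model_checkpoint are tracked
)
```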