pere committed on
Commit
ebb3343
1 Parent(s): e565538

updating before training

Browse files
Files changed (23) hide show
  1. rotobart_output/events.out.tfevents.1631348281.t1v-n-6f5efcd5-w-0.36723.0.v2 → events.out.tfevents.1631647502.t1v-n-6f5efcd5-w-0.361354.0.v2 +1 -1
  2. rotobart_output/events.out.tfevents.1631374032.t1v-n-6f5efcd5-w-0.59864.0.v2 +0 -3
  3. rotobart_output/events.out.tfevents.1631470009.t1v-n-6f5efcd5-w-0.149719.0.v2 +0 -3
  4. rotobart_output/events.out.tfevents.1631470963.t1v-n-6f5efcd5-w-0.152003.0.v2 +0 -3
  5. rotobart_output/events.out.tfevents.1631473889.t1v-n-6f5efcd5-w-0.155772.0.v2 +0 -3
  6. rotobart_output/events.out.tfevents.1631505260.t1v-n-6f5efcd5-w-0.183749.0.v2 +0 -3
  7. rotobart_output/events.out.tfevents.1631519483.t1v-n-6f5efcd5-w-0.197621.0.v2 +0 -3
  8. rotobart_output/events.out.tfevents.1631535266.t1v-n-6f5efcd5-w-0.215021.0.v2 +0 -3
  9. rotobart_output/events.out.tfevents.1631541744.t1v-n-6f5efcd5-w-0.221830.0.v2 +0 -3
  10. rotobart_output/events.out.tfevents.1631542427.t1v-n-6f5efcd5-w-0.223857.0.v2 +0 -3
  11. rotobart_output/events.out.tfevents.1631544225.t1v-n-6f5efcd5-w-0.226802.0.v2 +0 -3
  12. rotobart_output/events.out.tfevents.1631545819.t1v-n-6f5efcd5-w-0.229738.0.v2 +0 -3
  13. rotobart_output/events.out.tfevents.1631547648.t1v-n-6f5efcd5-w-0.233322.0.v2 +0 -3
  14. rotobart_output/events.out.tfevents.1631548699.t1v-n-6f5efcd5-w-0.235778.0.v2 +0 -3
  15. rotobart_output/events.out.tfevents.1631550514.t1v-n-6f5efcd5-w-0.238915.0.v2 +0 -3
  16. rotobart_output/events.out.tfevents.1631563532.t1v-n-6f5efcd5-w-0.253179.0.v2 +0 -3
  17. rotobart_output/events.out.tfevents.1631564788.t1v-n-6f5efcd5-w-0.255766.0.v2 +0 -3
  18. rotobart_output/events.out.tfevents.1631565650.t1v-n-6f5efcd5-w-0.257966.0.v2 +0 -3
  19. rotobart_output/events.out.tfevents.1631566798.t1v-n-6f5efcd5-w-0.260486.0.v2 +0 -3
  20. run_dnlm_flax.py +7 -4
  21. train.sh +2 -2
  22. train_ncc.sh +7 -6
  23. trainbak_ncc.sh +22 -0
rotobart_output/events.out.tfevents.1631348281.t1v-n-6f5efcd5-w-0.36723.0.v2 → events.out.tfevents.1631647502.t1v-n-6f5efcd5-w-0.361354.0.v2 RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96b8400771e3cbb61f4021e90a614e7af1bb5cd08e8f706bed4150bf954774d6
3
  size 40
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6184654b658f61b6d43b9278af1fb1af08f7ca2565de7c7089fbc99360a7917c
3
  size 40
rotobart_output/events.out.tfevents.1631374032.t1v-n-6f5efcd5-w-0.59864.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7313ab3ee173c965aaa6efc6f25ae5f47d5bcb9a7c204b36f6090a13c8e8c036
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631470009.t1v-n-6f5efcd5-w-0.149719.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab1602a428d8d8a787ce3ccd99f06f469f364af5577876c9e3f99620fa77b150
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631470963.t1v-n-6f5efcd5-w-0.152003.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:50e917d0db6903e26691490233e0f741f95eeca101ecf64d1d27a6c39fded355
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631473889.t1v-n-6f5efcd5-w-0.155772.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c600d6ee8f2718c6a15b181f8ad7dc11e8f6462329a53e98cbbb682dee5bb8bf
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631505260.t1v-n-6f5efcd5-w-0.183749.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b682bf2e455564988153cad2bb6591643d83df9406ea4c6108cd960e0fd4d30
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631519483.t1v-n-6f5efcd5-w-0.197621.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4370fdadf8a5284e69b1cf8df34ce8b3d585ab507abb36eec6f73ba2bff1958
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631535266.t1v-n-6f5efcd5-w-0.215021.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e8630cbeb6794223d0d32dc870599b8637e315935867a69b0e4b74036ed89e4
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631541744.t1v-n-6f5efcd5-w-0.221830.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7393c226a4dfb55afe3650f8205dc23c09fda596183f5a37bb220ced7fd8470f
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631542427.t1v-n-6f5efcd5-w-0.223857.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb457eff41f4843c3a9a3fab1faf262aadb5fd06780b2d60a9d2e57a5d295c0a
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631544225.t1v-n-6f5efcd5-w-0.226802.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9db0b2619de390bea001a7e07c9f646fa52522af96e87887f1c6485e463efc61
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631545819.t1v-n-6f5efcd5-w-0.229738.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d807a080e13cc50150526c2d490fc48ca16d6129b1faa2c8347e25788e64cf46
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631547648.t1v-n-6f5efcd5-w-0.233322.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:29e2f67d9216d876ea21408c1d59ff8469efd1e129023d04034f8a1470be95b5
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631548699.t1v-n-6f5efcd5-w-0.235778.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3044160d2bc03ba7a0926415aee57436571c79c245ed08960605c1acb4b4ef19
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631550514.t1v-n-6f5efcd5-w-0.238915.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:65f79c3671f084b9ec9f6969b27a6dd77bb277ffaf9bc36eb876dc50a07b2c87
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631563532.t1v-n-6f5efcd5-w-0.253179.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:34528a2299052d833f74715dfd3b30d97a0d30a613d82c6799614b6f9db81c42
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631564788.t1v-n-6f5efcd5-w-0.255766.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac93aa647e492b6912b740b708984db5de27ee7dfae2846c827b21257cb48d66
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631565650.t1v-n-6f5efcd5-w-0.257966.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f551eada3ee392c877a18bc4900c4cf3ebda3003aa47b187ae36c086ac286a4d
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631566798.t1v-n-6f5efcd5-w-0.260486.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:92449ce4ea97d5aea6cdb2c304dc0a75f25703a8258ee1f6e494da7ab133db30
3
- size 40
 
 
 
 
run_dnlm_flax.py CHANGED
@@ -254,7 +254,7 @@ if __name__ == "__main__":
254
  data_args.dataset_path, split="train", cache_dir=model_args.cache_dir, streaming=True, use_auth_token=data_args.auth_token,
255
  )
256
  print("Loading eval data")
257
- # Test Dataset - Stream The Pile dataset
258
 
259
  eval_dataset = load_dataset(
260
  data_args.dataset_path,
@@ -579,7 +579,7 @@ if __name__ == "__main__":
579
  # ======================== Evaluating ==============================
580
  if step % training_args.eval_steps == 0 and step > 0:
581
  num_eval_batches = data_args.num_eval_samples // eval_batch_size
582
- breakpoint()
583
  #print(f'Step={step}')
584
  #print(f'Eval_step={training_args.eval_steps}')
585
  #print(f'Num_eval_batches={num_eval_batches}')
@@ -615,7 +615,8 @@ if __name__ == "__main__":
615
  eval_metrics = []
616
 
617
  # save checkpoint after each epoch and push checkpoint to the hub
618
- if jax.process_index() == 0 and training_args.save_strategy == "epoch":
 
619
  params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
620
  model.save_pretrained(
621
  training_args.output_dir,
@@ -631,7 +632,9 @@ if __name__ == "__main__":
631
 
632
 
633
  # save checkpoint on steps and push checkpoint to the hub
634
- if (training_args.save_steps % (step + 1)) == 0 and training_args.save_strategy == "steps":
 
 
635
  params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
636
  model.save_pretrained(
637
  training_args.output_dir,
 
254
  data_args.dataset_path, split="train", cache_dir=model_args.cache_dir, streaming=True, use_auth_token=data_args.auth_token,
255
  )
256
  print("Loading eval data")
257
+ # Test Dataset - Stream The Pile ataset
258
 
259
  eval_dataset = load_dataset(
260
  data_args.dataset_path,
 
579
  # ======================== Evaluating ==============================
580
  if step % training_args.eval_steps == 0 and step > 0:
581
  num_eval_batches = data_args.num_eval_samples // eval_batch_size
582
+ #breakpoint()
583
  #print(f'Step={step}')
584
  #print(f'Eval_step={training_args.eval_steps}')
585
  #print(f'Num_eval_batches={num_eval_batches}')
 
615
  eval_metrics = []
616
 
617
  # save checkpoint after each epoch and push checkpoint to the hub
618
+ if jax.process_index() == 0 and str(training_args.save_strategy) == "IntervalStrategy.EPOCH":
619
+ print(f'Saving model in training_args.output_dir')
620
  params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
621
  model.save_pretrained(
622
  training_args.output_dir,
 
632
 
633
 
634
  # save checkpoint on steps and push checkpoint to the hub
635
+ #print(f"should I save? training save steps = {training_args.save_steps} and step = {step}. Strategy = {training_args.save_strategy}, {str(training_args.save_strategy)}")
636
+ if (training_args.save_steps % (step + 1)) == 0 and str(training_args.save_strategy) == "IntervalStrategy.STEPS":
637
+ print(f'Saving model in training_args.output_dir')
638
  params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
639
  model.save_pretrained(
640
  training_args.output_dir,
train.sh CHANGED
@@ -13,8 +13,8 @@ python3 run_dnlm_flax.py \
13
  --per_device_eval_batch_size 1 \
14
  --logging_steps 8 \
15
  --num_train_steps 1000 \
16
- --eval_steps 1000 \
17
- --save_steps 1000 \
18
  --num_eval_samples 25 \
19
  --warmup_steps 30 \
20
  --learning_rate 1e-4 \
 
13
  --per_device_eval_batch_size 1 \
14
  --logging_steps 8 \
15
  --num_train_steps 1000 \
16
+ --eval_steps 100 \
17
+ --save_steps 100 \
18
  --num_eval_samples 25 \
19
  --warmup_steps 30 \
20
  --learning_rate 1e-4 \
train_ncc.sh CHANGED
@@ -1,8 +1,8 @@
1
  python3 run_dnlm_flax.py \
2
- --output_dir rotobart_output \
3
  --overwrite_output_dir \
4
  --dataset_path NbAiLab/NCC2 \
5
- --model_name_or_path rotobart \
6
  --tokenizer_name vocab-2/the_pile.model \
7
  --shuffle_buffer_size 100_000 \
8
  --do_train --do_eval \
@@ -13,10 +13,11 @@ python3 run_dnlm_flax.py \
13
  --per_device_eval_batch_size 1 \
14
  --logging_steps 8 \
15
  --num_train_steps 100000 \
16
- --eval_steps 100000 \
17
- --save_steps 1000 \
18
- --num_eval_samples 200 \
19
- --warmup_steps 30 \
20
  --learning_rate 1e-4 \
21
  --auth_token True \
 
22
  --use_bf16 \
 
1
  python3 run_dnlm_flax.py \
2
+ --output_dir "." \
3
  --overwrite_output_dir \
4
  --dataset_path NbAiLab/NCC2 \
5
+ --config_name rotobart \
6
  --tokenizer_name vocab-2/the_pile.model \
7
  --shuffle_buffer_size 100_000 \
8
  --do_train --do_eval \
 
13
  --per_device_eval_batch_size 1 \
14
  --logging_steps 8 \
15
  --num_train_steps 100000 \
16
+ --eval_steps 10000 \
17
+ --save_steps 10000 \
18
+ --num_eval_samples 500 \
19
+ --warmup_steps 5000 \
20
  --learning_rate 1e-4 \
21
  --auth_token True \
22
+ --save_strategy steps \
23
  --use_bf16 \
trainbak_ncc.sh ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python3 run_dnlm_flax.py \
2
+ --output_dir rotobart_output \
3
+ --overwrite_output_dir \
4
+ --dataset_path NbAiLab/NCC2 \
5
+ --model_name_or_path rotobart \
6
+ --tokenizer_name vocab-2/the_pile.model \
7
+ --shuffle_buffer_size 100_000 \
8
+ --do_train --do_eval \
9
+ --max_seq_length 1024 \
10
+ --encoder_layers 12 \
11
+ --decoder_layers 12 \
12
+ --per_device_train_batch_size 1 \
13
+ --per_device_eval_batch_size 1 \
14
+ --logging_steps 8 \
15
+ --num_train_steps 1000 \
16
+ --eval_steps 100 \
17
+ --save_steps 100 \
18
+ --num_eval_samples 200 \
19
+ --warmup_steps 30 \
20
+ --learning_rate 1e-4 \
21
+ --auth_token True \
22
+ --use_bf16 \