pere committed on
Commit
ebb3343
1 Parent(s): e565538

updating before training

Browse files
Files changed (23) hide show
  1. rotobart_output/events.out.tfevents.1631348281.t1v-n-6f5efcd5-w-0.36723.0.v2 → events.out.tfevents.1631647502.t1v-n-6f5efcd5-w-0.361354.0.v2 +1 -1
  2. rotobart_output/events.out.tfevents.1631374032.t1v-n-6f5efcd5-w-0.59864.0.v2 +0 -3
  3. rotobart_output/events.out.tfevents.1631470009.t1v-n-6f5efcd5-w-0.149719.0.v2 +0 -3
  4. rotobart_output/events.out.tfevents.1631470963.t1v-n-6f5efcd5-w-0.152003.0.v2 +0 -3
  5. rotobart_output/events.out.tfevents.1631473889.t1v-n-6f5efcd5-w-0.155772.0.v2 +0 -3
  6. rotobart_output/events.out.tfevents.1631505260.t1v-n-6f5efcd5-w-0.183749.0.v2 +0 -3
  7. rotobart_output/events.out.tfevents.1631519483.t1v-n-6f5efcd5-w-0.197621.0.v2 +0 -3
  8. rotobart_output/events.out.tfevents.1631535266.t1v-n-6f5efcd5-w-0.215021.0.v2 +0 -3
  9. rotobart_output/events.out.tfevents.1631541744.t1v-n-6f5efcd5-w-0.221830.0.v2 +0 -3
  10. rotobart_output/events.out.tfevents.1631542427.t1v-n-6f5efcd5-w-0.223857.0.v2 +0 -3
  11. rotobart_output/events.out.tfevents.1631544225.t1v-n-6f5efcd5-w-0.226802.0.v2 +0 -3
  12. rotobart_output/events.out.tfevents.1631545819.t1v-n-6f5efcd5-w-0.229738.0.v2 +0 -3
  13. rotobart_output/events.out.tfevents.1631547648.t1v-n-6f5efcd5-w-0.233322.0.v2 +0 -3
  14. rotobart_output/events.out.tfevents.1631548699.t1v-n-6f5efcd5-w-0.235778.0.v2 +0 -3
  15. rotobart_output/events.out.tfevents.1631550514.t1v-n-6f5efcd5-w-0.238915.0.v2 +0 -3
  16. rotobart_output/events.out.tfevents.1631563532.t1v-n-6f5efcd5-w-0.253179.0.v2 +0 -3
  17. rotobart_output/events.out.tfevents.1631564788.t1v-n-6f5efcd5-w-0.255766.0.v2 +0 -3
  18. rotobart_output/events.out.tfevents.1631565650.t1v-n-6f5efcd5-w-0.257966.0.v2 +0 -3
  19. rotobart_output/events.out.tfevents.1631566798.t1v-n-6f5efcd5-w-0.260486.0.v2 +0 -3
  20. run_dnlm_flax.py +7 -4
  21. train.sh +2 -2
  22. train_ncc.sh +7 -6
  23. trainbak_ncc.sh +22 -0
rotobart_output/events.out.tfevents.1631348281.t1v-n-6f5efcd5-w-0.36723.0.v2 → events.out.tfevents.1631647502.t1v-n-6f5efcd5-w-0.361354.0.v2 RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96b8400771e3cbb61f4021e90a614e7af1bb5cd08e8f706bed4150bf954774d6
3
  size 40
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6184654b658f61b6d43b9278af1fb1af08f7ca2565de7c7089fbc99360a7917c
3
  size 40
rotobart_output/events.out.tfevents.1631374032.t1v-n-6f5efcd5-w-0.59864.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7313ab3ee173c965aaa6efc6f25ae5f47d5bcb9a7c204b36f6090a13c8e8c036
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631470009.t1v-n-6f5efcd5-w-0.149719.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab1602a428d8d8a787ce3ccd99f06f469f364af5577876c9e3f99620fa77b150
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631470963.t1v-n-6f5efcd5-w-0.152003.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:50e917d0db6903e26691490233e0f741f95eeca101ecf64d1d27a6c39fded355
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631473889.t1v-n-6f5efcd5-w-0.155772.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c600d6ee8f2718c6a15b181f8ad7dc11e8f6462329a53e98cbbb682dee5bb8bf
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631505260.t1v-n-6f5efcd5-w-0.183749.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b682bf2e455564988153cad2bb6591643d83df9406ea4c6108cd960e0fd4d30
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631519483.t1v-n-6f5efcd5-w-0.197621.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4370fdadf8a5284e69b1cf8df34ce8b3d585ab507abb36eec6f73ba2bff1958
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631535266.t1v-n-6f5efcd5-w-0.215021.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e8630cbeb6794223d0d32dc870599b8637e315935867a69b0e4b74036ed89e4
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631541744.t1v-n-6f5efcd5-w-0.221830.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7393c226a4dfb55afe3650f8205dc23c09fda596183f5a37bb220ced7fd8470f
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631542427.t1v-n-6f5efcd5-w-0.223857.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb457eff41f4843c3a9a3fab1faf262aadb5fd06780b2d60a9d2e57a5d295c0a
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631544225.t1v-n-6f5efcd5-w-0.226802.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9db0b2619de390bea001a7e07c9f646fa52522af96e87887f1c6485e463efc61
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631545819.t1v-n-6f5efcd5-w-0.229738.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d807a080e13cc50150526c2d490fc48ca16d6129b1faa2c8347e25788e64cf46
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631547648.t1v-n-6f5efcd5-w-0.233322.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:29e2f67d9216d876ea21408c1d59ff8469efd1e129023d04034f8a1470be95b5
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631548699.t1v-n-6f5efcd5-w-0.235778.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3044160d2bc03ba7a0926415aee57436571c79c245ed08960605c1acb4b4ef19
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631550514.t1v-n-6f5efcd5-w-0.238915.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:65f79c3671f084b9ec9f6969b27a6dd77bb277ffaf9bc36eb876dc50a07b2c87
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631563532.t1v-n-6f5efcd5-w-0.253179.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:34528a2299052d833f74715dfd3b30d97a0d30a613d82c6799614b6f9db81c42
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631564788.t1v-n-6f5efcd5-w-0.255766.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac93aa647e492b6912b740b708984db5de27ee7dfae2846c827b21257cb48d66
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631565650.t1v-n-6f5efcd5-w-0.257966.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f551eada3ee392c877a18bc4900c4cf3ebda3003aa47b187ae36c086ac286a4d
3
- size 40
 
 
 
 
rotobart_output/events.out.tfevents.1631566798.t1v-n-6f5efcd5-w-0.260486.0.v2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:92449ce4ea97d5aea6cdb2c304dc0a75f25703a8258ee1f6e494da7ab133db30
3
- size 40
 
 
 
 
run_dnlm_flax.py CHANGED
@@ -254,7 +254,7 @@ if __name__ == "__main__":
254
  data_args.dataset_path, split="train", cache_dir=model_args.cache_dir, streaming=True, use_auth_token=data_args.auth_token,
255
  )
256
  print("Loading eval data")
257
- # Test Dataset - Stream The Pile dataset
258
 
259
  eval_dataset = load_dataset(
260
  data_args.dataset_path,
@@ -579,7 +579,7 @@ if __name__ == "__main__":
579
  # ======================== Evaluating ==============================
580
  if step % training_args.eval_steps == 0 and step > 0:
581
  num_eval_batches = data_args.num_eval_samples // eval_batch_size
582
- breakpoint()
583
  #print(f'Step={step}')
584
  #print(f'Eval_step={training_args.eval_steps}')
585
  #print(f'Num_eval_batches={num_eval_batches}')
@@ -615,7 +615,8 @@ if __name__ == "__main__":
615
  eval_metrics = []
616
 
617
  # save checkpoint after each epoch and push checkpoint to the hub
618
- if jax.process_index() == 0 and training_args.save_strategy == "epoch":
 
619
  params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
620
  model.save_pretrained(
621
  training_args.output_dir,
@@ -631,7 +632,9 @@ if __name__ == "__main__":
631
 
632
 
633
  # save checkpoint on steps and push checkpoint to the hub
634
- if (training_args.save_steps % (step + 1)) == 0 and training_args.save_strategy == "steps":
 
 
635
  params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
636
  model.save_pretrained(
637
  training_args.output_dir,
 
254
  data_args.dataset_path, split="train", cache_dir=model_args.cache_dir, streaming=True, use_auth_token=data_args.auth_token,
255
  )
256
  print("Loading eval data")
257
+ # Test Dataset - Stream The Pile ataset
258
 
259
  eval_dataset = load_dataset(
260
  data_args.dataset_path,
 
579
  # ======================== Evaluating ==============================
580
  if step % training_args.eval_steps == 0 and step > 0:
581
  num_eval_batches = data_args.num_eval_samples // eval_batch_size
582
+ #breakpoint()
583
  #print(f'Step={step}')
584
  #print(f'Eval_step={training_args.eval_steps}')
585
  #print(f'Num_eval_batches={num_eval_batches}')
 
615
  eval_metrics = []
616
 
617
  # save checkpoint after each epoch and push checkpoint to the hub
618
+ if jax.process_index() == 0 and str(training_args.save_strategy) == "IntervalStrategy.EPOCH":
619
+ print(f'Saving model in training_args.output_dir')
620
  params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
621
  model.save_pretrained(
622
  training_args.output_dir,
 
632
 
633
 
634
  # save checkpoint on steps and push checkpoint to the hub
635
+ #print(f"should I save? training save steps = {training_args.save_steps} and step = {step}. Strategy = {training_args.save_strategy}, {str(training_args.save_strategy)}")
636
+ if (training_args.save_steps % (step + 1)) == 0 and str(training_args.save_strategy) == "IntervalStrategy.STEPS":
637
+ print(f'Saving model in training_args.output_dir')
638
  params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
639
  model.save_pretrained(
640
  training_args.output_dir,
train.sh CHANGED
@@ -13,8 +13,8 @@ python3 run_dnlm_flax.py \
13
  --per_device_eval_batch_size 1 \
14
  --logging_steps 8 \
15
  --num_train_steps 1000 \
16
- --eval_steps 1000 \
17
- --save_steps 1000 \
18
  --num_eval_samples 25 \
19
  --warmup_steps 30 \
20
  --learning_rate 1e-4 \
 
13
  --per_device_eval_batch_size 1 \
14
  --logging_steps 8 \
15
  --num_train_steps 1000 \
16
+ --eval_steps 100 \
17
+ --save_steps 100 \
18
  --num_eval_samples 25 \
19
  --warmup_steps 30 \
20
  --learning_rate 1e-4 \
train_ncc.sh CHANGED
@@ -1,8 +1,8 @@
1
  python3 run_dnlm_flax.py \
2
- --output_dir rotobart_output \
3
  --overwrite_output_dir \
4
  --dataset_path NbAiLab/NCC2 \
5
- --model_name_or_path rotobart \
6
  --tokenizer_name vocab-2/the_pile.model \
7
  --shuffle_buffer_size 100_000 \
8
  --do_train --do_eval \
@@ -13,10 +13,11 @@ python3 run_dnlm_flax.py \
13
  --per_device_eval_batch_size 1 \
14
  --logging_steps 8 \
15
  --num_train_steps 100000 \
16
- --eval_steps 100000 \
17
- --save_steps 1000 \
18
- --num_eval_samples 200 \
19
- --warmup_steps 30 \
20
  --learning_rate 1e-4 \
21
  --auth_token True \
 
22
  --use_bf16 \
 
1
  python3 run_dnlm_flax.py \
2
+ --output_dir "." \
3
  --overwrite_output_dir \
4
  --dataset_path NbAiLab/NCC2 \
5
+ --config_name rotobart \
6
  --tokenizer_name vocab-2/the_pile.model \
7
  --shuffle_buffer_size 100_000 \
8
  --do_train --do_eval \
 
13
  --per_device_eval_batch_size 1 \
14
  --logging_steps 8 \
15
  --num_train_steps 100000 \
16
+ --eval_steps 10000 \
17
+ --save_steps 10000 \
18
+ --num_eval_samples 500 \
19
+ --warmup_steps 5000 \
20
  --learning_rate 1e-4 \
21
  --auth_token True \
22
+ --save_strategy steps \
23
  --use_bf16 \
trainbak_ncc.sh ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python3 run_dnlm_flax.py \
2
+ --output_dir rotobart_output \
3
+ --overwrite_output_dir \
4
+ --dataset_path NbAiLab/NCC2 \
5
+ --model_name_or_path rotobart \
6
+ --tokenizer_name vocab-2/the_pile.model \
7
+ --shuffle_buffer_size 100_000 \
8
+ --do_train --do_eval \
9
+ --max_seq_length 1024 \
10
+ --encoder_layers 12 \
11
+ --decoder_layers 12 \
12
+ --per_device_train_batch_size 1 \
13
+ --per_device_eval_batch_size 1 \
14
+ --logging_steps 8 \
15
+ --num_train_steps 1000 \
16
+ --eval_steps 100 \
17
+ --save_steps 100 \
18
+ --num_eval_samples 200 \
19
+ --warmup_steps 30 \
20
+ --learning_rate 1e-4 \
21
+ --auth_token True \
22
+ --use_bf16 \