updating before training
Browse files
- rotobart_output/events.out.tfevents.1631348281.t1v-n-6f5efcd5-w-0.36723.0.v2 → events.out.tfevents.1631647502.t1v-n-6f5efcd5-w-0.361354.0.v2 +1 -1
- rotobart_output/events.out.tfevents.1631374032.t1v-n-6f5efcd5-w-0.59864.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631470009.t1v-n-6f5efcd5-w-0.149719.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631470963.t1v-n-6f5efcd5-w-0.152003.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631473889.t1v-n-6f5efcd5-w-0.155772.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631505260.t1v-n-6f5efcd5-w-0.183749.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631519483.t1v-n-6f5efcd5-w-0.197621.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631535266.t1v-n-6f5efcd5-w-0.215021.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631541744.t1v-n-6f5efcd5-w-0.221830.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631542427.t1v-n-6f5efcd5-w-0.223857.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631544225.t1v-n-6f5efcd5-w-0.226802.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631545819.t1v-n-6f5efcd5-w-0.229738.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631547648.t1v-n-6f5efcd5-w-0.233322.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631548699.t1v-n-6f5efcd5-w-0.235778.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631550514.t1v-n-6f5efcd5-w-0.238915.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631563532.t1v-n-6f5efcd5-w-0.253179.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631564788.t1v-n-6f5efcd5-w-0.255766.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631565650.t1v-n-6f5efcd5-w-0.257966.0.v2 +0 -3
- rotobart_output/events.out.tfevents.1631566798.t1v-n-6f5efcd5-w-0.260486.0.v2 +0 -3
- run_dnlm_flax.py +7 -4
- train.sh +2 -2
- train_ncc.sh +7 -6
- trainbak_ncc.sh +22 -0
rotobart_output/events.out.tfevents.1631348281.t1v-n-6f5efcd5-w-0.36723.0.v2 → events.out.tfevents.1631647502.t1v-n-6f5efcd5-w-0.361354.0.v2
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 40
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6184654b658f61b6d43b9278af1fb1af08f7ca2565de7c7089fbc99360a7917c
|
3 |
size 40
|
rotobart_output/events.out.tfevents.1631374032.t1v-n-6f5efcd5-w-0.59864.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:7313ab3ee173c965aaa6efc6f25ae5f47d5bcb9a7c204b36f6090a13c8e8c036
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631470009.t1v-n-6f5efcd5-w-0.149719.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:ab1602a428d8d8a787ce3ccd99f06f469f364af5577876c9e3f99620fa77b150
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631470963.t1v-n-6f5efcd5-w-0.152003.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:50e917d0db6903e26691490233e0f741f95eeca101ecf64d1d27a6c39fded355
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631473889.t1v-n-6f5efcd5-w-0.155772.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:c600d6ee8f2718c6a15b181f8ad7dc11e8f6462329a53e98cbbb682dee5bb8bf
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631505260.t1v-n-6f5efcd5-w-0.183749.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:9b682bf2e455564988153cad2bb6591643d83df9406ea4c6108cd960e0fd4d30
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631519483.t1v-n-6f5efcd5-w-0.197621.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:b4370fdadf8a5284e69b1cf8df34ce8b3d585ab507abb36eec6f73ba2bff1958
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631535266.t1v-n-6f5efcd5-w-0.215021.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:7e8630cbeb6794223d0d32dc870599b8637e315935867a69b0e4b74036ed89e4
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631541744.t1v-n-6f5efcd5-w-0.221830.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:7393c226a4dfb55afe3650f8205dc23c09fda596183f5a37bb220ced7fd8470f
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631542427.t1v-n-6f5efcd5-w-0.223857.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:fb457eff41f4843c3a9a3fab1faf262aadb5fd06780b2d60a9d2e57a5d295c0a
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631544225.t1v-n-6f5efcd5-w-0.226802.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:9db0b2619de390bea001a7e07c9f646fa52522af96e87887f1c6485e463efc61
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631545819.t1v-n-6f5efcd5-w-0.229738.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:d807a080e13cc50150526c2d490fc48ca16d6129b1faa2c8347e25788e64cf46
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631547648.t1v-n-6f5efcd5-w-0.233322.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:29e2f67d9216d876ea21408c1d59ff8469efd1e129023d04034f8a1470be95b5
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631548699.t1v-n-6f5efcd5-w-0.235778.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:3044160d2bc03ba7a0926415aee57436571c79c245ed08960605c1acb4b4ef19
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631550514.t1v-n-6f5efcd5-w-0.238915.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:65f79c3671f084b9ec9f6969b27a6dd77bb277ffaf9bc36eb876dc50a07b2c87
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631563532.t1v-n-6f5efcd5-w-0.253179.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:34528a2299052d833f74715dfd3b30d97a0d30a613d82c6799614b6f9db81c42
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631564788.t1v-n-6f5efcd5-w-0.255766.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:ac93aa647e492b6912b740b708984db5de27ee7dfae2846c827b21257cb48d66
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631565650.t1v-n-6f5efcd5-w-0.257966.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:f551eada3ee392c877a18bc4900c4cf3ebda3003aa47b187ae36c086ac286a4d
|
3 |
-
size 40
|
|
|
|
|
|
|
|
rotobart_output/events.out.tfevents.1631566798.t1v-n-6f5efcd5-w-0.260486.0.v2
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:92449ce4ea97d5aea6cdb2c304dc0a75f25703a8258ee1f6e494da7ab133db30
|
3 |
-
size 40
|
|
|
|
|
|
|
|
run_dnlm_flax.py
CHANGED
@@ -254,7 +254,7 @@ if __name__ == "__main__":
|
|
254 |
data_args.dataset_path, split="train", cache_dir=model_args.cache_dir, streaming=True, use_auth_token=data_args.auth_token,
|
255 |
)
|
256 |
print("Loading eval data")
|
257 |
-
# Test Dataset - Stream The Pile
|
258 |
|
259 |
eval_dataset = load_dataset(
|
260 |
data_args.dataset_path,
|
@@ -579,7 +579,7 @@ if __name__ == "__main__":
|
|
579 |
# ======================== Evaluating ==============================
|
580 |
if step % training_args.eval_steps == 0 and step > 0:
|
581 |
num_eval_batches = data_args.num_eval_samples // eval_batch_size
|
582 |
-
breakpoint()
|
583 |
#print(f'Step={step}')
|
584 |
#print(f'Eval_step={training_args.eval_steps}')
|
585 |
#print(f'Num_eval_batches={num_eval_batches}')
|
@@ -615,7 +615,8 @@ if __name__ == "__main__":
|
|
615 |
eval_metrics = []
|
616 |
|
617 |
# save checkpoint after each epoch and push checkpoint to the hub
|
618 |
-
if jax.process_index() == 0 and training_args.save_strategy == "
|
|
|
619 |
params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
|
620 |
model.save_pretrained(
|
621 |
training_args.output_dir,
|
@@ -631,7 +632,9 @@ if __name__ == "__main__":
|
|
631 |
|
632 |
|
633 |
# save checkpoint on steps and push checkpoint to the hub
|
634 |
-
|
|
|
|
|
635 |
params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
|
636 |
model.save_pretrained(
|
637 |
training_args.output_dir,
|
|
|
254 |
data_args.dataset_path, split="train", cache_dir=model_args.cache_dir, streaming=True, use_auth_token=data_args.auth_token,
|
255 |
)
|
256 |
print("Loading eval data")
|
257 |
+
# Test Dataset - Stream The Pile dataset
|
258 |
|
259 |
eval_dataset = load_dataset(
|
260 |
data_args.dataset_path,
|
|
|
579 |
# ======================== Evaluating ==============================
|
580 |
if step % training_args.eval_steps == 0 and step > 0:
|
581 |
num_eval_batches = data_args.num_eval_samples // eval_batch_size
|
582 |
+
#breakpoint()
|
583 |
#print(f'Step={step}')
|
584 |
#print(f'Eval_step={training_args.eval_steps}')
|
585 |
#print(f'Num_eval_batches={num_eval_batches}')
|
|
|
615 |
eval_metrics = []
|
616 |
|
617 |
# save checkpoint after each epoch and push checkpoint to the hub
|
618 |
+
if jax.process_index() == 0 and str(training_args.save_strategy) == "IntervalStrategy.EPOCH":
|
619 |
+
# Report the actual checkpoint directory (the f-string previously had no placeholder).
print(f'Saving model in {training_args.output_dir}')
|
620 |
params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
|
621 |
model.save_pretrained(
|
622 |
training_args.output_dir,
|
|
|
632 |
|
633 |
|
634 |
# save checkpoint on steps and push checkpoint to the hub
|
635 |
+
#print(f"should I save? training save steps = {training_args.save_steps} and step = {step}. Strategy = {training_args.save_strategy}, {str(training_args.save_strategy)}")
|
636 |
+
if (training_args.save_steps % (step + 1)) == 0 and str(training_args.save_strategy) == "IntervalStrategy.STEPS":
|
637 |
+
# Report the actual checkpoint directory (the f-string previously had no placeholder).
print(f'Saving model in {training_args.output_dir}')
|
638 |
params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
|
639 |
model.save_pretrained(
|
640 |
training_args.output_dir,
|
train.sh
CHANGED
@@ -13,8 +13,8 @@ python3 run_dnlm_flax.py \
|
|
13 |
--per_device_eval_batch_size 1 \
|
14 |
--logging_steps 8 \
|
15 |
--num_train_steps 1000 \
|
16 |
-
--eval_steps
|
17 |
-
--save_steps
|
18 |
--num_eval_samples 25 \
|
19 |
--warmup_steps 30 \
|
20 |
--learning_rate 1e-4 \
|
|
|
13 |
--per_device_eval_batch_size 1 \
|
14 |
--logging_steps 8 \
|
15 |
--num_train_steps 1000 \
|
16 |
+
--eval_steps 100 \
|
17 |
+
--save_steps 100 \
|
18 |
--num_eval_samples 25 \
|
19 |
--warmup_steps 30 \
|
20 |
--learning_rate 1e-4 \
|
train_ncc.sh
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
python3 run_dnlm_flax.py \
|
2 |
-
--output_dir
|
3 |
--overwrite_output_dir \
|
4 |
--dataset_path NbAiLab/NCC2 \
|
5 |
-
--
|
6 |
--tokenizer_name vocab-2/the_pile.model \
|
7 |
--shuffle_buffer_size 100_000 \
|
8 |
--do_train --do_eval \
|
@@ -13,10 +13,11 @@ python3 run_dnlm_flax.py \
|
|
13 |
--per_device_eval_batch_size 1 \
|
14 |
--logging_steps 8 \
|
15 |
--num_train_steps 100000 \
|
16 |
-
--eval_steps
|
17 |
-
--save_steps
|
18 |
-
--num_eval_samples
|
19 |
-
--warmup_steps
|
20 |
--learning_rate 1e-4 \
|
21 |
--auth_token True \
|
|
|
22 |
--use_bf16 \
|
|
|
1 |
python3 run_dnlm_flax.py \
|
2 |
+
--output_dir "." \
|
3 |
--overwrite_output_dir \
|
4 |
--dataset_path NbAiLab/NCC2 \
|
5 |
+
--config_name rotobart \
|
6 |
--tokenizer_name vocab-2/the_pile.model \
|
7 |
--shuffle_buffer_size 100_000 \
|
8 |
--do_train --do_eval \
|
|
|
13 |
--per_device_eval_batch_size 1 \
|
14 |
--logging_steps 8 \
|
15 |
--num_train_steps 100000 \
|
16 |
+
--eval_steps 10000 \
|
17 |
+
--save_steps 10000 \
|
18 |
+
--num_eval_samples 500 \
|
19 |
+
--warmup_steps 5000 \
|
20 |
--learning_rate 1e-4 \
|
21 |
--auth_token True \
|
22 |
+
--save_strategy steps \
|
23 |
--use_bf16 \
|
trainbak_ncc.sh
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
python3 run_dnlm_flax.py \
|
2 |
+
--output_dir rotobart_output \
|
3 |
+
--overwrite_output_dir \
|
4 |
+
--dataset_path NbAiLab/NCC2 \
|
5 |
+
--model_name_or_path rotobart \
|
6 |
+
--tokenizer_name vocab-2/the_pile.model \
|
7 |
+
--shuffle_buffer_size 100_000 \
|
8 |
+
--do_train --do_eval \
|
9 |
+
--max_seq_length 1024 \
|
10 |
+
--encoder_layers 12 \
|
11 |
+
--decoder_layers 12 \
|
12 |
+
--per_device_train_batch_size 1 \
|
13 |
+
--per_device_eval_batch_size 1 \
|
14 |
+
--logging_steps 8 \
|
15 |
+
--num_train_steps 1000 \
|
16 |
+
--eval_steps 100 \
|
17 |
+
--save_steps 100 \
|
18 |
+
--num_eval_samples 200 \
|
19 |
+
--warmup_steps 30 \
|
20 |
+
--learning_rate 1e-4 \
|
21 |
+
--auth_token True \
|
22 |
+
--use_bf16 \
|