yhavinga committed
Commit: d9993eb
Parent: a1d2f2c

Update scripts to work around collator ValueError. Update weights

config.json CHANGED
@@ -1,4 +1,5 @@
 {
+  "_name_or_path": ".",
   "architectures": [
     "T5ForConditionalGeneration"
   ],
@@ -50,6 +51,7 @@
       "prefix": "translate English to Romanian: "
     }
   },
+  "torch_dtype": "float32",
   "transformers_version": "4.9.0.dev0",
   "use_cache": true,
   "vocab_size": 32103
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:02c8aedd34c528d3a7806d216941cc23732a751a8d687f8bf1db06eb1e1e75a3
+oid sha256:8c8d5a4eb1275b4c679b148f38edb974772997a3925809f39095204009f83502
 size 891548548
opt_state.msgpack ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97c0ff372805930fa4d7e81ae09094b7daf3cc2c1ba06224fc522a8e672af91a
+size 1985609
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1a8f60fdc3ad43a82bab7ec3dcaf1138179d7508798267becb15426d86b9385f
+oid sha256:782edc5c7aa8aa66320a3417abff572760287ee6a7759f1867486d2217563650
 size 891650495
run_t5.sh CHANGED
@@ -7,28 +7,42 @@ mkdir -p "${MODEL_DIR}/runs"
 # T5 paper lr 0.01 with batch size 128
 # We have a batch size of 8 devices * 32 = 256, so lr = 0.01/2
 
-./run_t5_mlm_flax_custom_dataset.py \
-    --output_dir="${MODEL_DIR}" \
-    --model_type="t5" \
-    --config_name="flax-community/${MODEL}" \
-    --tokenizer_name="${MODEL_DIR}" \
-    --preprocessing_num_workers="96" \
-    --do_train --do_eval \
-    --adafactor \
-    --max_seq_length="512" \
-    --per_device_train_batch_size="32" \
-    --per_device_eval_batch_size="32" \
-    --learning_rate="5e-3" \
-    --dtype="bfloat16" \
-    --overwrite_output_dir \
-    --num_train_epochs="3" \
-    --logging_steps="50" \
-    --save_steps="2000" \
-    --eval_steps="10000000" \
-    --resume_from_checkpoint="${MODEL_DIR}/ckpt-18000" \
-    --warmup_steps="3413" \
-    --push_to_hub
-
+while true; do
+
+  # Set the seed to a random value before each run, so data shuffling per epoch differs between runs.
+  # This kills reproducibility, but is required as long as a ValueError can be raised during training.
+  SEED=$RANDOM
+
+  ./run_t5_mlm_flax_custom_dataset.py \
+      --output_dir="${MODEL_DIR}" \
+      --model_type="t5" \
+      --config_name="flax-community/${MODEL}" \
+      --tokenizer_name="${MODEL_DIR}" \
+      --seed="${SEED}" \
+      --preprocessing_num_workers="96" \
+      --do_train --do_eval \
+      --adafactor \
+      --max_seq_length="512" \
+      --per_device_train_batch_size="32" \
+      --per_device_eval_batch_size="32" \
+      --learning_rate="5e-3" \
+      --dtype="bfloat16" \
+      --overwrite_output_dir \
+      --num_train_epochs="3" \
+      --logging_steps="50" \
+      --save_steps="501" \
+      --eval_steps="10000000" \
+      --resume_from_checkpoint="${MODEL_DIR}" \
+      --warmup_steps="3413"
+
+  # \
+  # --push_to_hub
+
+  echo "RESTARTING"
+  sleep 20
+done
+#
+# \
 
 
 #git add pytorch_model.bin
@@ -37,3 +51,4 @@ mkdir -p "${MODEL_DIR}/runs"
 
 # --gradient_accumulation_steps="2" \
 
+# --resume_from_checkpoint="${MODEL_DIR}/ckpt-18000" \
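
The loop above simply restarts training with a fresh seed whenever the script dies on the collator ValueError, resuming from the latest checkpoint in the output directory. For reference, the same pattern could be driven from Python; this is a hypothetical sketch (wrapper script and argument handling are assumptions, flags abridged from the command above), not part of the repository:

import random
import subprocess
import sys
import time

MODEL_DIR = sys.argv[1]  # assumption: model dir passed as the first argument

while True:
    # Fresh seed each attempt, mirroring SEED=$RANDOM in run_t5.sh (0..32767).
    seed = random.randrange(2**15)
    result = subprocess.run([
        "./run_t5_mlm_flax_custom_dataset.py",
        f"--output_dir={MODEL_DIR}",
        f"--seed={seed}",
        f"--resume_from_checkpoint={MODEL_DIR}",
        # ... remaining flags as in run_t5.sh ...
    ])
    if result.returncode == 0:
        break  # clean exit: training completed without the ValueError
    print("RESTARTING")
    time.sleep(20)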
run_t5_mlm_flax_custom_dataset.py CHANGED
@@ -432,6 +432,11 @@ def save_checkpoint(model, save_dir, state, with_opt: bool = True):
         push_to_hub=training_args.push_to_hub,
         commit_message=f"Saving weights and logs of step {cur_step}",
     )
+    if with_opt:
+        with open(os.path.join(training_args.output_dir, "opt_state.msgpack"), "wb") as f:
+            f.write(to_bytes(state.opt_state))
+        with open(os.path.join(training_args.output_dir, "training_state.json"), "w") as f:
+            json.dump({"step": state.step.item()}, f)
     logger.info("checkpoint saved")
 
 
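These two files are what the restart loop in run_t5.sh resumes from: opt_state.msgpack holds the optimizer state serialized with flax.serialization.to_bytes, and training_state.json holds the step counter. A minimal sketch of the matching load path (restore_checkpoint is a hypothetical helper, not the script's actual function; it assumes a Flax train state with the same pytree structure as at save time):

import json
import os

from flax.serialization import from_bytes

def restore_checkpoint(state, output_dir):
    # Counterpart to the save code above: from_bytes deserializes the
    # msgpack bytes into the same pytree structure as state.opt_state.
    with open(os.path.join(output_dir, "opt_state.msgpack"), "rb") as f:
        opt_state = from_bytes(state.opt_state, f.read())
    with open(os.path.join(output_dir, "training_state.json")) as f:
        step = json.load(f)["step"]
    return state.replace(opt_state=opt_state, step=step)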
runs/{Jul11_17-06-36_t1v-n-0e7426e8-w-0/events.out.tfevents.1626023202.t1v-n-0e7426e8-w-0.178001.3.v2 → Jul12_06-43-08_t1v-n-0e7426e8-w-0/events.out.tfevents.1626072193.t1v-n-0e7426e8-w-0.238699.3.v2} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0b89824cdb72fe97627209c68074b163e725d00349a36ed38b233e7d579e1b92
-size 296685
+oid sha256:9f5f6fcc83f8cf7fac87cc276fa00a02c9ce4e252c6bb69a3988452bed73f67e
+size 200238
training_state.json ADDED
@@ -0,0 +1 @@
+{"step": 15004}