yhavinga committed
Commit e867aeb
Parent: b037791

Saving weights and logs of step 300

flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7df1f5835f058622107709c8dc20a5d3452a8facd5dd852b33913d99ebc91e5a
+ oid sha256:457c5948252576d9d5252b28b79a754223d3dea5a24a77f5b2b7cb5189129499
  size 891548548
run_t5.sh CHANGED
@@ -16,9 +16,8 @@ mkdir -p "${MODEL_DIR}/runs"
  --preprocessing_num_workers="96" \
  --do_train --do_eval \
  --adafactor \
- --dtype="bfloat16" \
  --max_seq_length="512" \
- --gradient_accumulation_steps="4" \
+ --gradient_accumulation_steps="16" \
  --per_device_train_batch_size="32" \
  --per_device_eval_batch_size="32" \
  --learning_rate="5e-3" \
@@ -32,3 +31,7 @@ mkdir -p "${MODEL_DIR}/runs"
  #git add pytorch_model.bin
  #git commit -m "Update pytorch model after training"
  #git push origin main
+
+
+ # --dtype="bfloat16" \
+ # --resume_from_checkpoint="${MODEL_DIR}/ckpt-3300" \
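
The switch above from gradient_accumulation_steps=4 to 16 (with --dtype="bfloat16" now commented out) raises the effective batch size per optimizer update. A minimal sketch of that arithmetic, assuming the run sees every accelerator through jax.device_count() (this snippet and the device count are illustrative, not part of the repo):

import jax

# Flag values from run_t5.sh above.
per_device_train_batch_size = 32
gradient_accumulation_steps = 16  # raised from 4 in this commit

# Examples consumed per optimizer update:
# per-device batch * number of devices * accumulation steps.
effective_batch_size = (
    per_device_train_batch_size * jax.device_count() * gradient_accumulation_steps
)
print(f"devices={jax.device_count()}, effective batch size={effective_batch_size}")
# e.g. on an 8-core TPU this would be 32 * 8 * 16 = 4096 examples per update.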
run_t5_mlm_flax_custom_dataset.py CHANGED
@@ -722,6 +722,9 @@ if __name__ == "__main__":
 
      num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs
 
+     steps_per_epoch = len(tokenized_datasets['train']) // train_batch_size
+     total_train_steps = steps_per_epoch * num_epochs
+
      # Create learning rate schedule
 
      # See https://arxiv.org/pdf/2104.07705.pdf for rationale of choosing the peak at 6% of training steps
@@ -775,6 +778,11 @@ if __name__ == "__main__":
      # Setup train state
      state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)
 
+     if training_args.resume_from_checkpoint:
+         state, resume_step = restore_checkpoint(training_args.resume_from_checkpoint, state)
+     else:
+         resume_step = 0
+
      # Define gradient update step fn
      def train_step(state, batch, dropout_rng):
          dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
@@ -828,8 +836,7 @@ if __name__ == "__main__":
      # Replicate the train state on each device
      state = jax_utils.replicate(state)
 
-     steps_per_epoch = len(datasets['train']) // train_batch_size
-     total_train_steps = steps_per_epoch * num_epochs
+
 
      logger.info("***** Running training *****")
      logger.info(f" Num examples = {len(datasets['train'])}")
@@ -855,6 +862,11 @@ if __name__ == "__main__":
 
          # Gather the indexes for creating the batch and do a training step
          for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
+             cur_step = epoch * (num_train_samples // train_batch_size) + step
+             # skip to the step from which we are resuming
+             if cur_step < resume_step:
+                 continue
+
              samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
              model_inputs = data_collator(samples)
 
@@ -863,7 +875,6 @@ if __name__ == "__main__":
              state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
              train_metrics.append(train_metric)
 
-             cur_step = epoch * (num_train_samples // train_batch_size) + step
 
              if cur_step % training_args.logging_steps * grad_accum_steps == 0 and cur_step > 0:
                  # Save metrics
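
The resume logic added above calls restore_checkpoint, whose definition is not part of the hunks shown. A minimal sketch of what such a helper could look like with flax.training.checkpoints, assuming the checkpoint stores the whole TrainState (the repo's actual implementation may differ):

from flax.training import checkpoints

def restore_checkpoint(ckpt_path, state):
    # Restore the saved TrainState; Flax returns `state` unchanged if no checkpoint is found.
    restored = checkpoints.restore_checkpoint(ckpt_path, target=state)
    # The optimizer step stored in the TrainState is the step to resume from.
    return restored, int(restored.step)

# Illustrative saving counterpart (the "ckpt-3300" path in run_t5.sh suggests a "ckpt-" prefix):
# checkpoints.save_checkpoint(model_dir, jax_utils.unreplicate(state), step=cur_step, prefix="ckpt-")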
runs/Jul10_07-37-20_t1v-n-0e7426e8-w-0/events.out.tfevents.1625902752.t1v-n-0e7426e8-w-0.18397.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1aa4fd14ba6d0007ac2b4c7ad5f7b03ab486b3899ece3eba1fefe852923f2366
+ size 40
runs/Jul10_07-45-49_t1v-n-0e7426e8-w-0/events.out.tfevents.1625903173.t1v-n-0e7426e8-w-0.20563.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9086b97ea9ba59e96e4c66b26c205fe1207d0a94ab355127a1e4f8078d84a269
+ size 45399