Large model with 10% validation set

Files changed (4)

flax_model.msgpack CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f84da6fceb9936906a58eafe7ef1ceb4e7f94978a833b73bd88f0f0bb2c67b50
 size 498796983

 version https://git-lfs.github.com/spec/v1
+oid sha256:349d2b21007510b12abb5e8a79b6f4925875228eca5b82932d3968977422dd03
 size 498796983

run.sh CHANGED Viewed

@@ -5,6 +5,7 @@ HUB_TOKEN=`cat $HOME/.huggingface/token`
     --config_name="${MODEL_DIR}" \
     --tokenizer_name="${MODEL_DIR}" \
     --train_file="/home/nipunsadvilkar/mr_data/mr_train_punctrm.csv" \
     --max_seq_length="128" \
     --weight_decay="0.01" \
     --per_device_train_batch_size="128" \

     --config_name="${MODEL_DIR}" \
     --tokenizer_name="${MODEL_DIR}" \
     --train_file="/home/nipunsadvilkar/mr_data/mr_train_punctrm.csv" \
+    --validation_split_percentage=10 \
     --max_seq_length="128" \
     --weight_decay="0.01" \
     --per_device_train_batch_size="128" \

run_mlm_flax.py CHANGED Viewed

@@ -364,6 +364,7 @@ if __name__ == "__main__":
                 split=f"train[{data_args.validation_split_percentage}%:]",
                 cache_dir=model_args.cache_dir,
             )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -691,6 +692,7 @@ if __name__ == "__main__":
                 if jax.process_index() == 0:
                     step_output_dir = f"checkpoint_{cur_step}"
                     os.mkdir(step_output_dir)
                     params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
                     model.save_pretrained(
                         step_output_dir,
@@ -698,5 +700,5 @@ if __name__ == "__main__":
                         push_to_hub=training_args.push_to_hub,
                         commit_message=f"Saving weights and logs of step {cur_step}",
                     )
-                    with open(f"{step_output_dir}/opt_state_{cur_step}.msgpack", "wb") as f:
                         f.write(to_bytes(state.opt_state))

                 split=f"train[{data_args.validation_split_percentage}%:]",
                 cache_dir=model_args.cache_dir,
             )
+            print(datasets)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
                 if jax.process_index() == 0:
                     step_output_dir = f"checkpoint_{cur_step}"
                     os.mkdir(step_output_dir)
+                    print(f"Saving weights, optimizer state and logs of step {cur_step} at {step_output_dir}")
                     params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
                     model.save_pretrained(
                         step_output_dir,
                         push_to_hub=training_args.push_to_hub,
                         commit_message=f"Saving weights and logs of step {cur_step}",
                     )
+                    with open("opt_state.msgpack", "wb") as f:
                         f.write(to_bytes(state.opt_state))

test_marathi_model.py CHANGED Viewed

@@ -3,8 +3,8 @@ import pprint
 from transformers import pipeline, AutoTokenizer, RobertaForMaskedLM
-tokenizer = AutoTokenizer.from_pretrained("/home/nipunsadvilkar/roberta-base-mr/checkpoint_45000/")
-model = RobertaForMaskedLM.from_pretrained("/home/nipunsadvilkar/roberta-base-mr/checkpoint_45000/", from_flax=True)
 nlp = pipeline("fill-mask", model=model, tokenizer=tokenizer)
 # masked_input = "माझा नाव <mask> आहे"

 from transformers import pipeline, AutoTokenizer, RobertaForMaskedLM
+tokenizer = AutoTokenizer.from_pretrained("./")
+model = RobertaForMaskedLM.from_pretrained("./", from_flax=True)
 nlp = pipeline("fill-mask", model=model, tokenizer=tokenizer)
 # masked_input = "माझा नाव <mask> आहे"