nipunsadvilkar
committed on
Commit
•
e307ddc
1
Parent(s):
30fff05
Large model with 10% validation set
Browse files
- flax_model.msgpack +1 -1
- run.sh +1 -0
- run_mlm_flax.py +3 -1
- test_marathi_model.py +2 -2
flax_model.msgpack
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 498796983
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:349d2b21007510b12abb5e8a79b6f4925875228eca5b82932d3968977422dd03
|
3 |
size 498796983
|
run.sh
CHANGED
@@ -5,6 +5,7 @@ HUB_TOKEN=`cat $HOME/.huggingface/token`
|
|
5 |
--config_name="${MODEL_DIR}" \
|
6 |
--tokenizer_name="${MODEL_DIR}" \
|
7 |
--train_file="/home/nipunsadvilkar/mr_data/mr_train_punctrm.csv" \
|
|
|
8 |
--max_seq_length="128" \
|
9 |
--weight_decay="0.01" \
|
10 |
--per_device_train_batch_size="128" \
|
5 |
--config_name="${MODEL_DIR}" \
|
6 |
--tokenizer_name="${MODEL_DIR}" \
|
7 |
--train_file="/home/nipunsadvilkar/mr_data/mr_train_punctrm.csv" \
|
8 |
+
--validation_split_percentage=10 \
|
9 |
--max_seq_length="128" \
|
10 |
--weight_decay="0.01" \
|
11 |
--per_device_train_batch_size="128" \
|
run_mlm_flax.py
CHANGED
@@ -364,6 +364,7 @@ if __name__ == "__main__":
|
|
364 |
split=f"train[{data_args.validation_split_percentage}%:]",
|
365 |
cache_dir=model_args.cache_dir,
|
366 |
)
|
|
|
367 |
|
368 |
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
369 |
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
@@ -691,6 +692,7 @@ if __name__ == "__main__":
|
|
691 |
if jax.process_index() == 0:
|
692 |
step_output_dir = f"checkpoint_{cur_step}"
|
693 |
os.mkdir(step_output_dir)
|
|
|
694 |
params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
|
695 |
model.save_pretrained(
|
696 |
step_output_dir,
|
@@ -698,5 +700,5 @@ if __name__ == "__main__":
|
|
698 |
push_to_hub=training_args.push_to_hub,
|
699 |
commit_message=f"Saving weights and logs of step {cur_step}",
|
700 |
)
|
701 |
-
with open(
|
702 |
f.write(to_bytes(state.opt_state))
|
364 |
split=f"train[{data_args.validation_split_percentage}%:]",
|
365 |
cache_dir=model_args.cache_dir,
|
366 |
)
|
367 |
+
print(datasets)
|
368 |
|
369 |
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
370 |
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
692 |
if jax.process_index() == 0:
|
693 |
step_output_dir = f"checkpoint_{cur_step}"
|
694 |
os.mkdir(step_output_dir)
|
695 |
+
print(f"Saving weights, optimizer state and logs of step {cur_step} at {step_output_dir}")
|
696 |
params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
|
697 |
model.save_pretrained(
|
698 |
step_output_dir,
|
700 |
push_to_hub=training_args.push_to_hub,
|
701 |
commit_message=f"Saving weights and logs of step {cur_step}",
|
702 |
)
|
703 |
+
with open("opt_state.msgpack", "wb") as f:
|
704 |
f.write(to_bytes(state.opt_state))
|
test_marathi_model.py
CHANGED
@@ -3,8 +3,8 @@ import pprint
|
|
3 |
|
4 |
from transformers import pipeline, AutoTokenizer, RobertaForMaskedLM
|
5 |
|
6 |
-
tokenizer = AutoTokenizer.from_pretrained("
|
7 |
-
model = RobertaForMaskedLM.from_pretrained("
|
8 |
|
9 |
nlp = pipeline("fill-mask", model=model, tokenizer=tokenizer)
|
10 |
# masked_input = "माझा नाव <mask> आहे"
|
3 |
|
4 |
from transformers import pipeline, AutoTokenizer, RobertaForMaskedLM
|
5 |
|
6 |
+
tokenizer = AutoTokenizer.from_pretrained("./")
|
7 |
+
model = RobertaForMaskedLM.from_pretrained("./", from_flax=True)
|
8 |
|
9 |
nlp = pipeline("fill-mask", model=model, tokenizer=tokenizer)
|
10 |
# masked_input = "माझा नाव <mask> आहे"
|