nipunsadvilkar committed on
Commit
e307ddc
1 Parent(s): 30fff05

Large model with 10% validation set

Browse files
Files changed (4) hide show
  1. flax_model.msgpack +1 -1
  2. run.sh +1 -0
  3. run_mlm_flax.py +3 -1
  4. test_marathi_model.py +2 -2
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f84da6fceb9936906a58eafe7ef1ceb4e7f94978a833b73bd88f0f0bb2c67b50
3
  size 498796983
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:349d2b21007510b12abb5e8a79b6f4925875228eca5b82932d3968977422dd03
3
  size 498796983
run.sh CHANGED
@@ -5,6 +5,7 @@ HUB_TOKEN=`cat $HOME/.huggingface/token`
5
  --config_name="${MODEL_DIR}" \
6
  --tokenizer_name="${MODEL_DIR}" \
7
  --train_file="/home/nipunsadvilkar/mr_data/mr_train_punctrm.csv" \
 
8
  --max_seq_length="128" \
9
  --weight_decay="0.01" \
10
  --per_device_train_batch_size="128" \
5
  --config_name="${MODEL_DIR}" \
6
  --tokenizer_name="${MODEL_DIR}" \
7
  --train_file="/home/nipunsadvilkar/mr_data/mr_train_punctrm.csv" \
8
+ --validation_split_percentage=10 \
9
  --max_seq_length="128" \
10
  --weight_decay="0.01" \
11
  --per_device_train_batch_size="128" \
run_mlm_flax.py CHANGED
@@ -364,6 +364,7 @@ if __name__ == "__main__":
364
  split=f"train[{data_args.validation_split_percentage}%:]",
365
  cache_dir=model_args.cache_dir,
366
  )
 
367
 
368
  # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
369
  # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -691,6 +692,7 @@ if __name__ == "__main__":
691
  if jax.process_index() == 0:
692
  step_output_dir = f"checkpoint_{cur_step}"
693
  os.mkdir(step_output_dir)
 
694
  params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
695
  model.save_pretrained(
696
  step_output_dir,
@@ -698,5 +700,5 @@ if __name__ == "__main__":
698
  push_to_hub=training_args.push_to_hub,
699
  commit_message=f"Saving weights and logs of step {cur_step}",
700
  )
701
- with open(f"{step_output_dir}/opt_state_{cur_step}.msgpack", "wb") as f:
702
  f.write(to_bytes(state.opt_state))
364
  split=f"train[{data_args.validation_split_percentage}%:]",
365
  cache_dir=model_args.cache_dir,
366
  )
367
+ print(datasets)
368
 
369
  # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
370
  # https://huggingface.co/docs/datasets/loading_datasets.html.
692
  if jax.process_index() == 0:
693
  step_output_dir = f"checkpoint_{cur_step}"
694
  os.mkdir(step_output_dir)
695
+ print(f"Saving weights, optimizer state and logs of step {cur_step} at {step_output_dir}")
696
  params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
697
  model.save_pretrained(
698
  step_output_dir,
700
  push_to_hub=training_args.push_to_hub,
701
  commit_message=f"Saving weights and logs of step {cur_step}",
702
  )
703
+ with open("opt_state.msgpack", "wb") as f:
704
  f.write(to_bytes(state.opt_state))
test_marathi_model.py CHANGED
@@ -3,8 +3,8 @@ import pprint
3
 
4
  from transformers import pipeline, AutoTokenizer, RobertaForMaskedLM
5
 
6
- tokenizer = AutoTokenizer.from_pretrained("/home/nipunsadvilkar/roberta-base-mr/checkpoint_45000/")
7
- model = RobertaForMaskedLM.from_pretrained("/home/nipunsadvilkar/roberta-base-mr/checkpoint_45000/", from_flax=True)
8
 
9
  nlp = pipeline("fill-mask", model=model, tokenizer=tokenizer)
10
  # masked_input = "माझा नाव <mask> आहे"
3
 
4
  from transformers import pipeline, AutoTokenizer, RobertaForMaskedLM
5
 
6
+ tokenizer = AutoTokenizer.from_pretrained("./")
7
+ model = RobertaForMaskedLM.from_pretrained("./", from_flax=True)
8
 
9
  nlp = pipeline("fill-mask", model=model, tokenizer=tokenizer)
10
  # masked_input = "माझा नाव <mask> आहे"