pere committed on
Commit
1c494ec
·
1 Parent(s): 6238d83

first submit

Browse files
Files changed (2) hide show
  1. run_128.sh +1 -0
  2. run_mlm_flax.py +15 -9
run_128.sh CHANGED
@@ -19,6 +19,7 @@ python run_mlm_flax.py \
19
  --logging_steps="1000" \
20
  --save_steps="1000" \
21
  --eval_steps="1000" \
 
22
  --do_train \
23
  --do_eval \
24
  --dtype="bfloat16" \
 
19
  --logging_steps="1000" \
20
  --save_steps="1000" \
21
  --eval_steps="1000" \
22
+ --auth_token="True" \
23
  --do_train \
24
  --do_eval \
25
  --dtype="bfloat16" \
run_mlm_flax.py CHANGED
@@ -224,6 +224,10 @@ class DataTrainingArguments:
224
  default=False,
225
  metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
226
  )
 
 
 
 
227
 
228
  def __post_init__(self):
229
  if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -376,14 +380,14 @@ def main():
376
  set_seed(training_args.seed)
377
 
378
  # Handle the repository creation
379
- if training_args.push_to_hub:
380
- if training_args.hub_model_id is None:
381
- repo_name = get_full_repo_name(
382
- Path(training_args.output_dir).absolute().name, token=training_args.hub_token
383
- )
384
- else:
385
- repo_name = training_args.hub_model_id
386
- repo = Repository(training_args.output_dir, clone_from=repo_name)
387
 
388
  # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
389
  # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
@@ -396,7 +400,7 @@ def main():
396
  # download the dataset.
397
  if data_args.dataset_name is not None:
398
  # Downloading and loading a dataset from the hub.
399
- datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
400
 
401
  if "validation" not in datasets.keys():
402
  datasets["validation"] = load_dataset(
@@ -404,12 +408,14 @@ def main():
404
  data_args.dataset_config_name,
405
  split=f"train[:{data_args.validation_split_percentage}%]",
406
  cache_dir=model_args.cache_dir,
 
407
  )
408
  datasets["train"] = load_dataset(
409
  data_args.dataset_name,
410
  data_args.dataset_config_name,
411
  split=f"train[{data_args.validation_split_percentage}%:]",
412
  cache_dir=model_args.cache_dir,
 
413
  )
414
  else:
415
  data_files = {}
 
224
  default=False,
225
  metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
226
  )
227
+
228
+ auth_token: bool = field(
229
+ default=False, metadata={"help": "Use authorisation token"}
230
+ )
231
 
232
  def __post_init__(self):
233
  if self.dataset_name is None and self.train_file is None and self.validation_file is None:
 
380
  set_seed(training_args.seed)
381
 
382
  # Handle the repository creation
383
+ # if training_args.push_to_hub:
384
+ # if training_args.hub_model_id is None:
385
+ # repo_name = get_full_repo_name(
386
+ # Path(training_args.output_dir).absolute().name, token=training_args.hub_token
387
+ # )
388
+ # else:
389
+ # repo_name = training_args.hub_model_id
390
+ # repo = Repository(training_args.output_dir, clone_from=repo_name)
391
 
392
  # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
393
  # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
 
400
  # download the dataset.
401
  if data_args.dataset_name is not None:
402
  # Downloading and loading a dataset from the hub.
403
+ datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, use_auth_token=data_args.auth_token, cache_dir=model_args.cache_dir)
404
 
405
  if "validation" not in datasets.keys():
406
  datasets["validation"] = load_dataset(
 
408
  data_args.dataset_config_name,
409
  split=f"train[:{data_args.validation_split_percentage}%]",
410
  cache_dir=model_args.cache_dir,
411
+ use_auth_token=data_args.auth_token,
412
  )
413
  datasets["train"] = load_dataset(
414
  data_args.dataset_name,
415
  data_args.dataset_config_name,
416
  split=f"train[{data_args.validation_split_percentage}%:]",
417
  cache_dir=model_args.cache_dir,
418
+ use_auth_token=data_args.auth_token,
419
  )
420
  else:
421
  data_files = {}