first submit
- run_128.sh +1 -0
- run_mlm_flax.py +15 -9
run_128.sh
CHANGED
```diff
@@ -19,6 +19,7 @@ python run_mlm_flax.py \
     --logging_steps="1000" \
     --save_steps="1000" \
     --eval_steps="1000" \
+    --auth_token="True" \
     --do_train \
     --do_eval \
     --dtype="bfloat16" \
```
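For context, the new `--auth_token="True"` flag reaches the Python side through transformers' `HfArgumentParser`, which builds a CLI option out of every dataclass field. The sketch below is an assumed, minimal reproduction of that wiring, not part of the commit itself:

```python
# Assumed sketch: HfArgumentParser turns --auth_token="True" into a bool
# on the dataclass; bool fields accept string values like "True"/"False".
from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class DataTrainingArguments:
    auth_token: bool = field(
        default=False, metadata={"help": "Use authorisation token"}
    )


parser = HfArgumentParser(DataTrainingArguments)
(data_args,) = parser.parse_args_into_dataclasses(args=["--auth_token", "True"])
print(data_args.auth_token)  # True
```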
run_mlm_flax.py
CHANGED
```diff
@@ -224,6 +224,10 @@ class DataTrainingArguments:
         default=False,
         metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
     )
+
+    auth_token: bool = field(
+        default=False, metadata={"help": "Use authorisation token"}
+    )
 
     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
```
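A note on what `auth_token=True` actually supplies: `use_auth_token=True` in datasets falls back to the token cached by `huggingface-cli login`. A hedged sketch for failing fast when no token is cached (`HfFolder` comes from huggingface_hub, assumed installed alongside datasets; this check is a convenience, not part of the commit):

```python
# Assumed convenience check: use_auth_token=True reads the token cached by
# `huggingface-cli login`, so verify one exists before a long training run.
from huggingface_hub import HfFolder

if HfFolder.get_token() is None:
    raise RuntimeError(
        "--auth_token=True requires a cached token; run `huggingface-cli login` first."
    )
```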
```diff
@@ -376,14 +380,14 @@ def main():
     set_seed(training_args.seed)
 
     # Handle the repository creation
-    if training_args.push_to_hub:
-        if training_args.hub_model_id is None:
-            repo_name = get_full_repo_name(
-                Path(training_args.output_dir).absolute().name, token=training_args.hub_token
-            )
-        else:
-            repo_name = training_args.hub_model_id
-        repo = Repository(training_args.output_dir, clone_from=repo_name)
+    # if training_args.push_to_hub:
+    #     if training_args.hub_model_id is None:
+    #         repo_name = get_full_repo_name(
+    #             Path(training_args.output_dir).absolute().name, token=training_args.hub_token
+    #         )
+    #     else:
+    #         repo_name = training_args.hub_model_id
+    #     repo = Repository(training_args.output_dir, clone_from=repo_name)
 
     # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
     # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
```
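With the repository creation commented out, the script no longer clones a Hub repo into `output_dir`, so presumably this run is not using `--push_to_hub` (a later `repo.push_to_hub()` call would hit an undefined `repo` if it were). For reference, a standalone sketch of what the disabled block did, using the same huggingface_hub helpers the script imports; the directory name here is an illustrative stand-in:

```python
# Sketch of the disabled block: resolve the full repo name from the output
# directory and clone it locally so checkpoints can later be pushed.
from pathlib import Path

from huggingface_hub import Repository, get_full_repo_name

output_dir = "./my-mlm-model"  # assumed output_dir, for illustration only
repo_name = get_full_repo_name(Path(output_dir).absolute().name)
repo = Repository(output_dir, clone_from=repo_name)
```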
```diff
@@ -396,7 +400,7 @@ def main():
     # download the dataset.
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, use_auth_token=data_args.auth_token, cache_dir=model_args.cache_dir)
 
         if "validation" not in datasets.keys():
             datasets["validation"] = load_dataset(
```
```diff
@@ -404,12 +408,14 @@ def main():
                 data_args.dataset_config_name,
                 split=f"train[:{data_args.validation_split_percentage}%]",
                 cache_dir=model_args.cache_dir,
+                use_auth_token=data_args.auth_token,
             )
             datasets["train"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[{data_args.validation_split_percentage}%:]",
                 cache_dir=model_args.cache_dir,
+                use_auth_token=data_args.auth_token,
             )
         else:
             data_files = {}
```
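Taken together, these two hunks thread the token through every `load_dataset` call, including the ones that carve a validation split out of `train` when the dataset ships without one. A hedged, self-contained sketch of that flow; the dataset id and split percentage are assumed stand-ins for `data_args` values:

```python
# Assumed stand-ins: "user/private-corpus" is a hypothetical gated dataset id,
# pct mirrors data_args.validation_split_percentage.
from datasets import load_dataset

pct = 5
datasets = load_dataset("user/private-corpus", use_auth_token=True)
if "validation" not in datasets.keys():
    # No validation split shipped: slice one off the front of train.
    datasets["validation"] = load_dataset(
        "user/private-corpus",
        split=f"train[:{pct}%]",
        use_auth_token=True,
    )
    datasets["train"] = load_dataset(
        "user/private-corpus",
        split=f"train[{pct}%:]",
        use_auth_token=True,
    )
```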