updated training script with auth_token
events.out.tfevents.1629051212.t1v-n-358ff5d1-w-0.196309.3.v2 CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3b2349e130dcf873a6c1b6a72c7905545522989ef1325b13371a5f71d077ff8c
+size 77452220
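The file above is a Git LFS pointer: the repository tracks only the object's SHA-256 and byte size, while the TensorBoard event file itself lives in LFS storage. A minimal sketch for sanity-checking the pointer against a local copy (assuming the blob was already fetched, e.g. via `git lfs pull`):

# Minimal sketch: recompute the SHA-256 that the LFS pointer records.
import hashlib

path = "events.out.tfevents.1629051212.t1v-n-358ff5d1-w-0.196309.3.v2"
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
        h.update(chunk)

# Should print the oid from the pointer: 3b2349e1...d077ff8c
print(h.hexdigest())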
run_mlm_flax_stream.py CHANGED

@@ -129,6 +129,9 @@ class DataTrainingArguments:
         default=None,
         metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
     )
+    auth_token: bool = field(
+        default=False, metadata={"help": "Use authorisation token"}
+    )
     overwrite_cache: bool = field(
         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
     )
@@ -361,6 +364,7 @@ if __name__ == "__main__":
     #
     # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
     # 'text' is found. You can easily tweak this behavior (see below).
+
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
         dataset = load_dataset(
@@ -368,6 +372,7 @@ if __name__ == "__main__":
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             streaming=True,
+            use_auth_token=data_args.auth_token,
             split="train",
         )
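Taken together, the hunks wire a new `--auth_token` CLI flag into the streaming dataset load. A minimal sketch of the resulting behavior, not the full training script (field and argument names are taken from the diff; the dataset name comes from the run script below):

# Minimal sketch of what the change does.
from dataclasses import dataclass, field
from datasets import load_dataset

@dataclass
class DataTrainingArguments:
    # New field from the diff: toggles authenticated Hub access.
    auth_token: bool = field(
        default=False, metadata={"help": "Use authorisation token"}
    )

data_args = DataTrainingArguments(auth_token=True)

# With use_auth_token=True, `datasets` sends the token cached by
# `huggingface-cli login`, which is needed for private or gated datasets.
dataset = load_dataset(
    "NbAiLab/NCC2",
    streaming=True,
    use_auth_token=data_args.auth_token,
    split="train",
)

Because the field defaults to False, public-dataset runs behave exactly as before; only runs that pass the flag attach a token.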
run_recover_1350_stream.sh ADDED

@@ -0,0 +1,25 @@
+./run_mlm_flax_stream.py \
+    --output_dir="./" \
+    --model_type="roberta" \
+    --config_name="./" \
+    --tokenizer_name="./" \
+    --model_name_or_path="./" \
+    --dataset_name="NbAiLab/NCC2" \
+    --max_seq_length="128" \
+    --weight_decay="0.01" \
+    --per_device_train_batch_size="128" \
+    --per_device_eval_batch_size="128" \
+    --learning_rate="3e-4" \
+    --warmup_steps="0" \
+    --overwrite_output_dir \
+    --cache_dir /mnt/disks/flaxdisk/cache/ \
+    --num_train_steps="1150000" \
+    --adam_beta1="0.9" \
+    --adam_beta2="0.98" \
+    --logging_steps="10000" \
+    --save_steps="100000" \
+    --eval_steps="50000" \
+    --preprocessing_num_workers 96 \
+    --auth_token True \
+    --adafactor \
+    --push_to_hub
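Since the run passes `--auth_token True` and `--push_to_hub`, a Hugging Face token has to be cached on the machine before launching. A sketch of that prerequisite (the `login()` helper assumes a reasonably recent `huggingface_hub`; `huggingface-cli login` is the CLI equivalent):

# Prerequisite sketch: cache a Hub token once before running the script.
from huggingface_hub import login

login()  # prompts for a token and stores it locally for later API calls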