pere committed on
Commit 22af785
1 Parent(s): 535adbf

updated training script with auth_token

events.out.tfevents.1629051212.t1v-n-358ff5d1-w-0.196309.3.v2 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a472266ff2fda9a2b30fa12a34c12eaf6276efefac5a60cb147cf96b632ebab1
-size 74472080
+oid sha256:3b2349e130dcf873a6c1b6a72c7905545522989ef1325b13371a5f71d077ff8c
+size 77452220
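
The change above is only a Git LFS pointer update: the large TensorBoard event file itself lives in LFS storage, and the pointer records its SHA-256 digest and byte size. As a minimal sketch (not part of the commit), a downloaded copy of the file could be checked against the new pointer values like this, assuming the file sits in the working directory:

import hashlib
from pathlib import Path

# Values copied from the new LFS pointer above.
EXPECTED_OID = "3b2349e130dcf873a6c1b6a72c7905545522989ef1325b13371a5f71d077ff8c"
EXPECTED_SIZE = 77452220

def matches_lfs_pointer(path: str) -> bool:
    """Return True if the local file has the pointer's size and sha256 digest."""
    data = Path(path).read_bytes()
    return len(data) == EXPECTED_SIZE and hashlib.sha256(data).hexdigest() == EXPECTED_OID

print(matches_lfs_pointer("events.out.tfevents.1629051212.t1v-n-358ff5d1-w-0.196309.3.v2"))
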
run_mlm_flax_stream.py CHANGED
@@ -129,6 +129,9 @@ class DataTrainingArguments:
         default=None,
         metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
     )
+    auth_token: bool = field(
+        default=False, metadata={"help": "Use authorisation token"}
+    )
     overwrite_cache: bool = field(
         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
     )
@@ -361,6 +364,7 @@ if __name__ == "__main__":
     #
     # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
     # 'text' is found. You can easily tweak this behavior (see below).
+
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
         dataset = load_dataset(
@@ -368,6 +372,7 @@ if __name__ == "__main__":
             data_args.dataset_config_name,
             cache_dir=model_args.cache_dir,
             streaming=True,
+            use_auth_token=data_args.auth_token,
             split="train",
         )
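
Taken together, the Python changes add an auth_token flag to DataTrainingArguments and forward it to datasets.load_dataset as use_auth_token, so a gated or private Hub dataset can be streamed. Below is a condensed sketch (not the full training script) of how the flag travels from the command line to the dataset call, assuming transformers and datasets are installed and a token has already been cached with huggingface-cli login; the dataclass is trimmed to the relevant fields:

from dataclasses import dataclass, field
from typing import Optional

from datasets import load_dataset
from transformers import HfArgumentParser


@dataclass
class DataTrainingArguments:
    dataset_name: Optional[str] = field(default=None, metadata={"help": "Dataset on the Hub"})
    dataset_config_name: Optional[str] = field(default=None, metadata={"help": "Dataset config name"})
    auth_token: bool = field(default=False, metadata={"help": "Use authorisation token"})


# e.g. run with: --dataset_name NbAiLab/NCC2 --auth_token True
parser = HfArgumentParser(DataTrainingArguments)
(data_args,) = parser.parse_args_into_dataclasses()

dataset = load_dataset(
    data_args.dataset_name,
    data_args.dataset_config_name,
    streaming=True,
    use_auth_token=data_args.auth_token,  # cached Hub token is used when True
    split="train",
)
print(next(iter(dataset)))  # streaming: fetch a single example without downloading the full set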
 
run_recover_1350_stream.sh ADDED
@@ -0,0 +1,25 @@
+./run_mlm_flax_stream.py \
+--output_dir="./" \
+--model_type="roberta" \
+--config_name="./" \
+--tokenizer_name="./" \
+--model_name_or_path="./" \
+--dataset_name="NbAiLab/NCC2" \
+--max_seq_length="128" \
+--weight_decay="0.01" \
+--per_device_train_batch_size="128" \
+--per_device_eval_batch_size="128" \
+--learning_rate="3e-4" \
+--warmup_steps="0" \
+--overwrite_output_dir \
+--cache_dir /mnt/disks/flaxdisk/cache/ \
+--num_train_steps="1150000" \
+--adam_beta1="0.9" \
+--adam_beta2="0.98" \
+--logging_steps="10000" \
+--save_steps="100000" \
+--eval_steps="50000" \
+--preprocessing_num_workers 96 \
+--auth_token True \
+--adafactor \
+--push_to_hub
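
Because the launch command passes --auth_token True, streaming NbAiLab/NCC2 relies on the Hub token cached by huggingface-cli login. A small pre-flight check (a sketch, not part of the commit) to confirm a token is available before starting the long run, assuming huggingface_hub is installed:

from huggingface_hub import HfFolder

# `huggingface-cli login` writes the token that use_auth_token=True later reads.
if HfFolder.get_token() is None:
    raise SystemExit("No cached Hugging Face token found; run `huggingface-cli login` first.")
print("Cached Hub token found; authenticated dataset streaming should work.")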