root committed on
Commit
b4e31f5
1 Parent(s): f307d2f

update scripts

Browse files
Files changed (2) hide show
  1. run_mlm.py +556 -0
  2. xla_spawn.py +85 -0
run_mlm.py ADDED
@@ -0,0 +1,556 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2020 The HuggingFace Team All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) on a text file or a dataset.
18
+
19
+ Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
20
+ https://huggingface.co/models?filter=masked-lm
21
+ """
22
+ # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
23
+
24
+ import logging
25
+ import math
26
+ import os
27
+ import sys
28
+ from dataclasses import dataclass, field
29
+ from itertools import chain
30
+ from typing import Optional
31
+
32
+ import datasets
33
+ from datasets import load_dataset
34
+
35
+ import transformers
36
+ from transformers import (
37
+ CONFIG_MAPPING,
38
+ MODEL_FOR_MASKED_LM_MAPPING,
39
+ AutoConfig,
40
+ AutoModelForMaskedLM,
41
+ AutoTokenizer,
42
+ DataCollatorForLanguageModeling,
43
+ HfArgumentParser,
44
+ Trainer,
45
+ TrainingArguments,
46
+ set_seed,
47
+ )
48
+ from transformers.trainer_utils import get_last_checkpoint
49
+ from transformers.utils import check_min_version
50
+ from transformers.utils.versions import require_version
51
+
52
+
53
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risk.
54
+ check_min_version("4.13.0.dev0")
55
+
56
+ require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
57
+
58
+ logger = logging.getLogger(__name__)
59
+ MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
60
+ MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
61
+
62
+
63
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.

    Every field carries its command-line help text in `metadata["help"]`. After initialization,
    `__post_init__` rejects `config_overrides` when it is combined with `config_name` or
    `model_name_or_path`.

    Raises:
        ValueError: if `config_overrides` is set together with `config_name` or `model_name_or_path`.
    """

    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            # The trailing space keeps the two sentences apart once the literals are concatenated.
            "help": "The model checkpoint for weights initialization. "
            "Don't set if you want to train a model from scratch."
        },
    )
    model_type: Optional[str] = field(
        default=None,
        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
    )
    config_overrides: Optional[str] = field(
        default=None,
        metadata={
            "help": "Override some existing default config settings when a model is trained from scratch. Example: "
            "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
        },
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )

    def __post_init__(self):
        """Reject `config_overrides` when a config name or a model checkpoint is also given."""
        if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
            raise ValueError(
                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
            )
118
+
119
+
120
@dataclass
class DataTrainingArguments:
    """
    Options describing the data fed to the model for training and evaluation.

    Each field stores its command-line help text in `metadata["help"]`. Once the instance is built,
    `__post_init__` checks that a data source was given and that any file path ends in a supported
    extension.

    Raises:
        ValueError: if no dataset name and no train/validation file is provided, or if a provided file
            does not end in `csv`, `json` or `txt`.
    """

    dataset_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the dataset to use (via the datasets library)."},
    )
    dataset_config_name: Optional[str] = field(
        default=None,
        metadata={"help": "The configuration name of the dataset to use (via the datasets library)."},
    )
    train_file: Optional[str] = field(
        default=None,
        metadata={"help": "The input training data file (a text file)."},
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets"},
    )
    validation_split_percentage: Optional[int] = field(
        default=5,
        metadata={
            "help": "The percentage of the train set used as validation set in case there's no validation split"
        },
    )
    max_seq_length: Optional[int] = field(
        default=None,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated."
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    mlm_probability: float = field(
        default=0.15,
        metadata={"help": "Ratio of tokens to mask for masked language modeling loss"},
    )
    line_by_line: bool = field(
        default=False,
        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )

    def __post_init__(self):
        """Validate that a data source exists and that given files use a supported extension."""
        nothing_given = self.dataset_name is None and self.train_file is None and self.validation_file is None
        if nothing_given:
            raise ValueError("Need either a dataset name or a training/validation file.")

        supported = ["csv", "json", "txt"]
        # The training file is checked before the validation file, so its error wins when both are wrong.
        candidates = (
            (self.train_file, "`train_file` should be a csv, a json or a txt file."),
            (self.validation_file, "`validation_file` should be a csv, a json or a txt file."),
        )
        for path, complaint in candidates:
            if path is None:
                continue
            # The extension is whatever follows the last dot of the path.
            if path.split(".")[-1] not in supported:
                raise ValueError(complaint)
198
+
199
+
200
def main():
    """
    Run masked language modeling training and/or evaluation end to end.

    Steps, in order: parse `ModelArguments`, `DataTrainingArguments` and `TrainingArguments` (from a single
    JSON file or from the command line), configure logging, detect a resumable checkpoint in the output
    directory, seed the run, load the raw datasets, load the config/tokenizer/model, tokenize (line by line
    or concatenated into chunks of `max_seq_length`), build the `Trainer`, then train and/or evaluate
    according to `training_args.do_train` / `training_args.do_eval`, and finally push to the hub or create
    a model card.

    Raises:
        ValueError: if the output directory exists, is not empty and holds no checkpoint while training
            without `--overwrite_output_dir`; if neither a tokenizer name nor a model name is given; if
            `--do_train` / `--do_eval` is requested without the matching dataset split.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary. The ", " at the end of the first fragment keeps the
    # `n_gpu` value from running into "distributed training" once the two fragments are joined.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this
    # behavior (see below)
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
        )
        if "validation" not in raw_datasets.keys():
            # No validation split: carve the first `validation_split_percentage` percent out of train.
            raw_datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
            )
            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        if extension == "txt":
            # Plain text files are loaded with the "text" builder name.
            extension = "text"
        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)

        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
            )
            raw_datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
            )

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")
        if model_args.config_overrides is not None:
            logger.info(f"Overriding config: {model_args.config_overrides}")
            config.update_from_string(model_args.config_overrides)
            logger.info(f"New config: {config}")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    else:
        column_names = raw_datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            examples[text_column_name] = [
                line for line in examples[text_column_name] if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples[text_column_name],
                padding=padding,
                truncation=True,
                max_length=max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

        with training_args.main_process_first(desc="dataset map tokenization"):
            tokenized_datasets = raw_datasets.map(
                tokenize_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=[text_column_name],
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on dataset line_by_line",
            )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

        with training_args.main_process_first(desc="dataset map tokenization"):
            tokenized_datasets = raw_datasets.map(
                tokenize_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on every text in dataset",
            )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            if total_length >= max_seq_length:
                total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

        with training_args.main_process_first(desc="grouping texts together"):
            tokenized_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
                desc=f"Grouping texts in chunks of {max_seq_length}",
            )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = tokenized_datasets["train"]
        if data_args.max_train_samples is not None:
            # Clamp to the dataset size so select() is never asked for indices past the end.
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = tokenized_datasets["validation"]
        if data_args.max_eval_samples is not None:
            # Same clamping as for the training set.
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))

    # Data collator
    # This one will take care of randomly masking the tokens.
    pad_to_multiple_of_8 = data_args.line_by_line and training_args.fp16 and not data_args.pad_to_max_length
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm_probability=data_args.mlm_probability,
        pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # An explicit --resume_from_checkpoint takes precedence over a checkpoint found in the output dir.
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            # math.exp overflowed on a very large loss: report an infinite perplexity.
            perplexity = float("inf")
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Metadata forwarded to push_to_hub / create_model_card.
    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"}
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset_args"] = data_args.dataset_config_name
            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
548
+
549
+
550
def _mp_fn(index):
    """
    Per-process entry point for xla_spawn (TPUs).

    Args:
        index: Value supplied by the spawner for each process; it is not used here.
    """
    main()
553
+
554
+
555
+ if __name__ == "__main__":
556
+ main()
xla_spawn.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ A simple launcher script for TPU training
16
+
17
+ Inspired by https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py
18
+
19
+ ::
20
+ >>> python xla_spawn.py --num_cores=NUM_CORES_YOU_HAVE
21
+ YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
22
+ arguments of your training script)
23
+
24
+ """
25
+
26
+
27
+ import importlib
28
+ import sys
29
+ from argparse import REMAINDER, ArgumentParser
30
+ from pathlib import Path
31
+
32
+ import torch_xla.distributed.xla_multiprocessing as xmp
33
+
34
+
35
def parse_args(argv=None):
    """
    Parse the command line options of the TPU launcher.

    Args:
        argv: Optional list of argument strings to parse. When `None` (the default), argparse reads the
            process command line, which keeps existing `parse_args()` callers working unchanged.

    Returns:
        The parsed `argparse.Namespace` (not the parser itself) with three attributes: `num_cores` (int,
        default 1), `training_script` (str) and `training_script_args` (list of the remaining arguments,
        left for the training script to parse).
    """
    parser = ArgumentParser(
        description=(
            "PyTorch TPU distributed training launch "
            "helper utility that will spawn up "
            "multiple distributed processes"
        )
    )

    # Optional arguments for the launch helper
    parser.add_argument("--num_cores", type=int, default=1, help="Number of TPU cores to use (1 or 8).")

    # positional
    parser.add_argument(
        "training_script",
        type=str,
        help=(
            "The full path to the single TPU training "
            "program/script to be launched in parallel, "
            "followed by all the arguments for the "
            "training script"
        ),
    )

    # rest from the training program: REMAINDER collects everything after the script path,
    # including tokens that look like options.
    parser.add_argument("training_script_args", nargs=REMAINDER)

    return parser.parse_args(argv)
67
+
68
+
69
def main():
    """
    Launch a training script on TPU cores.

    Imports the training script as a module, rewrites `sys.argv` so the script sees its own arguments
    followed by `--tpu_num_cores <num_cores>`, then spawns the module's `_mp_fn` through
    `xmp.spawn` with `nprocs` set to the requested number of cores.
    """
    launch_args = parse_args()

    # Make the training script importable: append its directory to the module search path and
    # import it under its file stem.
    script_path = Path(launch_args.training_script)
    script_dir = str(script_path.parent.resolve())
    sys.path.append(script_dir)
    training_module = importlib.import_module(script_path.stem)

    # Patch sys.argv so the training script parses its own options plus the core count.
    sys.argv = [
        launch_args.training_script,
        *launch_args.training_script_args,
        "--tpu_num_cores",
        str(launch_args.num_cores),
    ]

    xmp.spawn(training_module._mp_fn, args=(), nprocs=launch_args.num_cores)
82
+
83
+
84
+ if __name__ == "__main__":
85
+ main()