versae committed on
Commit
c4599cd
1 Parent(s): 14b1ca9

Add eval scripts

evaluation/paws.yaml ADDED
@@ -0,0 +1,55 @@
+ name: BERTIN PAWS-X es
+ project: bertin-eval
+ entity: versae
+ program: run_glue.py
+ command:
+   - ${env}
+   - ${interpreter}
+   - ${program}
+   - ${args}
+ method: grid
+ metric:
+   name: eval/accuracy
+   goal: maximize
+ parameters:
+   model_name_or_path:
+     values:
+       - bertin-project/bertin-base-gaussian-exp-512seqlen
+       - bertin-project/bertin-base-random-exp-512seqlen
+       - bertin-project/bertin-base-gaussian
+       - bertin-project/bertin-base-stepwise
+       - bertin-project/bertin-base-random
+       - bertin-project/bertin-roberta-base-spanish
+       - flax-community/bertin-roberta-large-spanish
+       - BSC-TeMU/roberta-base-bne
+       - dccuchile/bert-base-spanish-wwm-cased
+       - bert-base-multilingual-cased
+   num_train_epochs:
+     values: [5]
+   task_name:
+     value: paws-x
+   dataset_name:
+     value: paws-x
+   dataset_config_name:
+     value: es
+   output_dir:
+     value: ./outputs
+   overwrite_output_dir:
+     value: true
+   resume_from_checkpoint:
+     value: false
+   max_seq_length:
+     value: 512
+   pad_to_max_length:
+     value: true
+   per_device_train_batch_size:
+     value: 16
+   per_device_eval_batch_size:
+     value: 16
+   save_total_limit:
+     value: 1
+   do_train:
+     value: true
+   do_eval:
+     value: true
+
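Note: this config (like evaluation/xnli.yaml below) follows the wandb sweep schema: `method: grid` crosses every `parameters` entry, and each grid point is handed to `run_glue.py` as `--key value` flags through the `command` template. A minimal sketch of registering such a sweep from Python, assuming `wandb` and `pyyaml` are installed and you are logged in (the sweep id in the comments is hypothetical):

import yaml
import wandb

# Load the sweep definition added in this commit (path relative to the repo root).
with open("evaluation/paws.yaml") as f:
    sweep_config = yaml.safe_load(f)

# Register the sweep; wandb returns a sweep id such as "abc123de".
sweep_id = wandb.sweep(sweep_config, entity="versae", project="bertin-eval")

# Grid points are then consumed by agents, each of which shells out to
# `python run_glue.py --<parameter> <value> ...` as described by `command:`,
# e.g. from a terminal: wandb agent versae/bertin-eval/<sweep_id>
print(sweep_id)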
evaluation/run_glue.py ADDED
@@ -0,0 +1,576 @@
+ #!/usr/bin/env python
+ # coding=utf-8
+ # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ Finetuning the library models for sequence classification on GLUE."""
+ # You can also adapt this script on your own text classification task. Pointers for this are left as comments.
+
+ import logging
+ import os
+ import random
+ import sys
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Optional
+
+ import datasets
+ import numpy as np
+ from datasets import load_dataset, load_metric
+
+ import transformers
+ from transformers import (
+     AutoConfig,
+     AutoModelForSequenceClassification,
+     AutoTokenizer,
+     DataCollatorWithPadding,
+     EvalPrediction,
+     HfArgumentParser,
+     PretrainedConfig,
+     Trainer,
+     TrainingArguments,
+     default_data_collator,
+     set_seed,
+ )
+ from transformers.trainer_utils import get_last_checkpoint
+ from transformers.utils import check_min_version
+ from transformers.utils.versions import require_version
+
+
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+ check_min_version("4.9.0.dev0")
+
+ require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
+
+ task_to_keys = {
+     "cola": ("sentence", None),
+     "mnli": ("premise", "hypothesis"),
+     "xnli": ("premise", "hypothesis"),
+     "mrpc": ("sentence1", "sentence2"),
+     "qnli": ("question", "sentence"),
+     "qqp": ("question1", "question2"),
+     "rte": ("sentence1", "sentence2"),
+     "sst2": ("sentence", None),
+     "stsb": ("sentence1", "sentence2"),
+     "wnli": ("sentence1", "sentence2"),
+     "paws-x": ("sentence1", "sentence2"),
+ }
+ task_to_metrics = {
+     "paws-x": "accuracy",
+     "xnli": "accuracy",
+ }
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class DataTrainingArguments:
+     """
+     Arguments pertaining to what data we are going to input our model for training and eval.
+
+     Using `HfArgumentParser` we can turn this class
+     into argparse arguments to be able to specify them on
+     the command line.
+     """
+
+     task_name: Optional[str] = field(
+         default=None,
+         metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())},
+     )
+     dataset_name: Optional[str] = field(
+         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+     )
+     dataset_config_name: Optional[str] = field(
+         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+     )
+     max_seq_length: int = field(
+         default=128,
+         metadata={
+             "help": "The maximum total input sequence length after tokenization. Sequences longer "
+             "than this will be truncated, sequences shorter will be padded."
+         },
+     )
+     overwrite_cache: bool = field(
+         default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
+     )
+     pad_to_max_length: bool = field(
+         default=True,
+         metadata={
+             "help": "Whether to pad all samples to `max_seq_length`. "
+             "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+         },
+     )
+     max_train_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+             "value if set."
+         },
+     )
+     max_eval_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+             "value if set."
+         },
+     )
+     max_predict_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
+             "value if set."
+         },
+     )
+     train_file: Optional[str] = field(
+         default=None, metadata={"help": "A csv or a json file containing the training data."}
+     )
+     validation_file: Optional[str] = field(
+         default=None, metadata={"help": "A csv or a json file containing the validation data."}
+     )
+     test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."})
+
+     def __post_init__(self):
+         if self.task_name is not None:
+             self.task_name = self.task_name.lower()
+             if self.task_name not in task_to_keys.keys():
+                 raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys()))
+         elif self.dataset_name is not None:
+             pass
+         elif self.train_file is None or self.validation_file is None:
+             raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.")
+         else:
+             train_extension = self.train_file.split(".")[-1]
+             assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+             validation_extension = self.validation_file.split(".")[-1]
+             assert (
+                 validation_extension == train_extension
+             ), "`validation_file` should have the same extension (csv or json) as `train_file`."
+
+
+ @dataclass
+ class ModelArguments:
+     """
+     Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+     """
+
+     model_name_or_path: str = field(
+         metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+     )
+     config_name: Optional[str] = field(
+         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+     )
+     tokenizer_name: Optional[str] = field(
+         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+     )
+     cache_dir: Optional[str] = field(
+         default=None,
+         metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+     )
+     use_fast_tokenizer: bool = field(
+         default=True,
+         metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."},
+     )
+     model_revision: str = field(
+         default="main",
+         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+     )
+     use_auth_token: bool = field(
+         default=False,
+         metadata={
+             "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+             "with private models)."
+         },
+     )
+
+
+ def main():
+     # See all possible arguments in src/transformers/training_args.py
+     # or by passing the --help flag to this script.
+     # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+         # If we pass only one argument to the script and it's the path to a json file,
+         # let's parse it to get our arguments.
+         model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+     else:
+         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+     # Setup logging
+     logging.basicConfig(
+         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+         datefmt="%m/%d/%Y %H:%M:%S",
+         handlers=[logging.StreamHandler(sys.stdout)],
+     )
+
+     log_level = training_args.get_process_log_level()
+     logger.setLevel(log_level)
+     datasets.utils.logging.set_verbosity(log_level)
+     transformers.utils.logging.set_verbosity(log_level)
+     transformers.utils.logging.enable_default_handler()
+     transformers.utils.logging.enable_explicit_format()
+
+     # Log on each process the small summary:
+     logger.warning(
+         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+     )
+     logger.info(f"Training/evaluation parameters {training_args}")
+
+     # Detecting last checkpoint.
+     last_checkpoint = None
+     run_name = f"{model_args.model_name_or_path}-{np.random.randint(1000):04d}"
+     training_args.output_dir = str(Path(training_args.output_dir) / run_name)
+     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+         last_checkpoint = get_last_checkpoint(training_args.output_dir)
+         if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+             raise ValueError(
+                 f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                 "Use --overwrite_output_dir to overcome."
+             )
+         elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+             logger.info(
+                 f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                 "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+             )
+
+     # Set seed before initializing model.
+     set_seed(training_args.seed)
+
+     # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
+     # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
+     #
+     # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the
+     # sentences in columns called 'sentence1' and 'sentence2' if such columns exist or the first two columns not named
+     # label if at least two columns are provided.
+     #
+     # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
+     # single column. You can easily tweak this behavior (see below).
+     #
+     # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+     # download the dataset.
+     if data_args.dataset_name is not None:
+         # Downloading and loading a dataset from the hub.
+         raw_datasets = load_dataset(
+             data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+         )
+     elif data_args.task_name is not None:
+         # Downloading and loading a dataset from the hub.
+         raw_datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir)
+     else:
+         # Loading a dataset from your local files.
+         # CSV/JSON training and evaluation files are needed.
+         data_files = {"train": data_args.train_file, "validation": data_args.validation_file}
+
+         # Get the test dataset: you can provide your own CSV/JSON test file (see below)
+         # when you use `do_predict` without specifying a GLUE benchmark task.
+         if training_args.do_predict:
+             if data_args.test_file is not None:
+                 train_extension = data_args.train_file.split(".")[-1]
+                 test_extension = data_args.test_file.split(".")[-1]
+                 assert (
+                     test_extension == train_extension
+                 ), "`test_file` should have the same extension (csv or json) as `train_file`."
+                 data_files["test"] = data_args.test_file
+             else:
+                 raise ValueError("Need either a GLUE task or a test file for `do_predict`.")
+
+         for key in data_files.keys():
+             logger.info(f"load a local file for {key}: {data_files[key]}")
+
+         if data_args.train_file.endswith(".csv"):
+             # Loading a dataset from local csv files
+             raw_datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir)
+         else:
+             # Loading a dataset from local json files
+             raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
+     # See more about loading any type of standard or custom dataset at
+     # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+     # Labels
+     if data_args.task_name is not None:
+         is_regression = data_args.task_name == "stsb"
+         if not is_regression:
+             label_list = raw_datasets["train"].features["label"].names
+             num_labels = len(label_list)
+         else:
+             num_labels = 1
+     else:
+         # Trying to have good defaults here, don't hesitate to tweak to your needs.
+         is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"]
+         if is_regression:
+             num_labels = 1
+         else:
+             # A useful fast method:
+             # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
+             label_list = raw_datasets["train"].unique("label")
+             label_list.sort()  # Let's sort it for determinism
+             num_labels = len(label_list)
+
+     # Load pretrained model and tokenizer
+     #
+     # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
+     # download model & vocab.
+     config = AutoConfig.from_pretrained(
+         model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+         num_labels=num_labels,
+         finetuning_task=data_args.task_name,
+         cache_dir=model_args.cache_dir,
+         revision=model_args.model_revision,
+         use_auth_token=True if model_args.use_auth_token else None,
+     )
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+         cache_dir=model_args.cache_dir,
+         use_fast=model_args.use_fast_tokenizer,
+         revision=model_args.model_revision,
+         use_auth_token=True if model_args.use_auth_token else None,
+     )
+     model = AutoModelForSequenceClassification.from_pretrained(
+         model_args.model_name_or_path,
+         from_tf=bool(".ckpt" in model_args.model_name_or_path),
+         config=config,
+         cache_dir=model_args.cache_dir,
+         revision=model_args.model_revision,
+         use_auth_token=True if model_args.use_auth_token else None,
+     )
+     tokenizer.model_max_length = 512
+
+     # Preprocessing the raw_datasets
+     if data_args.task_name is not None:
+         sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
+     else:
+         # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
+         non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"]
+         if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
+             sentence1_key, sentence2_key = "sentence1", "sentence2"
+         else:
+             if len(non_label_column_names) >= 2:
+                 sentence1_key, sentence2_key = non_label_column_names[:2]
+             else:
+                 sentence1_key, sentence2_key = non_label_column_names[0], None
+
+     # Padding strategy
+     if data_args.pad_to_max_length:
+         padding = "max_length"
+     else:
+         # We will pad later, dynamically at batch creation, to the max sequence length in each batch
+         padding = False
+
+     # Some models have set the order of the labels to use, so let's make sure we do use it.
+     label_to_id = None
+     if (
+         model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id
+         and data_args.task_name is not None
+         and not is_regression
+     ):
+         # Some have all caps in their config, some don't.
+         label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
+         if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
+             label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
+         else:
+             logger.warning(
+                 "Your model seems to have been trained with labels, but they don't match the dataset: "
+                 f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
+                 "\nIgnoring the model labels as a result."
+             )
+     elif data_args.task_name is None and not is_regression:
+         label_to_id = {v: i for i, v in enumerate(label_list)}
+
+     if label_to_id is not None:
+         model.config.label2id = label_to_id
+         model.config.id2label = {id: label for label, id in config.label2id.items()}
+
+     if data_args.max_seq_length > tokenizer.model_max_length:
+         logger.warning(
+             f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
+             f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+         )
+     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+     def preprocess_function(examples):
+         # Tokenize the texts
+         args = (
+             (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
+         )
+         result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)
+
+         # Map labels to IDs (not necessary for GLUE tasks)
+         if label_to_id is not None and "label" in examples:
+             result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
+         return result
+
+     with training_args.main_process_first(desc="dataset map pre-processing"):
+         raw_datasets = raw_datasets.map(
+             preprocess_function,
+             batched=True,
+             load_from_cache_file=not data_args.overwrite_cache,
+             desc="Running tokenizer on dataset",
+         )
+     if training_args.do_train:
+         if "train" not in raw_datasets:
+             raise ValueError("--do_train requires a train dataset")
+         train_dataset = raw_datasets["train"]
+         if data_args.max_train_samples is not None:
+             train_dataset = train_dataset.select(range(data_args.max_train_samples))
+
+     if training_args.do_eval:
+         if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
+             raise ValueError("--do_eval requires a validation dataset")
+         eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
+         if data_args.max_eval_samples is not None:
+             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+
+     if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
+         if "test" not in raw_datasets and "test_matched" not in raw_datasets:
+             raise ValueError("--do_predict requires a test dataset")
+         predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"]
+         if data_args.max_predict_samples is not None:
+             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+
+     # Log a few random samples from the training set:
+     if training_args.do_train:
+         for index in random.sample(range(len(train_dataset)), 3):
+             logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+
+     # Get the metric function
+     if data_args.task_name in task_to_metrics:
+         metric = load_metric(task_to_metrics[data_args.task_name])
+     elif data_args.task_name is not None:
+         metric = load_metric("glue", data_args.task_name)
+     else:
+         metric = load_metric("accuracy")
+
+     # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
+     # predictions and label_ids field) and has to return a dictionary string to float.
+     def compute_metrics(p: EvalPrediction):
+         preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
+         preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
+         if data_args.task_name is not None:
+             result = metric.compute(predictions=preds, references=p.label_ids)
+             if len(result) > 1:
+                 result["combined_score"] = np.mean(list(result.values())).item()
+             return result
+         elif is_regression:
+             return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
+         else:
+             return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
+
+     # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
+     if data_args.pad_to_max_length:
+         data_collator = default_data_collator
+     elif training_args.fp16:
+         data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
+     else:
+         data_collator = None
+
+     training_args.run_name = run_name
+     # Initialize our Trainer
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_dataset if training_args.do_train else None,
+         eval_dataset=eval_dataset if training_args.do_eval else None,
+         compute_metrics=compute_metrics,
+         tokenizer=tokenizer,
+         data_collator=data_collator,
+     )
+
+     # Training
+     if training_args.do_train:
+         checkpoint = None
+         if training_args.resume_from_checkpoint is not None:
+             checkpoint = training_args.resume_from_checkpoint
+         elif last_checkpoint is not None:
+             checkpoint = last_checkpoint
+         train_result = trainer.train(resume_from_checkpoint=checkpoint)
+         metrics = train_result.metrics
+         max_train_samples = (
+             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
+         )
+         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+
+         trainer.save_model()  # Saves the tokenizer too for easy upload
+
+         trainer.log_metrics("train", metrics)
+         trainer.save_metrics("train", metrics)
+         trainer.save_state()
+
+     # Evaluation
+     if training_args.do_eval:
+         logger.info("*** Evaluate ***")
+
+         # Loop to handle MNLI double evaluation (matched, mis-matched)
+         tasks = [data_args.task_name]
+         eval_datasets = [eval_dataset]
+         if data_args.task_name == "mnli":
+             tasks.append("mnli-mm")
+             eval_datasets.append(raw_datasets["validation_mismatched"])
+
+         for eval_dataset, task in zip(eval_datasets, tasks):
+             metrics = trainer.evaluate(eval_dataset=eval_dataset)
+
+             max_eval_samples = (
+                 data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+             )
+             metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+
+             trainer.log_metrics("eval", metrics)
+             trainer.save_metrics("eval", metrics)
+
+     if training_args.do_predict:
+         logger.info("*** Predict ***")
+
+         # Loop to handle MNLI double evaluation (matched, mis-matched)
+         tasks = [data_args.task_name]
+         predict_datasets = [predict_dataset]
+         if data_args.task_name == "mnli":
+             tasks.append("mnli-mm")
+             predict_datasets.append(raw_datasets["test_mismatched"])
+
+         for predict_dataset, task in zip(predict_datasets, tasks):
+             # Removing the `label` column because it contains -1 and Trainer won't like that.
+             predict_dataset = predict_dataset.remove_columns("label")
+             predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
+             predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
+
+             output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt")
+             if trainer.is_world_process_zero():
+                 with open(output_predict_file, "w") as writer:
+                     logger.info(f"***** Predict results {task} *****")
+                     writer.write("index\tprediction\n")
+                     for index, item in enumerate(predictions):
+                         if is_regression:
+                             writer.write(f"{index}\t{item:3.3f}\n")
+                         else:
+                             item = label_list[item]
+                             writer.write(f"{index}\t{item}\n")
+
+     if training_args.push_to_hub:
+         kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"}
+         if data_args.task_name is not None:
+             kwargs["language"] = "en"
+             kwargs["dataset_tags"] = "glue"
+             kwargs["dataset_args"] = data_args.task_name
+             kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}"
+
+         trainer.push_to_hub(**kwargs)
+
+
+ def _mp_fn(index):
+     # For xla_spawn (TPUs)
+     main()
+
+
+ if __name__ == "__main__":
+     main()
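run_glue.py receives the sweep's flags through `HfArgumentParser`, which turns the three dataclasses above into argparse arguments, so each `parameters` key in the YAML maps one-to-one onto a flag. A hypothetical smoke test of that argument flow (it assumes run_glue.py is importable from the current directory, which this commit does not set up):

from transformers import HfArgumentParser, TrainingArguments
from run_glue import DataTrainingArguments, ModelArguments  # hypothetical import of this script

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses(args=[
    "--model_name_or_path", "bertin-project/bertin-base-gaussian",
    "--task_name", "paws-x",
    "--dataset_name", "paws-x",
    "--dataset_config_name", "es",
    "--output_dir", "./outputs",
    "--do_train", "--do_eval",
])
assert data_args.task_name == "paws-x"  # lower-cased and validated in __post_init__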
evaluation/run_ner.py ADDED
@@ -0,0 +1,562 @@
+ #!/usr/bin/env python
+ # coding=utf-8
+ # Copyright 2020 The HuggingFace Team All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Fine-tuning the library models for token classification.
+ """
+ # You can also adapt this script on your own token classification task and datasets. Pointers for this are left as
+ # comments.
+
+ import logging
+ import os
+ import sys
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Optional
+
+ import datasets
+ import numpy as np
+ from datasets import ClassLabel, load_dataset, load_metric
+
+ import transformers
+ from transformers import (
+     AutoConfig,
+     AutoModelForTokenClassification,
+     AutoTokenizer,
+     DataCollatorForTokenClassification,
+     HfArgumentParser,
+     PreTrainedTokenizerFast,
+     Trainer,
+     TrainingArguments,
+     set_seed,
+ )
+ from transformers.trainer_utils import get_last_checkpoint
+ from transformers.utils import check_min_version
+ from transformers.utils.versions import require_version
+
+
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+ check_min_version("4.9.0.dev0")
+
+ require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class ModelArguments:
+     """
+     Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+     """
+
+     model_name_or_path: str = field(
+         metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+     )
+     config_name: Optional[str] = field(
+         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+     )
+     tokenizer_name: Optional[str] = field(
+         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+     )
+     cache_dir: Optional[str] = field(
+         default=None,
+         metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+     )
+     model_revision: str = field(
+         default="main",
+         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+     )
+     use_auth_token: bool = field(
+         default=False,
+         metadata={
+             "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+             "with private models)."
+         },
+     )
+
+
+ @dataclass
+ class DataTrainingArguments:
+     """
+     Arguments pertaining to what data we are going to input our model for training and eval.
+     """
+
+     task_name: Optional[str] = field(default="ner", metadata={"help": "The name of the task (ner, pos...)."})
+     dataset_name: Optional[str] = field(
+         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+     )
+     dataset_config_name: Optional[str] = field(
+         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+     )
+     train_file: Optional[str] = field(
+         default=None, metadata={"help": "The input training data file (a csv or JSON file)."}
+     )
+     validation_file: Optional[str] = field(
+         default=None,
+         metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."},
+     )
+     test_file: Optional[str] = field(
+         default=None,
+         metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."},
+     )
+     text_column_name: Optional[str] = field(
+         default=None, metadata={"help": "The column name of text to input in the file (a csv or JSON file)."}
+     )
+     label_column_name: Optional[str] = field(
+         default=None, metadata={"help": "The column name of label to input in the file (a csv or JSON file)."}
+     )
+     overwrite_cache: bool = field(
+         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+     )
+     preprocessing_num_workers: Optional[int] = field(
+         default=None,
+         metadata={"help": "The number of processes to use for the preprocessing."},
+     )
+     pad_to_max_length: bool = field(
+         default=False,
+         metadata={
+             "help": "Whether to pad all samples to model maximum sentence length. "
+             "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
+             "efficient on GPU but very bad for TPU."
+         },
+     )
+     max_train_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+             "value if set."
+         },
+     )
+     max_eval_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+             "value if set."
+         },
+     )
+     max_predict_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
+             "value if set."
+         },
+     )
+     label_all_tokens: bool = field(
+         default=False,
+         metadata={
+             "help": "Whether to put the label for one word on all tokens generated by that word or just on the "
+             "one (in which case the other tokens will have a padding index)."
+         },
+     )
+     return_entity_level_metrics: bool = field(
+         default=False,
+         metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."},
+     )
+
+     def __post_init__(self):
+         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
+             raise ValueError("Need either a dataset name or a training/validation file.")
+         else:
+             if self.train_file is not None:
+                 extension = self.train_file.split(".")[-1]
+                 assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+             if self.validation_file is not None:
+                 extension = self.validation_file.split(".")[-1]
+                 assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+         self.task_name = self.task_name.lower()
+
+
+ def main():
+     # See all possible arguments in src/transformers/training_args.py
+     # or by passing the --help flag to this script.
+     # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+         # If we pass only one argument to the script and it's the path to a json file,
+         # let's parse it to get our arguments.
+         model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+     else:
+         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+     # Setup logging
+     logging.basicConfig(
+         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+         datefmt="%m/%d/%Y %H:%M:%S",
+         handlers=[logging.StreamHandler(sys.stdout)],
+     )
+
+     log_level = training_args.get_process_log_level()
+     logger.setLevel(log_level)
+     datasets.utils.logging.set_verbosity(log_level)
+     transformers.utils.logging.set_verbosity(log_level)
+     transformers.utils.logging.enable_default_handler()
+     transformers.utils.logging.enable_explicit_format()
+
+     # Log on each process the small summary:
+     logger.warning(
+         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+     )
+     logger.info(f"Training/evaluation parameters {training_args}")
+
+     # Detecting last checkpoint.
+     last_checkpoint = None
+     run_name = f"{model_args.model_name_or_path}-{np.random.randint(1000):04d}"
+     training_args.output_dir = str(Path(training_args.output_dir) / run_name)
+     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+         last_checkpoint = get_last_checkpoint(training_args.output_dir)
+         if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+             raise ValueError(
+                 f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                 "Use --overwrite_output_dir to overcome."
+             )
+         elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+             logger.info(
+                 f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                 "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+             )
+
+     # Set seed before initializing model.
+     set_seed(training_args.seed)
+
+     # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+     # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+     # (the dataset will be downloaded automatically from the datasets Hub).
+     #
+     # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+     # 'text' is found. You can easily tweak this behavior (see below).
+     #
+     # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+     # download the dataset.
+     if data_args.dataset_name is not None:
+         # Downloading and loading a dataset from the hub.
+         raw_datasets = load_dataset(
+             data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+         )
+     else:
+         data_files = {}
+         if data_args.train_file is not None:
+             data_files["train"] = data_args.train_file
+         if data_args.validation_file is not None:
+             data_files["validation"] = data_args.validation_file
+         if data_args.test_file is not None:
+             data_files["test"] = data_args.test_file
+         extension = data_args.train_file.split(".")[-1]
+         raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+     # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+     if training_args.do_train:
+         column_names = raw_datasets["train"].column_names
+         features = raw_datasets["train"].features
+     else:
+         column_names = raw_datasets["validation"].column_names
+         features = raw_datasets["validation"].features
+
+     if data_args.text_column_name is not None:
+         text_column_name = data_args.text_column_name
+     elif "tokens" in column_names:
+         text_column_name = "tokens"
+     else:
+         text_column_name = column_names[0]
+
+     if data_args.label_column_name is not None:
+         label_column_name = data_args.label_column_name
+     elif f"{data_args.task_name}_tags" in column_names:
+         label_column_name = f"{data_args.task_name}_tags"
+     else:
+         label_column_name = column_names[1]
+
+     # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
+     # unique labels.
+     def get_label_list(labels):
+         unique_labels = set()
+         for label in labels:
+             unique_labels = unique_labels | set(label)
+         label_list = list(unique_labels)
+         label_list.sort()
+         return label_list
+
+     if isinstance(features[label_column_name].feature, ClassLabel):
+         label_list = features[label_column_name].feature.names
+         # No need to convert the labels since they are already ints.
+         label_to_id = {i: i for i in range(len(label_list))}
+     else:
+         label_list = get_label_list(raw_datasets["train"][label_column_name])
+         label_to_id = {l: i for i, l in enumerate(label_list)}
+     num_labels = len(label_list)
+
+     # Load pretrained model and tokenizer
+     #
+     # Distributed training:
+     # The .from_pretrained methods guarantee that only one local process can concurrently
+     # download model & vocab.
+     config = AutoConfig.from_pretrained(
+         model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+         num_labels=num_labels,
+         label2id=label_to_id,
+         id2label={i: l for l, i in label_to_id.items()},
+         finetuning_task=data_args.task_name,
+         cache_dir=model_args.cache_dir,
+         revision=model_args.model_revision,
+         use_auth_token=True if model_args.use_auth_token else None,
+     )
+
+     tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path
+     if config.model_type in {"gpt2", "roberta"}:
+         tokenizer = AutoTokenizer.from_pretrained(
+             tokenizer_name_or_path,
+             cache_dir=model_args.cache_dir,
+             use_fast=True,
+             revision=model_args.model_revision,
+             use_auth_token=True if model_args.use_auth_token else None,
+             add_prefix_space=True,
+         )
+     else:
+         tokenizer = AutoTokenizer.from_pretrained(
+             tokenizer_name_or_path,
+             cache_dir=model_args.cache_dir,
+             use_fast=True,
+             revision=model_args.model_revision,
+             use_auth_token=True if model_args.use_auth_token else None,
+         )
+     tokenizer.model_max_length = 512
+
+     model = AutoModelForTokenClassification.from_pretrained(
+         model_args.model_name_or_path,
+         from_tf=bool(".ckpt" in model_args.model_name_or_path),
+         config=config,
+         cache_dir=model_args.cache_dir,
+         revision=model_args.model_revision,
+         use_auth_token=True if model_args.use_auth_token else None,
+     )
+
+     # Tokenizer check: this script requires a fast tokenizer.
+     if not isinstance(tokenizer, PreTrainedTokenizerFast):
+         raise ValueError(
+             "This example script only works for models that have a fast tokenizer. Check out the big table of models "
+             "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this "
+             "requirement"
+         )
+
+     # Preprocessing the dataset
+     # Padding strategy
+     padding = "max_length" if data_args.pad_to_max_length else False
+
+     # Tokenize all texts and align the labels with them.
+     def tokenize_and_align_labels(examples):
+         tokenized_inputs = tokenizer(
+             examples[text_column_name],
+             padding=padding,
+             max_length=512,
+             truncation=True,
+             # We use this argument because the texts in our dataset are lists of words (with a label for each word).
+             is_split_into_words=True,
+         )
+         labels = []
+         for i, label in enumerate(examples[label_column_name]):
+             word_ids = tokenized_inputs.word_ids(batch_index=i)
+             previous_word_idx = None
+             label_ids = []
+             for word_idx in word_ids:
+                 # Special tokens have a word id that is None. We set the label to -100 so they are automatically
+                 # ignored in the loss function.
+                 if word_idx is None:
+                     label_ids.append(-100)
+                 # We set the label for the first token of each word.
+                 elif word_idx != previous_word_idx:
+                     label_ids.append(label_to_id[label[word_idx]])
+                 # For the other tokens in a word, we set the label to either the current label or -100, depending on
+                 # the label_all_tokens flag.
+                 else:
+                     label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100)
+                 previous_word_idx = word_idx
+
+             labels.append(label_ids)
+         tokenized_inputs["labels"] = labels
+         return tokenized_inputs
+
+     if training_args.do_train:
+         if "train" not in raw_datasets:
+             raise ValueError("--do_train requires a train dataset")
+         train_dataset = raw_datasets["train"]
+         if data_args.max_train_samples is not None:
+             train_dataset = train_dataset.select(range(data_args.max_train_samples))
+         with training_args.main_process_first(desc="train dataset map pre-processing"):
+             train_dataset = train_dataset.map(
+                 tokenize_and_align_labels,
+                 batched=True,
+                 num_proc=data_args.preprocessing_num_workers,
+                 load_from_cache_file=not data_args.overwrite_cache,
+                 desc="Running tokenizer on train dataset",
+             )
+
+     if training_args.do_eval:
+         if "validation" not in raw_datasets:
+             raise ValueError("--do_eval requires a validation dataset")
+         eval_dataset = raw_datasets["validation"]
+         if data_args.max_eval_samples is not None:
+             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+         with training_args.main_process_first(desc="validation dataset map pre-processing"):
+             eval_dataset = eval_dataset.map(
+                 tokenize_and_align_labels,
+                 batched=True,
+                 num_proc=data_args.preprocessing_num_workers,
+                 load_from_cache_file=not data_args.overwrite_cache,
+                 desc="Running tokenizer on validation dataset",
+             )
+
+     if training_args.do_predict:
+         if "test" not in raw_datasets:
+             raise ValueError("--do_predict requires a test dataset")
+         predict_dataset = raw_datasets["test"]
+         if data_args.max_predict_samples is not None:
+             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+         with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+             predict_dataset = predict_dataset.map(
+                 tokenize_and_align_labels,
+                 batched=True,
+                 num_proc=data_args.preprocessing_num_workers,
+                 load_from_cache_file=not data_args.overwrite_cache,
+                 desc="Running tokenizer on prediction dataset",
+             )
+
+     # Data collator
+     data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)
+
+     # Metrics
+     metric = load_metric("seqeval")
+
+     def compute_metrics(p):
+         predictions, labels = p
+         predictions = np.argmax(predictions, axis=2)
+
+         # Remove ignored index (special tokens)
+         true_predictions = [
+             [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
+             for prediction, label in zip(predictions, labels)
+         ]
+         true_labels = [
+             [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
+             for prediction, label in zip(predictions, labels)
+         ]
+
+         results = metric.compute(predictions=true_predictions, references=true_labels)
+         if data_args.return_entity_level_metrics:
+             # Unpack nested dictionaries
+             final_results = {}
+             for key, value in results.items():
+                 if isinstance(value, dict):
+                     for n, v in value.items():
+                         final_results[f"{key}_{n}"] = v
+                 else:
+                     final_results[key] = value
+             return final_results
+         else:
+             return {
+                 "precision": results["overall_precision"],
+                 "recall": results["overall_recall"],
+                 "f1": results["overall_f1"],
+                 "accuracy": results["overall_accuracy"],
+             }
+
+     # Initialize our Trainer
+     training_args.run_name = run_name
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_dataset if training_args.do_train else None,
+         eval_dataset=eval_dataset if training_args.do_eval else None,
+         tokenizer=tokenizer,
+         data_collator=data_collator,
+         compute_metrics=compute_metrics,
+     )
+
+     # Training
+     if training_args.do_train:
+         checkpoint = None
+         if training_args.resume_from_checkpoint is not None:
+             checkpoint = training_args.resume_from_checkpoint
+         elif last_checkpoint is not None:
+             checkpoint = last_checkpoint
+         train_result = trainer.train(resume_from_checkpoint=checkpoint)
+         metrics = train_result.metrics
+         trainer.save_model()  # Saves the tokenizer too for easy upload
+
+         max_train_samples = (
+             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
+         )
+         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+
+         trainer.log_metrics("train", metrics)
+         trainer.save_metrics("train", metrics)
+         trainer.save_state()
+
+     # Evaluation
+     if training_args.do_eval:
+         logger.info("*** Evaluate ***")
+
+         metrics = trainer.evaluate()
+
+         max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+         metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+
+         trainer.log_metrics("eval", metrics)
+         trainer.save_metrics("eval", metrics)
+
+     # Predict
+     if training_args.do_predict:
+         logger.info("*** Predict ***")
+
+         predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")
+         predictions = np.argmax(predictions, axis=2)
+
+         # Remove ignored index (special tokens)
+         true_predictions = [
+             [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
+             for prediction, label in zip(predictions, labels)
+         ]
+
+         trainer.log_metrics("predict", metrics)
+         trainer.save_metrics("predict", metrics)
+
+         # Save predictions
+         output_predictions_file = os.path.join(training_args.output_dir, "predictions.txt")
+         if trainer.is_world_process_zero():
+             with open(output_predictions_file, "w") as writer:
+                 for prediction in true_predictions:
+                     writer.write(" ".join(prediction) + "\n")
+
+     if training_args.push_to_hub:
+         kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "token-classification"}
+         if data_args.dataset_name is not None:
+             kwargs["dataset_tags"] = data_args.dataset_name
+             if data_args.dataset_config_name is not None:
+                 kwargs["dataset_args"] = data_args.dataset_config_name
+                 kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+             else:
+                 kwargs["dataset"] = data_args.dataset_name
+
+         trainer.push_to_hub(**kwargs)
+
+
+ def _mp_fn(index):
+     # For xla_spawn (TPUs)
+     main()
+
+
+ if __name__ == "__main__":
+     main()
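The core of run_ner.py is `tokenize_and_align_labels`, which relies on the fast tokenizer's `word_ids()` to project word-level tags onto sub-word tokens. A standalone toy version of the alignment rule (the model name is only an example; any fast tokenizer that supports `is_split_into_words` behaves the same):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased", use_fast=True)
words = ["La", "Rioja", "es", "famosa"]
labels = [5, 6, 0, 0]  # e.g. B-LOC, I-LOC, O, O as class ids

encoding = tokenizer(words, is_split_into_words=True)
aligned, previous_word_idx = [], None
for word_idx in encoding.word_ids():
    if word_idx is None:
        aligned.append(-100)              # special tokens: ignored by the loss
    elif word_idx != previous_word_idx:
        aligned.append(labels[word_idx])  # first sub-token carries the word's label
    else:
        aligned.append(-100)              # label_all_tokens=False: mask the rest
    previous_word_idx = word_idx
print(list(zip(encoding.tokens(), aligned)))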
evaluation/xnli.yaml ADDED
@@ -0,0 +1,55 @@
+ name: BERTIN XNLI es
+ project: bertin-eval
+ entity: versae
+ program: run_glue.py
+ command:
+   - ${env}
+   - ${interpreter}
+   - ${program}
+   - ${args}
+ method: grid
+ metric:
+   name: eval/accuracy
+   goal: maximize
+ parameters:
+   model_name_or_path:
+     values:
+       - bertin-project/bertin-base-gaussian-exp-512seqlen
+       - bertin-project/bertin-base-random-exp-512seqlen
+       - bertin-project/bertin-base-gaussian
+       - bertin-project/bertin-base-stepwise
+       - bertin-project/bertin-base-random
+       - bertin-project/bertin-roberta-base-spanish
+       - flax-community/bertin-roberta-large-spanish
+       - BSC-TeMU/roberta-base-bne
+       - dccuchile/bert-base-spanish-wwm-cased
+       - bert-base-multilingual-cased
+   num_train_epochs:
+     values: [5]
+   task_name:
+     value: xnli
+   dataset_name:
+     value: xnli
+   dataset_config_name:
+     value: es
+   output_dir:
+     value: ./outputs
+   overwrite_output_dir:
+     value: true
+   resume_from_checkpoint:
+     value: false
+   max_seq_length:
+     value: 512
+   pad_to_max_length:
+     value: true
+   per_device_train_batch_size:
+     value: 16
+   per_device_eval_batch_size:
+     value: 16
+   save_total_limit:
+     value: 1
+   do_train:
+     value: true
+   do_eval:
+     value: true
+
mc4/mc4.py CHANGED
@@ -376,13 +376,13 @@ class Mc4(datasets.GeneratorBasedBuilder):
376
  for lang in self.config.languages
377
  for index in range(_N_SHARDS_PER_SPLIT[lang][split])
378
  ]
379
- if "train" in self.data_files:
380
  train_downloaded_files = self.data_files["train"]
381
  if not isinstance(train_downloaded_files, (tuple, list)):
382
  train_downloaded_files = [train_downloaded_files]
383
  else:
384
  train_downloaded_files = dl_manager.download(data_urls["train"])
385
- if "validation" in self.data_files:
386
  validation_downloaded_files = self.data_files["validation"]
387
  if not isinstance(validation_downloaded_files, (tuple, list)):
388
  validation_downloaded_files = [validation_downloaded_files]
@@ -417,7 +417,7 @@ class Mc4(datasets.GeneratorBasedBuilder):
417
  if self.should_keep_doc(
418
  example["text"],
419
  factor=self.sampling_factor,
420
- boundaries=self.boundaries
421
  **self.kwargs):
422
  yield id_, example
423
  id_ += 1
376
  for lang in self.config.languages
377
  for index in range(_N_SHARDS_PER_SPLIT[lang][split])
378
  ]
379
+ if self.data_files and "train" in self.data_files:
380
  train_downloaded_files = self.data_files["train"]
381
  if not isinstance(train_downloaded_files, (tuple, list)):
382
  train_downloaded_files = [train_downloaded_files]
383
  else:
384
  train_downloaded_files = dl_manager.download(data_urls["train"])
385
+ if self.data_files and "validation" in self.data_files:
386
  validation_downloaded_files = self.data_files["validation"]
387
  if not isinstance(validation_downloaded_files, (tuple, list)):
388
  validation_downloaded_files = [validation_downloaded_files]
417
  if self.should_keep_doc(
418
  example["text"],
419
  factor=self.sampling_factor,
420
+ boundaries=self.boundaries,
421
  **self.kwargs):
422
  yield id_, example
423
  id_ += 1
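The first hunk guards against `self.data_files` being `None` before testing membership. The second restores the comma that separates `boundaries=...` from the `**self.kwargs` expansion: without it, Python parses `boundaries=self.boundaries **self.kwargs` as the exponentiation `self.boundaries ** self.kwargs`, which typically raises a TypeError at call time. A minimal reproduction with hypothetical names:

def should_keep_doc(text, factor=None, boundaries=None, **kwargs):
    # stand-in for Mc4.should_keep_doc; only the call syntax matters here
    return True

opts = {"data_dir": "."}

# Without the comma, the keyword value is parsed as a power expression:
# should_keep_doc("some text", factor=0.5, boundaries=[0.1, 0.5] **opts)
#   -> TypeError: unsupported operand type(s) for ** or pow(): 'list' and 'dict'

# With the comma restored, boundaries and **opts are two separate arguments:
should_keep_doc("some text", factor=0.5, boundaries=[0.1, 0.5], **opts)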