mirbostani committed on
Commit
01d9139
1 Parent(s): 87e1f45

Upload run_newsqa.py

Files changed (1)
  1. run_newsqa.py +929 -0
run_newsqa.py ADDED
@@ -0,0 +1,929 @@
+ # coding=utf-8
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Fine-tuning the library models for question answering on NewsQA (DistilBERT, BERT, XLM, XLNet).
+
+ @see examples/legacy/multiple_choice/utils_multiple_choice.py
+ @see src/transformers/data/processors/squad.py
+ @see examples/legacy/question-answering/run_squad.py
+ """
+
+
+ import argparse
+ import glob
+ import json
+ import logging
+ import os
+ import random
+ import timeit
+
+ import numpy as np
+ import torch
+ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
+ from torch.utils.data.distributed import DistributedSampler
+ from tqdm import tqdm, trange
+
+ import transformers
+ from transformers import (
+     MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+     WEIGHTS_NAME,
+     AdamW,
+     AutoConfig,
+     AutoModelForQuestionAnswering,
+     AutoTokenizer,
+     get_linear_schedule_with_warmup,
+     squad_convert_examples_to_features,
+ )
+ from transformers.data.metrics.squad_metrics import (
+     compute_predictions_log_probs,
+     compute_predictions_logits,
+     squad_evaluate,
+ )
+ from transformers.data.processors.squad import SquadExample, SquadResult, SquadV1Processor, SquadV2Processor
+ from transformers.data.processors.utils import DataProcessor
+ from transformers.trainer_utils import is_main_process
+
+
+ try:
+     from torch.utils.tensorboard import SummaryWriter
+ except ImportError:
+     from tensorboardX import SummaryWriter
+
+
+ logger = logging.getLogger(__name__)
+
+ MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())
+ MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+ class NewsQAProcessor(DataProcessor):
+     """
+     Processor for the NewsQA dataset.
+
+     https://github.com/Maluuba/newsqa
+     """
+
+     train_file = "combined-newsqa-data-v1.json"
+     dev_file = "combined-newsqa-data-v1.json"
+
+     def get_train_examples(self, data_dir, filename=None):
+         if data_dir is None:
+             data_dir = ""
+
+         set_type = "train"
+         filepath = os.path.join(data_dir, self.train_file if filename is None else filename)
+         with open(filepath, "r", encoding="utf-8") as file:
+             source = json.load(file)
+             if source["version"] != "1":
+                 raise ValueError("Invalid NewsQA dataset version")
+             input_data = [story for story in source["data"] if story["type"] == set_type]
+         return self._create_examples(input_data, set_type)
+
+     def get_dev_examples(self, data_dir, filename=None):
+         if data_dir is None:
+             data_dir = ""
+
+         set_type = "dev"
+         filepath = os.path.join(data_dir, self.dev_file if filename is None else filename)
+         with open(filepath, "r", encoding="utf-8") as file:
+             source = json.load(file)
+             if source["version"] != "1":
+                 raise ValueError("Invalid NewsQA dataset version")
+             input_data = [story for story in source["data"] if story["type"] == set_type]
+         return self._create_examples(input_data, set_type)
+
+     def _create_examples(self, input_data, set_type):
+         is_training = set_type == "train"
+         examples = []
+         for story in tqdm(input_data):
+             title = story["storyId"]  # no title is available in NewsQA
+             context_text = story["text"]
+
+             for iqa, qa in enumerate(story["questions"]):
+                 qas_id = story["storyId"] + str(iqa)
+                 question_text = qa["q"]
+                 start_position_character = None
+                 answer_text = None
+                 answers = []
+
+                 if "s" in qa["consensus"] and "e" in qa["consensus"]:
+                     # Append the consensus span as the first answer (used for training)
+                     answer_start = qa["consensus"]["s"]
+                     answer_end = qa["consensus"]["e"]
+                     answer_text = context_text[answer_start:answer_end].strip()
+                     start_position_character = answer_start
+                     answers.append({"answer_start": answer_start, "text": answer_text})
+                     # Append the individual sourcer answers (used for validation)
+                     for a in qa["answers"]:
+                         for sa in a["sourcerAnswers"]:
+                             if "s" in sa and "e" in sa:
+                                 answer_start = sa["s"]
+                                 answer_end = sa["e"]
+                                 answers.append(
+                                     {"answer_start": answer_start, "text": context_text[answer_start:answer_end].strip()}
+                                 )
+
+                 is_impossible = len(answers) == 0
+
+                 if not is_impossible and is_training:
+                     # Use only the first answer (the consensus) for training;
+                     # validation keeps all the sourcer answers.
+                     answers = [answers[0]]
+
+                 # Only examples with a valid answer are considered.
+                 if not is_impossible:
+                     example = SquadExample(
+                         qas_id=qas_id,
+                         question_text=question_text,
+                         context_text=context_text,
+                         answer_text=answer_text,
+                         start_position_character=start_position_character,
+                         title=title,
+                         is_impossible=is_impossible,
+                         answers=answers,
+                     )
+                     examples.append(example)
+
+         return examples
+
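+ # Sketch of how the processor is used by load_and_cache_examples() below
+ # (illustrative only; "./data" is an assumed directory):
+ #
+ #   processor = NewsQAProcessor()
+ #   train_examples = processor.get_train_examples("./data")
+ #   dev_examples = processor.get_dev_examples("./data")
+
+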
+ def set_seed(args):
+     random.seed(args.seed)
+     np.random.seed(args.seed)
+     torch.manual_seed(args.seed)
+     if args.n_gpu > 0:
+         torch.cuda.manual_seed_all(args.seed)
+
+
+ def to_list(tensor):
+     return tensor.detach().cpu().tolist()
+
+
+ def train(args, train_dataset, model, tokenizer):
+     """Train the model"""
+     if args.local_rank in [-1, 0]:
+         tb_writer = SummaryWriter()
+
+     args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+     train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+     train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+     if args.max_steps > 0:
+         t_total = args.max_steps
+         args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+     else:
+         t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+     # Prepare optimizer and schedule (linear warmup and decay)
+     no_decay = ["bias", "LayerNorm.weight"]
+     optimizer_grouped_parameters = [
+         {
+             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+             "weight_decay": args.weight_decay,
+         },
+         {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+     ]
+     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+     scheduler = get_linear_schedule_with_warmup(
+         optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+     )
+
+     # Check if saved optimizer or scheduler states exist
+     if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
+         os.path.join(args.model_name_or_path, "scheduler.pt")
+     ):
+         # Load the optimizer and scheduler states
+         optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
+         scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
+
+     if args.fp16:
+         try:
+             from apex import amp
+         except ImportError:
+             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+
+         model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+     # multi-gpu training (should be after apex fp16 initialization)
+     if args.n_gpu > 1:
+         model = torch.nn.DataParallel(model)
+
+     # Distributed training (should be after apex fp16 initialization)
+     if args.local_rank != -1:
+         model = torch.nn.parallel.DistributedDataParallel(
+             model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+         )
+
+     # Train!
+     logger.info("***** Running training *****")
+     logger.info("  Num examples = %d", len(train_dataset))
+     logger.info("  Num Epochs = %d", args.num_train_epochs)
+     logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+     logger.info(
+         "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+         args.train_batch_size
+         * args.gradient_accumulation_steps
+         * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+     )
+     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+     logger.info("  Total optimization steps = %d", t_total)
+
+     global_step = 1
+     epochs_trained = 0
+     steps_trained_in_current_epoch = 0
+     # Check if continuing training from a checkpoint
+     if os.path.exists(args.model_name_or_path):
+         try:
+             # Set global_step to the global_step of the last saved checkpoint from the model path
+             checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
+             global_step = int(checkpoint_suffix)
+             epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
+             steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
+
+             logger.info("  Continuing training from checkpoint, will skip to saved global_step")
+             logger.info("  Continuing training from epoch %d", epochs_trained)
+             logger.info("  Continuing training from global step %d", global_step)
+             logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
+         except ValueError:
+             logger.info("  Starting fine-tuning.")
+
+     tr_loss, logging_loss = 0.0, 0.0
+     model.zero_grad()
+     train_iterator = trange(
+         epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
+     )
+     # Added here for reproducibility
+     set_seed(args)
+
+     for _ in train_iterator:
+         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+         for step, batch in enumerate(epoch_iterator):
+
+             # Skip past any already trained steps if resuming training
+             if steps_trained_in_current_epoch > 0:
+                 steps_trained_in_current_epoch -= 1
+                 continue
+
+             model.train()
+             batch = tuple(t.to(args.device) for t in batch)
+
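+             # Batch tensor layout produced by squad_convert_examples_to_features
+             # for training (indices 5-7 are only consumed for XLNet/XLM below):
+             #   0 input_ids, 1 attention_mask, 2 token_type_ids,
+             #   3 start_positions, 4 end_positions, 5 cls_index, 6 p_mask,
+             #   7 is_impossible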
+             inputs = {
+                 "input_ids": batch[0],
+                 "attention_mask": batch[1],
+                 "token_type_ids": batch[2],
+                 "start_positions": batch[3],
+                 "end_positions": batch[4],
+             }
+
+             if args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
+                 del inputs["token_type_ids"]
+
+             if args.model_type in ["xlnet", "xlm"]:
+                 inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
+                 if args.version_2_with_negative:
+                     inputs.update({"is_impossible": batch[7]})
+                 if hasattr(model, "config") and hasattr(model.config, "lang2id"):
+                     inputs.update(
+                         {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
+                     )
+
+             outputs = model(**inputs)
+             # model outputs are always tuples in transformers (see the docs)
+             loss = outputs[0]
+
+             if args.n_gpu > 1:
+                 loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
+             if args.gradient_accumulation_steps > 1:
+                 loss = loss / args.gradient_accumulation_steps
+
+             if args.fp16:
+                 with amp.scale_loss(loss, optimizer) as scaled_loss:
+                     scaled_loss.backward()
+             else:
+                 loss.backward()
+
+             tr_loss += loss.item()
+             if (step + 1) % args.gradient_accumulation_steps == 0:
+                 if args.fp16:
+                     torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                 else:
+                     torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+                 optimizer.step()
+                 scheduler.step()  # Update the learning rate schedule
+                 model.zero_grad()
+                 global_step += 1
+
+                 # Log metrics
+                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                     # Only evaluate on a single GPU, otherwise metrics may not average well
+                     if args.local_rank == -1 and args.evaluate_during_training:
+                         results = evaluate(args, model, tokenizer)
+                         for key, value in results.items():
+                             tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                     tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                     tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
+                     logging_loss = tr_loss
+
+                 # Save model checkpoint
+                 if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                     output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
+                     # Take care of distributed/parallel training
+                     model_to_save = model.module if hasattr(model, "module") else model
+                     model_to_save.save_pretrained(output_dir)
+                     tokenizer.save_pretrained(output_dir)
+
+                     torch.save(args, os.path.join(output_dir, "training_args.bin"))
+                     logger.info("Saving model checkpoint to %s", output_dir)
+
+                     torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                     torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
+                     logger.info("Saving optimizer and scheduler states to %s", output_dir)
+
+             if args.max_steps > 0 and global_step > args.max_steps:
+                 epoch_iterator.close()
+                 break
+         if args.max_steps > 0 and global_step > args.max_steps:
+             train_iterator.close()
+             break
+
+     if args.local_rank in [-1, 0]:
+         tb_writer.close()
+
+     return global_step, tr_loss / global_step
+
+
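+ # Effective batch size arithmetic for train() above (values are assumed for
+ # illustration): with per_gpu_train_batch_size=8, n_gpu=2 and
+ # gradient_accumulation_steps=4, each optimizer step consumes 8 * 2 * 4 = 64
+ # examples, and t_total counts optimizer steps rather than forward passes.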
+ def evaluate(args, model, tokenizer, prefix=""):
+     dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
+
+     if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+         os.makedirs(args.output_dir)
+
+     args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+
+     # Note that DistributedSampler samples randomly
+     eval_sampler = SequentialSampler(dataset)
+     eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+     # multi-gpu evaluate
+     if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
+         model = torch.nn.DataParallel(model)
+
+     # Eval!
+     logger.info("***** Running evaluation {} *****".format(prefix))
+     logger.info("  Num examples = %d", len(dataset))
+     logger.info("  Batch size = %d", args.eval_batch_size)
+
+     all_results = []
+     start_time = timeit.default_timer()
+
+     for batch in tqdm(eval_dataloader, desc="Evaluating"):
+         model.eval()
+         batch = tuple(t.to(args.device) for t in batch)
+
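+         # Batch tensor layout at evaluation time: 0 input_ids,
+         # 1 attention_mask, 2 token_type_ids, 3 feature_index
+         # (4 cls_index and 5 p_mask are only consumed for XLNet/XLM).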
+         with torch.no_grad():
+             inputs = {
+                 "input_ids": batch[0],
+                 "attention_mask": batch[1],
+                 "token_type_ids": batch[2],
+             }
+
+             if args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
+                 del inputs["token_type_ids"]
+
+             feature_indices = batch[3]
+
+             # XLNet and XLM use more arguments for their predictions
+             if args.model_type in ["xlnet", "xlm"]:
+                 inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
+                 # for lang_id-sensitive xlm models
+                 if hasattr(model, "config") and hasattr(model.config, "lang2id"):
+                     inputs.update(
+                         {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
+                     )
+             outputs = model(**inputs)
+
+         for i, feature_index in enumerate(feature_indices):
+             eval_feature = features[feature_index.item()]
+             unique_id = int(eval_feature.unique_id)
+
+             output = [to_list(output[i]) for output in outputs.to_tuple()]
+
+             # Some models (XLNet, XLM) produce five outputs for their predictions, while the other,
+             # "simpler" models only produce two.
+             if len(output) >= 5:
+                 start_logits = output[0]
+                 start_top_index = output[1]
+                 end_logits = output[2]
+                 end_top_index = output[3]
+                 cls_logits = output[4]
+
+                 result = SquadResult(
+                     unique_id,
+                     start_logits,
+                     end_logits,
+                     start_top_index=start_top_index,
+                     end_top_index=end_top_index,
+                     cls_logits=cls_logits,
+                 )
+
+             else:
+                 start_logits, end_logits = output
+                 result = SquadResult(unique_id, start_logits, end_logits)
+
+             all_results.append(result)
+
+     eval_time = timeit.default_timer() - start_time
+     logger.info("  Evaluation done in total %f secs (%f sec per example)", eval_time, eval_time / len(dataset))
+
+     # Compute predictions
+     output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
+     output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
+
+     if args.version_2_with_negative:
+         output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
+     else:
+         output_null_log_odds_file = None
+
+     # XLNet and XLM use a more complex post-processing procedure
+     if args.model_type in ["xlnet", "xlm"]:
+         start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
+         end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
+
+         predictions = compute_predictions_log_probs(
+             examples,
+             features,
+             all_results,
+             args.n_best_size,
+             args.max_answer_length,
+             output_prediction_file,
+             output_nbest_file,
+             output_null_log_odds_file,
+             start_n_top,
+             end_n_top,
+             args.version_2_with_negative,
+             tokenizer,
+             args.verbose_logging,
+         )
+     else:
+         predictions = compute_predictions_logits(
+             examples,
+             features,
+             all_results,
+             args.n_best_size,
+             args.max_answer_length,
+             args.do_lower_case,
+             output_prediction_file,
+             output_nbest_file,
+             output_null_log_odds_file,
+             args.verbose_logging,
+             args.version_2_with_negative,
+             args.null_score_diff_threshold,
+             tokenizer,
+         )
+
+     # Compute the F1 and exact scores.
+     results = squad_evaluate(examples, predictions)
+     return results
+
+
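+ # Note: evaluate() returns the aggregate metrics dict produced by
+ # squad_evaluate (keys such as "exact", "f1" and "total"); because
+ # NewsQAProcessor keeps every sourcer answer for the dev set, EM/F1 are
+ # scored against the best-matching reference span per question.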
+ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
+     if args.local_rank not in [-1, 0] and not evaluate:
+         # Make sure only the first process in distributed training processes the dataset;
+         # the others will use the cache.
+         torch.distributed.barrier()
+
+     # Load data features from cache or dataset file
+     input_dir = args.data_dir if args.data_dir else "."
+     cached_features_file = os.path.join(
+         input_dir,
+         "cached_{}_{}_{}".format(
+             "dev" if evaluate else "train",
+             list(filter(None, args.model_name_or_path.split("/"))).pop(),
+             str(args.max_seq_length),
+         ),
+     )
+
+     # Init features and dataset from the cache if it exists
+     if os.path.exists(cached_features_file) and not args.overwrite_cache:
+         logger.info("Loading features from cached file %s", cached_features_file)
+         features_and_dataset = torch.load(cached_features_file)
+         features, dataset, examples = (
+             features_and_dataset["features"],
+             features_and_dataset["dataset"],
+             features_and_dataset["examples"],
+         )
+     else:
+         logger.info("Creating features from dataset file at %s", input_dir)
+
+         if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
+             raise NotImplementedError()
+         else:
+             processor = NewsQAProcessor()
+             if evaluate:
+                 examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
+             else:
+                 examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
+
+         features, dataset = squad_convert_examples_to_features(
+             examples=examples,
+             tokenizer=tokenizer,
+             max_seq_length=args.max_seq_length,
+             doc_stride=args.doc_stride,
+             max_query_length=args.max_query_length,
+             is_training=not evaluate,
+             return_dataset="pt",
+             threads=args.threads,
+         )
+
+         if args.local_rank in [-1, 0]:
+             logger.info("Saving features into cached file %s", cached_features_file)
+             torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)
+
+     if args.local_rank == 0 and not evaluate:
+         # Make sure only the first process in distributed training processes the dataset;
+         # the others will use the cache.
+         torch.distributed.barrier()
+
+     if output_examples:
+         return dataset, examples, features
+     return dataset
+
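+ # Cache-file naming example (values are illustrative): with --data_dir ./data,
+ # --model_name_or_path bert-base-uncased and --max_seq_length 384, features
+ # are cached at ./data/cached_train_bert-base-uncased_384 (or cached_dev_...
+ # at evaluation time).
+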
+
+ def main():
+     parser = argparse.ArgumentParser()
+
+     # Required parameters
+     parser.add_argument(
+         "--model_type",
+         default=None,
+         type=str,
+         required=True,
+         help="Model type selected in the list: " + ", ".join(MODEL_TYPES),
+     )
+     parser.add_argument(
+         "--model_name_or_path",
+         default=None,
+         type=str,
+         required=True,
+         help="Path to pretrained model or model identifier from huggingface.co/models",
+     )
+     parser.add_argument(
+         "--output_dir",
+         default=None,
+         type=str,
+         required=True,
+         help="The output directory where the model checkpoints and predictions will be written.",
+     )
+
+     # Other parameters
+     parser.add_argument(
+         "--data_dir",
+         default=None,
+         type=str,
+         help="The input data dir. Should contain the .json files for the task. "
+         "If no data dir or train/predict files are specified, an error is raised.",
+     )
+     parser.add_argument(
+         "--train_file",
+         default=None,
+         type=str,
+         help="The input training file. If a data dir is specified, will look for the file there. "
+         "If no data dir or train/predict files are specified, an error is raised.",
+     )
+     parser.add_argument(
+         "--predict_file",
+         default=None,
+         type=str,
+         help="The input evaluation file. If a data dir is specified, will look for the file there. "
+         "If no data dir or train/predict files are specified, an error is raised.",
+     )
+     parser.add_argument(
+         "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+     )
+     parser.add_argument(
+         "--tokenizer_name",
+         default="",
+         type=str,
+         help="Pretrained tokenizer name or path if not the same as model_name",
+     )
+     parser.add_argument(
+         "--cache_dir",
+         default="",
+         type=str,
+         help="Where do you want to store the pre-trained models downloaded from huggingface.co",
+     )
+
+     parser.add_argument(
+         "--version_2_with_negative",
+         action="store_true",
+         help="If true, the examples contain some that do not have an answer.",
+     )
+     parser.add_argument(
+         "--null_score_diff_threshold",
+         type=float,
+         default=0.0,
+         help="If null_score - best_non_null is greater than the threshold, predict null.",
+     )
+
+     parser.add_argument(
+         "--max_seq_length",
+         default=384,
+         type=int,
+         help="The maximum total input sequence length after WordPiece tokenization. Sequences "
+         "longer than this will be truncated, and sequences shorter than this will be padded.",
+     )
+     parser.add_argument(
+         "--doc_stride",
+         default=128,
+         type=int,
+         help="When splitting up a long document into chunks, how much stride to take between chunks.",
+     )
+     parser.add_argument(
+         "--max_query_length",
+         default=64,
+         type=int,
+         help="The maximum number of tokens for the question. Questions longer than this will "
+         "be truncated to this length.",
+     )
+     parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+     parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+     parser.add_argument(
+         "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
+     )
+     parser.add_argument(
+         "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+     )
+
+     parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+     parser.add_argument(
+         "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+     )
+     parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+     parser.add_argument(
+         "--gradient_accumulation_steps",
+         type=int,
+         default=1,
+         help="Number of update steps to accumulate before performing a backward/update pass.",
+     )
+     parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+     parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for the Adam optimizer.")
+     parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+     parser.add_argument(
+         "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+     )
+     parser.add_argument(
+         "--max_steps",
+         default=-1,
+         type=int,
+         help="If > 0: set the total number of training steps to perform. Overrides num_train_epochs.",
+     )
+     parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+     parser.add_argument(
+         "--n_best_size",
+         default=20,
+         type=int,
+         help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
+     )
+     parser.add_argument(
+         "--max_answer_length",
+         default=30,
+         type=int,
+         help="The maximum length of an answer that can be generated. This is needed because the start "
+         "and end predictions are not conditioned on one another.",
+     )
+     parser.add_argument(
+         "--verbose_logging",
+         action="store_true",
+         help="If true, all of the warnings related to data processing will be printed. "
+         "A number of warnings are expected for a normal SQuAD-style evaluation.",
+     )
+     parser.add_argument(
+         "--lang_id",
+         default=0,
+         type=int,
+         help="language id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)",
+     )
+
+     parser.add_argument("--logging_steps", type=int, default=500, help="Log every X update steps.")
+     parser.add_argument("--save_steps", type=int, default=500, help="Save a checkpoint every X update steps.")
+     parser.add_argument(
+         "--eval_all_checkpoints",
+         action="store_true",
+         help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number",
+     )
+     parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA even when it is available")
+     parser.add_argument(
+         "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+     )
+     parser.add_argument(
+         "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+     )
+     parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
+
+     parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+     parser.add_argument(
+         "--fp16",
+         action="store_true",
+         help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+     )
+     parser.add_argument(
+         "--fp16_opt_level",
+         type=str,
+         default="O1",
+         help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', 'O3']. "
+         "See details at https://nvidia.github.io/apex/amp.html",
+     )
+     parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+     parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
+
+     parser.add_argument("--threads", type=int, default=1, help="Number of threads used to convert examples to features")
+     args = parser.parse_args()
+
+     if args.doc_stride >= args.max_seq_length - args.max_query_length:
+         logger.warning(
+             "WARNING - You've set a doc stride which may be larger than the document length in some "
+             "examples. This could result in errors when building features from the examples. Please reduce the doc "
+             "stride or increase the maximum length to ensure the features are correctly built."
+         )
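+     # Worked example of the constraint above (using the default values):
+     # with max_seq_length=384 and max_query_length=64, at most 384 - 64 = 320
+     # context tokens fit in one window, so doc_stride must stay below 320.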
+
+     if (
+         os.path.exists(args.output_dir)
+         and os.listdir(args.output_dir)
+         and args.do_train
+         and not args.overwrite_output_dir
+     ):
+         raise ValueError(
+             "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                 args.output_dir
+             )
+         )
+
+     # Setup distant debugging if needed
+     if args.server_ip and args.server_port:
+         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+         import ptvsd
+
+         print("Waiting for debugger attach")
+         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+         ptvsd.wait_for_attach()
+
+     # Setup CUDA, GPU & distributed training
+     if args.local_rank == -1 or args.no_cuda:
+         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+         args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
+     else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+         torch.cuda.set_device(args.local_rank)
+         device = torch.device("cuda", args.local_rank)
+         torch.distributed.init_process_group(backend="nccl")
+         args.n_gpu = 1
+     args.device = device
+
+     # Setup logging
+     logging.basicConfig(
+         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+         datefmt="%m/%d/%Y %H:%M:%S",
+         level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+     )
+     logger.warning(
+         "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+         args.local_rank,
+         device,
+         args.n_gpu,
+         bool(args.local_rank != -1),
+         args.fp16,
+     )
+     # Set the verbosity of the Transformers logger to info (on the main process only):
+     if is_main_process(args.local_rank):
+         transformers.utils.logging.set_verbosity_info()
+         transformers.utils.logging.enable_default_handler()
+         transformers.utils.logging.enable_explicit_format()
+     # Set seed
+     set_seed(args)
+
+     # Load pretrained model and tokenizer
+     if args.local_rank not in [-1, 0]:
+         # Make sure only the first process in distributed training downloads the model & vocab
+         torch.distributed.barrier()
+
+     args.model_type = args.model_type.lower()
+     config = AutoConfig.from_pretrained(
+         args.config_name if args.config_name else args.model_name_or_path,
+         cache_dir=args.cache_dir if args.cache_dir else None,
+     )
+     tokenizer = AutoTokenizer.from_pretrained(
+         args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+         do_lower_case=args.do_lower_case,
+         cache_dir=args.cache_dir if args.cache_dir else None,
+         use_fast=False,  # SquadDataset is not compatible with Fast tokenizers, which have smarter overflow handling
+     )
+     model = AutoModelForQuestionAnswering.from_pretrained(
+         args.model_name_or_path,
+         from_tf=bool(".ckpt" in args.model_name_or_path),
+         config=config,
+         cache_dir=args.cache_dir if args.cache_dir else None,
+     )
+
+     if args.local_rank == 0:
+         # Make sure only the first process in distributed training downloads the model & vocab
+         torch.distributed.barrier()
+
+     model.to(args.device)
+
+     logger.info("Training/evaluation parameters %s", args)
+
+     # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is
+     # set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running
+     # `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
+     if args.fp16:
+         try:
+             import apex
+
+             apex.amp.register_half_function(torch, "einsum")
+         except ImportError:
+             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+
+     # Training
+     if args.do_train:
+         train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
+         global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+     # Save the trained model and the tokenizer
+     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+         logger.info("Saving model checkpoint to %s", args.output_dir)
+         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+         # They can then be reloaded using `from_pretrained()`.
+         # Take care of distributed/parallel training
+         model_to_save = model.module if hasattr(model, "module") else model
+         model_to_save.save_pretrained(args.output_dir)
+         tokenizer.save_pretrained(args.output_dir)
+
+         # Good practice: save your training arguments together with the trained model
+         torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
+
+         # Load the trained model and vocabulary that you have fine-tuned
+         model = AutoModelForQuestionAnswering.from_pretrained(args.output_dir)  # , force_download=True)
+
+         # SquadDataset is not compatible with Fast tokenizers, which have smarter overflow handling,
+         # so we use use_fast=False here for now until Fast-tokenizer-compatible examples are out.
+         tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case, use_fast=False)
+         model.to(args.device)
+
+     # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
+     results = {}
+     if args.do_eval and args.local_rank in [-1, 0]:
+         if args.do_train:
+             logger.info("Loading checkpoints saved during training for evaluation")
+             checkpoints = [args.output_dir]
+             if args.eval_all_checkpoints:
+                 checkpoints = list(
+                     os.path.dirname(c)
+                     for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+                 )
+
+         else:
+             logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
+             checkpoints = [args.model_name_or_path]
+
+         logger.info("Evaluate the following checkpoints: %s", checkpoints)
+
+         for checkpoint in checkpoints:
+             # Reload the model
+             global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+             model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)  # , force_download=True)
+             model.to(args.device)
+
+             # Evaluate
+             result = evaluate(args, model, tokenizer, prefix=global_step)
+
+             result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
+             results.update(result)
+
+     logger.info("Results: {}".format(results))
+
+     return results
+
+
+ if __name__ == "__main__":
+     main()