Youssefk commited on
Commit
6d567db
1 Parent(s): d11da8a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +630 -1
app.py CHANGED
@@ -3,8 +3,637 @@ from streamlit_chat import message
3
  import requests
4
  from transformers import AutoModelWithLMHead, AutoTokenizer
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
7
- model = AutoModelWithLMHead.from_pretrained('model.py')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  st.set_page_config(
10
  page_title="COVID Doctor using DialoGPT",
 
3
  import requests
4
  from transformers import AutoModelWithLMHead, AutoTokenizer
5
 
6
+ # tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
7
+ # model = AutoModelWithLMHead.from_pretrained('model.py')
8
+
9
+ # -*- coding: utf-8 -*-
10
+
11
+ import pandas as pd
12
+
13
# Question/answer pairs describing the story; each tuple is (question, answer).
_QA_PAIRS = [
    (
        "What is the story about?",
        "The story is about the Phantom Thief Kid's heist of the Fabergé egg from the Suzuki Modern Art Museum, his apparent death after being shot, and the subsequent investigation to find his killer and recover the stolen egg.",
    ),
    (
        "Who is the Phantom Thief Kid?",
        "The Phantom Thief Kid is a notorious thief who specializes in stealing high-profile objects and has a signature calling card left at the scene of the crime.",
    ),
    (
        "What did the police deduce about Kid's next target?",
        "The police deduced that Kid's next target would be the recently discovered Fabergé egg, which would be displayed at the Suzuki Modern Art Museum in Osaka on August 22.",
    ),
    (
        "What happened during the heist?",
        "Kid successfully stole the egg and fled with Conan and Heiji in pursuit. However, an unknown assailant shot Kid in the right eye, causing him to fall into the sea and apparently die. The police recovered the egg but could not find Kid's body.",
    ),
    (
        "Who did Conan and Heiji suspect to be the person who shot Kid?",
        "Conan and Heiji initially suspected Sonoko's father's servant, Mr. Nishino, to be the person who shot Kid.",
    ),
    (
        "Who was murdered after the heist and how did it happen?",
        "Ryu Sagawa, a freelance photographer covering the press with news of the egg, was murdered after the heist. He was shot in the right eye in the same fashion as Kid.",
    ),
    (
        "Who did the police initially suspect for the murder and who did Conan conclude was the culprit?",
        "The police initially suspected Sonoko's father's servant, Mr. Nishino, for the murder, but Conan concluded that the culprit was Scorpion - a mysterious killer who always shoots his victims in the right eye.",
    ),
    (
        "What did Conan deduce about Scorpion's next target?",
        "Conan deduced that Scorpion's next target was the second egg, which was located at Yokosuka Castle.",
    ),
    (
        "Who did Conan suspect to be Scorpion and why?",
        "Conan suspected Scorpion to be Seiran the historian because she had a personal vendetta against Kid and had access to information about the second egg.",
    ),
    (
        "How did Conan prevent Seiran from killing him?",
        "Conan prevented Seiran from killing him by wearing bulletproof glass on his glasses, which caused the bullet to ricochet off.",
    ),
    (
        "Who did Conan suspect Shiratori to be?",
        "Conan suspected Shiratori to be Kid in disguise.",
    ),
    (
        "What happened at the end of the story?",
        "At the end of the story, Kid appeared disguised as Shinichi and distracted Ran while Conan was about to confess to her. Kid then disappeared in a flurry of pigeons.",
    ),
]

# Column-oriented view of the pairs, in the layout pandas expects.
data = {
    'Question': [question for question, _ in _QA_PAIRS],
    'Answer': [answer for _, answer in _QA_PAIRS],
}

df = pd.DataFrame(data)
40
+
41
+ # ! pip -q install transformers
42
+
43
+ from transformers import AutoModelWithLMHead, AutoTokenizer
44
+ import torch
45
+ import os
46
+
47
+
48
+ tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
49
+ model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-small")
50
+
51
+ """
52
+ Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
53
+ GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
54
+ using a masked language modeling (MLM) loss.
55
+ """
56
+
57
+ import glob
58
+ import logging
59
+ import os
60
+ import pickle
61
+ import random
62
+ import re
63
+ import shutil
64
+ from typing import Dict, List, Tuple
65
+ import json
66
+
67
+ import pandas as pd
68
+ import numpy as np
69
+ import torch
70
+
71
+ from sklearn.model_selection import train_test_split
72
+
73
+ from torch.nn.utils.rnn import pad_sequence
74
+ from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
75
+ from torch.utils.data.distributed import DistributedSampler
76
+ from tqdm.notebook import tqdm, trange
77
+
78
+ from pathlib import Path
79
+
80
+ from transformers import (
81
+ MODEL_WITH_LM_HEAD_MAPPING,
82
+ WEIGHTS_NAME,
83
+ AdamW,
84
+ AutoConfig,
85
+ AutoModelWithLMHead,
86
+ AutoTokenizer,
87
+ PreTrainedModel,
88
+ PreTrainedTokenizer,
89
+ get_linear_schedule_with_warmup,
90
+ )
91
+
92
+
93
+ try:
94
+ from torch.utils.tensorboard import SummaryWriter
95
+ except ImportError:
96
+ from tensorboardX import SummaryWriter
97
+
98
+ # Configs
99
+ logger = logging.getLogger(__name__)
100
+
101
+ MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
102
+ MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
103
+
104
+ # Args to allow for easy conversion of python script to notebook
105
class Args:
    """Fixed set of training/evaluation options used in place of a CLI parser.

    Every option listed in ``_DEFAULTS`` becomes an instance attribute with
    the given value when an ``Args`` object is created.
    """

    # Option name -> default value. All values are immutable, so sharing the
    # table between instances is safe.
    _DEFAULTS = {
        # Model, tokenizer and storage locations
        'output_dir': 'output-small-save',
        'model_type': 'gpt2',
        'model_name_or_path': 'microsoft/DialoGPT-small',
        'config_name': 'microsoft/DialoGPT-small',
        'tokenizer_name': 'microsoft/DialoGPT-small',
        'cache_dir': 'cached',
        'block_size': 512,
        # Which phases to run
        'do_train': True,
        'do_eval': True,
        'evaluate_during_training': False,
        # Batch sizes and optimisation hyper-parameters
        'per_gpu_train_batch_size': 4,
        'per_gpu_eval_batch_size': 4,
        'gradient_accumulation_steps': 1,
        'learning_rate': 5e-5,
        'weight_decay': 0.0,
        'adam_epsilon': 1e-8,
        'max_grad_norm': 1.0,
        'num_train_epochs': 3,
        'max_steps': -1,
        'warmup_steps': 0,
        # Logging and checkpointing cadence
        'logging_steps': 1000,
        'save_steps': 3500,
        'save_total_limit': None,
        'eval_all_checkpoints': False,
        # Runtime switches
        'no_cuda': False,
        'overwrite_output_dir': True,
        'overwrite_cache': True,
        'should_continue': False,
        'seed': 42,
        'local_rank': -1,
        'fp16': False,
        'fp16_opt_level': 'O1',
    }

    def __init__(self):
        for option, value in self._DEFAULTS.items():
            setattr(self, option, value)
139
+
140
+ args = Args()
141
+
142
+ df.head()
143
+
144
def construct_conv(row, tokenizer, eos = True):
    """Flatten one conversation row into a single list of token ids.

    Each element of ``row`` is encoded with ``tokenizer.encode``; the turns
    are then emitted in the REVERSE of their order in ``row`` and
    concatenated into one flat list.

    Args:
        row: iterable of text turns (e.g. one DataFrame row).
        tokenizer: object providing ``encode(text)`` and ``eos_token_id``.
        eos: when True (the default) ``tokenizer.eos_token_id`` is appended
            after every turn. Previously this flag was accepted but ignored.

    Returns:
        A flat list of token ids; empty when ``row`` is empty.
    """
    turn_suffix = [tokenizer.eos_token_id] if eos else []
    encoded_turns = [tokenizer.encode(turn) + turn_suffix for turn in row]
    return [token for turn in reversed(encoded_turns) for token in turn]
149
+
150
class ConversationDataset(Dataset):
    """Dataset of tokenised conversations, one flat token-id list per DataFrame row.

    Features are built with ``construct_conv`` and pickled to a cache file
    under ``args.cache_dir``; the cache is reused when it exists and
    ``args.overwrite_cache`` is false.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):
        """Build the examples from ``df`` or load them from the cache file.

        Args:
            tokenizer: tokenizer used to encode every turn.
            args: options object; ``cache_dir``, ``model_type`` and
                ``overwrite_cache`` are read here.
            df: DataFrame whose rows are conversations.
            block_size: nominal block size; after subtracting the tokenizer's
                special-token overhead it is only used in the cache file name.
        """
        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )
        # NOTE(review): the cache name does not encode which DataFrame was
        # used, so with overwrite_cache=False different splits would share
        # one cache file -- confirm this is intended before disabling overwrite.

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            # NOTE: pickle.load can execute arbitrary code; only point
            # cache_dir at a location you trust.
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = [construct_conv(row, tokenizer) for _, row in df.iterrows()]

            logger.info("Saving features into cached file %s", cached_features_file)
            # The cache directory may not exist yet; without this the open()
            # below raises FileNotFoundError.
            if directory:
                os.makedirs(directory, exist_ok=True)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Return the number of conversations."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return conversation ``item`` as a 1-D ``torch.long`` tensor."""
        return torch.tensor(self.examples[item], dtype=torch.long)
181
+
182
+ # Caching and storing of data/checkpoints
183
+
184
def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    """Wrap the evaluation frame (``evaluate=True``) or the training frame in a ConversationDataset."""
    if evaluate:
        frame = df_val
    else:
        frame = df_trn
    return ConversationDataset(tokenizer, args, frame)
186
+
187
+
188
def set_seed(args):
    """Seed Python's, NumPy's and PyTorch's random generators with ``args.seed``.

    The CUDA generators are seeded as well when ``args.n_gpu`` is positive.
    """
    seed = args.seed
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
194
+
195
+
196
+ def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
197
+ ordering_and_checkpoint_path = []
198
+
199
+ glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))
200
+
201
+ for path in glob_checkpoints:
202
+ if use_mtime:
203
+ ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
204
+ else:
205
+ regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
206
+ if regex_match and regex_match.groups():
207
+ ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
208
+
209
+ checkpoints_sorted = sorted(ordering_and_checkpoint_path)
210
+ checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
211
+ return checkpoints_sorted
212
+
213
+
214
def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    """Delete the oldest checkpoints so that at most ``args.save_total_limit`` remain.

    Nothing is deleted when the limit is unset, zero or negative, or when the
    number of checkpoints does not exceed it.
    """
    limit = args.save_total_limit
    if not limit or limit <= 0:
        return

    # Oldest first, so the surplus to drop sits at the front of the list.
    ordered = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    surplus = len(ordered) - limit
    if surplus <= 0:
        return

    for checkpoint in ordered[:surplus]:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)
230
+
231
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer,
          df_trn=None, df_val=None) -> Tuple[int, float]:
    """Fine-tune ``model`` on ``train_dataset`` with a causal-LM loss.

    Args:
        args: options object; ``train_batch_size`` is written onto it here.
        train_dataset: dataset whose items are 1-D token-id tensors.
        model: model to train, called as ``model(inputs, labels=labels)``.
        tokenizer: tokenizer saved next to every checkpoint and used to pick
            the padding value.
        df_trn: optional training frame, forwarded to ``evaluate`` only.
        df_val: optional validation frame; needed for
            ``args.evaluate_during_training`` to have any effect.

    Returns:
        ``(global_step, average_loss)``; the average is 0.0 when no optimizer
        step was taken.
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Right-pad every example in the batch to the longest one.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )

    # Optimizer steps per epoch; clamped to 1 where it is used as a divisor so
    # a tiny dataset (drop_last may leave zero batches) cannot divide by zero.
    steps_per_epoch = len(train_dataloader) // args.gradient_accumulation_steps
    safe_steps_per_epoch = max(1, steps_per_epoch)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // safe_steps_per_epoch + 1
    else:
        t_total = steps_per_epoch * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // safe_steps_per_epoch
            steps_trained_in_current_epoch = global_step % safe_steps_per_epoch

            logger.info(" Continuing training from checkpoint, will skip to saved global_step")
            logger.info(" Continuing training from epoch %d", epochs_trained)
            logger.info(" Continuing training from global step %d", global_step)
            logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            # The path does not end in "-<step>": this is a fresh fine-tuning run.
            logger.info(" Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            # Batches longer than 1024 tokens are skipped rather than truncated.
            if inputs.shape[1] > 1024:
                continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        if df_val is None:
                            logger.warning(
                                "evaluate_during_training is set but no validation data was passed to train(); skipping."
                            )
                        else:
                            # evaluate() requires both frames; the old call omitted them.
                            results = evaluate(args, model, tokenizer, df_trn, df_val)
                            for key, value in results.items():
                                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    # get_last_lr() replaces the deprecated get_lr() for reading the current rate.
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # No optimizer step may have run (empty dataloader, every batch skipped).
    average_loss = tr_loss / global_step if global_step else 0.0
    return global_step, average_loss
422
+
423
+ # Evaluation of some model
424
+
425
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Dict:
    """Compute the perplexity of ``model`` on the validation frame.

    Args:
        args: options object; ``eval_batch_size`` is written onto it here.
        model: model to score, called as ``model(inputs, labels=labels)``.
        tokenizer: tokenizer used to build the dataset and pick the padding value.
        df_trn: training frame (only forwarded to ``load_and_cache_examples``).
        df_val: validation frame that is actually evaluated.
        prefix: sub-directory of ``args.output_dir`` that receives
            ``eval_results.txt``; also shown in the log messages.

    Returns:
        ``{"perplexity": tensor}``. The value is NaN when the dataloader
        produced no batch.
    """
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Right-pad every example in the batch to the longest one.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    # NOTE: drop_last=True means a trailing partial batch is not scored.
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True
    )

    # multi-gpu evaluate; do not wrap a model that is already wrapped
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    if nb_eval_steps == 0:
        # Fewer examples than one full batch: report NaN instead of dividing by zero.
        logger.warning("No evaluation batch was produced; perplexity is undefined.")
        eval_loss = float("nan")
    else:
        eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    # The prefix sub-directory is not guaranteed to exist yet.
    os.makedirs(os.path.dirname(output_eval_file), exist_ok=True)
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
480
+
481
def main(df_trn, df_val):
    """Fine-tune the model on ``df_trn``, save it, and evaluate it on ``df_val``.

    All options come from a fresh ``Args()`` instance. The device is CUDA
    when it is available and ``args.no_cuda`` is false, otherwise CPU.

    Args:
        df_trn: training conversations.
        df_val: validation conversations.

    Returns:
        Dict of evaluation results keyed ``"<metric>_<global_step>"``; empty
        when evaluation is disabled.

    Raises:
        ValueError: if ``should_continue`` is set but no checkpoint exists, or
            if the output directory is non-empty and may not be overwritten.
    """
    args = Args()

    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU & distributed training.
    # Fall back to CPU instead of failing on hosts without a GPU, and honour no_cuda.
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count() if use_cuda else 0
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelWithLMHead.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = [
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            ]
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # The step suffix is only meaningful when several checkpoints are compared.
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelWithLMHead.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = {k + "_{}".format(global_step): v for k, v in result.items()}
            results.update(result)

    return results
589
+
590
# construct_conv() emits the turns of each row in REVERSE column order, so the
# reply column has to come first for the training sequence to read
# "context <eos> response <eos>" (question, then answer).
df = df.rename(columns={'Question': 'context', 'Answer': 'response'})[['response', 'context']]

# The same frame is used for training and evaluation.
main(df, df)
596
+
597
# Smoke test: generate one reply from the fine-tuned model for a fixed prompt.
test_chatbot = []
text = "Hello"
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelWithLMHead.from_pretrained('output-small-save')
# Encode the user turn, terminated by the EOS token.
bot_input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors='pt')
print("Patient: {} \n".format(text))
print("Reference: {} \n".format(text))

# Generate a response; prompt plus reply are capped at 100 tokens.
chat_history_ids = model.generate(
    bot_input_ids, max_length=100,
    pad_token_id=tokenizer.eos_token_id,
    no_repeat_ngram_size=3,
    do_sample=True,
    top_k=10,
    top_p=0.7,
    temperature = 0.8
)

# Decode only the newly generated tokens (everything after the prompt), once.
reply = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
# Use print() rather than st.write(): st.set_page_config() further down has to
# be the first Streamlit command executed by the script.
print("Predict: {} \n\n".format(reply))
test_chatbot.append(reply)

print(len(test_chatbot))
624
+
625
+
626
+
627
+
628
+
629
+
630
+
631
+
632
+
633
+
634
+
635
+ ####################################
636
+ ############Streamlit###############
637
 
638
  st.set_page_config(
639
  page_title="COVID Doctor using DialoGPT",