gorkemgoknar committed
Commit bdfd237
1 Parent(s): 2f7b134

Delete model.py

Files changed (1)
    model.py  +0  -500
model.py DELETED
@@ -1,500 +0,0 @@
- from transformers import AutoConfig
- from transformers import GPT2Tokenizer, GPT2LMHeadModel
-
- from utils import SPECIAL_TOKENS, build_input_from_segments, add_special_tokens_
- from utils import get_dataset, download_pretrained_model
-
- import timeit
-
- import logging
- logging.basicConfig(format='%(asctime)s: %(message)s',level=logging.INFO)
- logger = logging.getLogger(__file__)
-
- import random
-
- from itertools import chain
- from pprint import pformat
- #import warnings
-
- import torch
- import torch.nn.functional as F
-
- import boto3
- import os
- import tarfile
- import io
- import base64
- import json
- import re
-
- from types import SimpleNamespace
-
- import warnings
- warnings.simplefilter(action='ignore', category=FutureWarning)
-
- print("Loading Model.py module...")
-
- s3 = boto3.client('s3')
-
-
- def is_list_of_strings(lst):
-     if lst and isinstance(lst, list):
-         return all(isinstance(elem, str) for elem in lst)
-     else:
-         return False
-
-
-
- class ServerlessModel:
-     def __init__(self, model_path=None, s3_bucket=None, file_prefix=None, efs_path=None):
-         #logging.basicConfig(level=logging.INFO)
-         #logger = logging.getLogger(__file__)
-         print("Trying to init model")
-
-         self.model = None
-         self.tokenizer = None
-         self.dataset = None
-
-         if s3_bucket is None:
-             if model_path is not None and efs_path is None:
-                 print("Loading model from local..")
-                 self.model, self.tokenizer, self.dataset = self.from_pretrained_local_path(model_path, file_prefix)
-                 logging.debug("Done loading")
-             else:
-                 ##Load model from EFS, with config and tokenizer from local lambda space
-                 if model_path is not None and efs_path is not None:
-                     print("loading model from EFS")
-                     self.model, self.tokenizer, self.dataset = self.from_pretrained(model_path, s3_bucket, file_prefix, efs_path=efs_path)
-                     logging.debug("Done loading")
-                 else:
-                     #no bucket, no path: fail
-                     print("ERROR: Model path not found")
-                     raise Exception("No model path found")
-
-         else:
-             print("Loading model from s3 path..")
-             print(s3_bucket)
-             self.model, self.tokenizer, self.dataset = self.from_pretrained(
-                 model_path, s3_bucket, file_prefix)
-             logging.debug("Done loading")
-
-
-         self.parameters = {
-             'max_length' : 25, #60
-             'min_length' : 1,
-             'device' : 'cpu',
-             'temperature' : 1.0, #1.5
-             'dynamic_temperature' : True,
-             'dynamic_temperature_range' : 0.15,
-             'top_k' : 50, #50
-             'top_p' : 0.9, #0.9
-             'no_sample' : False,
-             'max_history' : 2,
-
-         }
-
-         print("Done initializing model")
-
-
-     def from_pretrained(self, model_path: str, s3_bucket: str, file_prefix: str, efs_path=None):
-
-         if efs_path is None:
-             model = self.load_model_from_s3(model_path, s3_bucket, file_prefix)
-         else:
-             model = self.load_model_from_efs(model_path, efs_path)
-
-         print("Model loaded.")
-         print("loading tokenizer from path: ", model_path)
-
-         tokenizer = self.load_tokenizer(model_path)
-         # Get sequence length max of 1024
-         tokenizer.model_max_length = 1024
-         print("tokenizer loaded")
-
-         self.model = model
-         self.tokenizer = tokenizer
-
-         add_special_tokens_(self.model, self.tokenizer)
-
-         #Will only be used if it cannot find the cache
-         DATASET_PATH = model_path + '/personafile.json' #may not be needed if cache exists!
-
-         ##We have cache, no need for dataset path
-         DATASET_CACHE = model_path + '/persona_good' ##persona_good_gpt2_cache (no zip extension)
-
-
-         dataset = self.load_dataset(DATASET_PATH, DATASET_CACHE)
-         self.dataset = dataset
-
-         print("dataset loaded")
-         model.eval()
-         print("Model in eval mode, dataset and tokenizer also loaded")
-         return model, tokenizer, dataset
-
-     def load_model_from_path(self, model_path: str):
-         print("Loading model from path:", model_path)
-         model = GPT2LMHeadModel.from_pretrained(model_path)
-         model.eval()
-         self.model = model
-         return model
-
-
-     def from_pretrained_local_path(self, model_path: str, file_prefix: str):
-         print("Local model loading...")
-         model = GPT2LMHeadModel.from_pretrained(model_path)
-         tokenizer = self.load_tokenizer(model_path)
-
-         self.model = model
-         self.tokenizer = tokenizer
-
-         # Get sequence length max of 1024
-         tokenizer.model_max_length = 1024
-         add_special_tokens_(model, tokenizer)
-
-
-         #Will only be used if it cannot find the cache
-         DATASET_PATH = model_path + '/personafile.json' #may not be needed if cache exists!
-
-         ##We have cache, no need for dataset path
-         DATASET_CACHE = model_path + '/persona_good' ##persona_good_gpt2_cache (no zip extension)
-
-         dataset = self.load_dataset(DATASET_PATH, DATASET_CACHE)
-
-         self.dataset = dataset
-
-
-         model.eval()
-         print("Model in eval mode, dataset and tokenizer also loaded")
-         return model, tokenizer, dataset
-
-     def load_model_from_efs(self, model_path: str, efs_path: str):
-         if model_path and efs_path:
-             config = AutoConfig.from_pretrained(f'{model_path}/config.json')
-             with open(efs_path, 'rb') as f:
-                 # state messes things, just use classics!
-                 state = torch.load(io.BytesIO(
-                     f.read()), map_location=lambda storage, loc: storage)
-
-             '''alt
-             with open(efs_path, 'rb') as f:
-                 state = pickle.load(f, encoding='latin1')
-             '''
-             model = GPT2LMHeadModel.from_pretrained(
-                 pretrained_model_name_or_path=None, state_dict=state, config=config)
-             return model
-         else:
-             raise KeyError('No model config path or EFS bin path')
-
-
-     def load_model_from_s3(self, model_path: str, s3_bucket: str, file_prefix: str):
-         if model_path and s3_bucket and file_prefix:
-             obj = s3.get_object(Bucket=s3_bucket, Key=file_prefix)
-             bytestream = io.BytesIO(obj['Body'].read())
-
-             tar = tarfile.open(fileobj=bytestream, mode="r:gz")
-             config = AutoConfig.from_pretrained(f'{model_path}/config.json')
-             for member in tar.getmembers():
-                 if member.name.startswith("./._"):
-                     # osx tar adds ./._XXX copyfile entries, need to skip these files
-                     continue
-                 if member.name.endswith(".bin"):
-                     f = tar.extractfile(member)
-                     print("Model file extracted: " + member.name)
-
-                     # state messes things, just use classics!
-                     state = torch.load(io.BytesIO(
-                         f.read()), map_location=lambda storage, loc: storage)
-                     model = GPT2LMHeadModel.from_pretrained(
-                         pretrained_model_name_or_path=None, state_dict=state, config=config)
-                     #model = AutoModelWithLMHead.from_pretrained("./", config=config)
-
-
-             return model
-         else:
-             raise KeyError('No S3 Bucket and Key Prefix provided')
-
-     def load_tokenizer(self, model_path: str):
-         print("loading tokenizer")
-         tokenizer = GPT2Tokenizer.from_pretrained(model_path)
-         return tokenizer
-
-     def load_dataset(self, DATASET_PATH: str, DATASET_CACHE: str, use_efs=False):
-         print("loading dataset")
-         dataset = get_dataset(self.tokenizer, DATASET_PATH, DATASET_CACHE)
-         return dataset
-
-     def encode(self, question, context):
-         encoded = self.tokenizer.encode_plus(question, context)
-         return encoded["input_ids"], encoded["attention_mask"]
-
-     def decode(self, token):
-         answer_tokens = self.tokenizer.convert_ids_to_tokens(
-             token, skip_special_tokens=True)
-         return self.tokenizer.convert_tokens_to_string(answer_tokens)
-
-     def generate_word(self, text, model=None, tokenizer=None, noprint=False):
-         if model is None or tokenizer is None:
-             print("ERROR: No model or tokenizer")
-             return None
-
-         inputs = tokenizer(text, return_tensors="pt")
-
-         # model output
-         outputs = model(**inputs, labels=inputs["input_ids"])
-         loss, logits = outputs[:2]
-         predicted_index = torch.argmax(logits[0, -1, :]).item()
-         predicted_text = tokenizer.decode([predicted_index])
-
-         # results
-         if not noprint:
-             print('input text:', text)
-             print('predicted text:', predicted_text)
-
-         return predicted_text
-
-
-     def top_filtering(self, logits, top_k=0., top_p=0.9, threshold=-float('Inf'), filter_value=-float('Inf')):
-         """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering
-         Args:
-             logits: logits distribution shape (vocabulary size)
-             top_k: <=0: no filtering, >0: keep only top k tokens with highest probability.
-             top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset
-                 whose total probability mass is greater than or equal to the threshold top_p.
-                 In practice, we select the highest probability tokens whose cumulative probability mass exceeds
-                 the threshold top_p.
-             threshold: a minimal threshold to keep logits
-         """
-         assert logits.dim() == 1  # Only works for batch size 1 for now - could update, but it would obfuscate the code a bit
-         top_k = min(top_k, logits.size(-1))
-         if top_k > 0:
-             # Remove all tokens with a probability less than the last token in the top-k tokens
-             indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
-             logits[indices_to_remove] = filter_value
-
-         if top_p > 0.0:
-             # Compute cumulative probabilities of sorted tokens
-             sorted_logits, sorted_indices = torch.sort(logits, descending=True)
-             cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
-
-             # Remove tokens with cumulative probability above the threshold
-             sorted_indices_to_remove = cumulative_probabilities > top_p
-             # Shift the indices to the right to keep also the first token above the threshold
-             sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
-             sorted_indices_to_remove[..., 0] = 0
-
-             # Back to unsorted indices and set them to -infinity
-             indices_to_remove = sorted_indices[sorted_indices_to_remove]
-             logits[indices_to_remove] = filter_value
-
-         indices_to_remove = logits < threshold
-         logits[indices_to_remove] = filter_value
-
-         return logits
-
-
-     def sample_sequence(self, personality, history, tokenizer, model, params=None, current_output=None):
-
-         start = timeit.default_timer()
-
-
-         if params is not None:
-
-             for k, v in params.items():
-                 self.parameters[k] = v
-
-         ##to access as dot notation
-         ##param = SimpleNamespace(**parameters)
-
-         special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
-         if current_output is None:
-             current_output = []
-
-         for i in range(self.parameters['max_length']):
-
-             #print(">: {}/{} ".format(i, self.parameters['max_length'] ) ,end='\r', flush=True)
-
-             instance = build_input_from_segments(personality, history, current_output, tokenizer, with_eos=False)
-
-             input_ids = torch.tensor(instance["input_ids"], device=self.parameters['device']).unsqueeze(0)
-             token_type_ids = torch.tensor(instance["token_type_ids"], device=self.parameters['device']).unsqueeze(0)
-
-             logits = model(input_ids, token_type_ids=token_type_ids)
-             if isinstance(logits, tuple):  # for gpt2 and maybe others
-                 logits = logits[0]
-
-             #SPECIAL Dynamic Temperature mode
-             if self.parameters['dynamic_temperature']:
-                 #random temperature within -0.1 / +0.1, or 'dynamic_temperature_range'
-                 rand_range = random.uniform(-1 * self.parameters['dynamic_temperature_range'], self.parameters['dynamic_temperature_range'])
-                 temperature = self.parameters['temperature'] + rand_range
-             else:
-                 temperature = self.parameters['temperature']
-
-             logits = logits[0, -1, :] / temperature
-
-             logits = self.top_filtering(logits, top_k=self.parameters['top_k'], top_p=self.parameters['top_p'])
-
-             probs = F.softmax(logits, dim=-1)
-
-             prev = torch.topk(probs, 1)[1] if self.parameters['no_sample'] else torch.multinomial(probs, 1)
-             if i < self.parameters['min_length'] and prev.item() in special_tokens_ids:
-                 while prev.item() in special_tokens_ids:
-                     if probs.max().item() == 1:
-                         warnings.warn("Warning: model generating special token with probability 1.")
-                         break  # avoid infinitely looping over special token
-                     prev = torch.multinomial(probs, num_samples=1)
-
-             if prev.item() in special_tokens_ids:
-                 ##breaks here if found end of answer!!
-                 break
-             current_output.append(prev.item())
-
-
-         stop = timeit.default_timer()
-         #print(f"\nPredict in {stop - start} seconds\n")
-
-         return current_output
-
-
-     def dump_personalities_with_movies(self):
-         personalities = [ [dialog["name"], dialog["moviename"]] for dialog in self.dataset["train"]]
-         name_list = []
-         for person in personalities:
-             try:
-                 name_tokenized = person[0]
-                 name = self.tokenizer.decode(name_tokenized)
-                 movies_tokenized = person[1]
-                 movienames = ""
-                 ##check type of first element
-                 ##if int, only 1 movie
-                 if isinstance(movies_tokenized[0], int):
-                     movienames = self.tokenizer.decode(movies_tokenized)
-                     movienames = movienames.replace(".txt", "")
-                 else:
-                     for movie in movies_tokenized:
-                         moviename = self.tokenizer.decode(movie)
-                         moviename = moviename.replace(".txt", "")
-                         movienames = movienames + " / " + moviename
-                 name_list.append([name, movienames])
-             except:
-                 print("Could not do name:", self.tokenizer.decode(person[0]))
-
-         return name_list
-
-
-
-
-     def dump_personalities(self, as_list=False):
-         personalities = [dialog["personality"] for dialog in self.dataset["train"]]
-         name_list = []
-         for person in personalities:
-             name_tokenized = person[-1]
-             name = self.tokenizer.decode(name_tokenized)
-             name = name.replace("My name is ", "")[:-1]
-             name_list.append(name)
-             #print(name)
-
-         if as_list:
-             return name_list
-         else:
-             return " | ".join(name_list)
-
-
-     def get_personalities(self):
-         ##THIS FUNCTION IS NOW LEGACY, USE dump_personalities
-         personalities = [dialog["personality"] for dialog in self.dataset["train"]]
-
-         people = [item[-1][-10:-1] for item in personalities]
-         ##will get My Name is Something
-         people_list = self.tokenizer.decode(chain(*people))
-
-         #print( " | ".join( people_list.split(" ") ) )
-         text_to_remove = "My name is "
-         people_list = people_list.replace(text_to_remove, " | ")
-
-
-         #characters = " | ".join( people_list.split(" ") )
-
-         return people_list
-
-     def select_personality(self, characters, select_random=False):
-         ##FIND people list
-         ##this is for debug, usually has " is Name"
-         #people = [item[-1][-3:-1] for item in personalities]
-         personalities = [dialog["personality"] for dialog in self.dataset["train"]]
-
-         if select_random: return random.choice(personalities)
-
-
-         #people = [item[-1][-2:-1] for item in personalities]
-         #people_list = self.tokenizer.decode(chain(*people))
-         #print( " | ".join( people_list.split(" ") ) )
-
-         personality = None
-
-         name = "My name is " + str(characters)
-         name_token = self.tokenizer.encode(name)
-         #print(name_token)
-         index_start = len(name_token) + 1
-
-         try:
-
-             index_of_name = [ item[-1][-1*index_start: -1] == name_token for item in personalities].index(True)
-
-             #print("Selected {} is at: {}".format(characters, str(index_of_name) ) )
-             personality = personalities[index_of_name]
-         except:
-             print("Not found ... Select again")
-             return None
-
-         ##TALK TO HAL
-         #personality_hal = ["that's true. My name is Hal"]
-         #personality = tokenize(personality_hal)
-         #print(personality)
-
-         print("Selected personality:", self.tokenizer.decode(chain(*personality)))
-
-         return personality
-
-
-
-     def get_answer(self, input_text, personality, history, params=None):
-
-         ##Check length of history (to save 1 computation!)
-         if len(history) > 0:
-             #mostly it will be an empty list, so a length check is needed for performance
-             #would do a string check also, but just assume it is a list of lists of strings, as not public
-
-             new_hist = []
-             for ele in history:
-                 new_hist.append( self.tokenizer.encode(ele) )
-             history = new_hist.copy()
-
-         history.append(self.tokenizer.encode(input_text))
-
-         with torch.no_grad():
-             out_ids = self.sample_sequence(personality, history, self.tokenizer, self.model, params=params)
-         history.append(out_ids)
-         history = history[-(2*self.parameters['max_history']+1):]
-         out_text = self.tokenizer.decode(out_ids, skip_special_tokens=True)
-         #print(out_text)
-
-
-         history_decoded = []
-         for ele in history:
-             history_decoded.append(self.tokenizer.decode(ele))
-
-         return out_text, history_decoded, self.parameters
-
-
-
-     def predict(self, question, parameter_dict):
-         try:
-             answer = self.generate_text(question, model=self.model,
-                                         tokenizer=self.tokenizer,
-                                         parameter_dict=parameter_dict,
-                                         )
-             return answer
-         except Exception as e:
-             raise Exception(
-                 "Runtime error see cloudwatch logs : {}".format(repr(e)))