pseudotensor committed on
Commit: 8cb62ff
Parent(s): 32c203b

Update with h2oGPT hash ed82638ef67fed048ff93c8a1048c32dfd28ef84

Files changed (3)
  1. app.py +0 -1158
  2. app.py +1 -0
  3. generate.py +1158 -0
app.py DELETED
@@ -1,1158 +0,0 @@
1
- import functools
2
- import sys
3
- import os
4
- import traceback
5
- import typing
6
-
7
- from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, KThread, wrapped_partial
8
-
9
- SEED = 1236
10
- set_seed(SEED)
11
-
12
- os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
13
- from typing import Union
14
- import numpy as np
15
- import pandas as pd
16
-
17
- import fire
18
- import torch
19
- from peft import PeftModel
20
- from transformers import GenerationConfig, StoppingCriteriaList, AutoModel, TextIteratorStreamer
21
- from accelerate import init_empty_weights, infer_auto_device_map
22
-
23
- from prompter import Prompter
24
-
25
- from finetune import get_loaders, example_data_points, generate_prompt, human, bot, inv_prompt_type_to_model_lower
26
- from stopping import StoppingCriteriaSub
27
-
28
- eval_extra_columns = ['prompt', 'response', 'score']
29
-
30
-
31
- def main(
32
- load_8bit: bool = False,
33
- load_half: bool = True,
34
- infer_devices: bool = True, # really if to "control" devices now
35
- base_model: str = '',
36
- tokenizer_base_model: str = '',
37
- lora_weights: str = "",
38
- gpu_id: int = 0, # if infer_devices = True and gpu_id != -1
39
-
40
- prompt_type: Union[int, str] = None,
41
- # input to generation
42
- temperature: float = None,
43
- top_p: float = None,
44
- top_k: int = None,
45
- num_beams: int = None,
46
- repetition_penalty: float = None,
47
- num_return_sequences: int = None,
48
- do_sample: bool = None,
49
- max_new_tokens: int = None,
50
- min_new_tokens: int = None,
51
- early_stopping: Union[bool, str] = None,
52
- max_time: float = None,
53
-
54
- debug: bool = False,
55
- save_dir: str = None,
56
- share: bool = True,
57
- local_files_only: bool = False,
58
- resume_download: bool = True,
59
- use_auth_token: Union[str, bool] = False, # True requires CLI did huggingface-cli login before running
60
-
61
- src_lang: str = "English",
62
- tgt_lang: str = "Russian",
63
-
64
- gradio: bool = True,
65
- gradio_avoid_processing_markdown: bool = False,
66
- chat: bool = True,
67
- chat_history: int = 4096, # character length of chat context/history
68
- chat_context: bool = False, # use default context if human_bot
69
- stream_output: bool = True,
70
- show_examples: bool = None,
71
- verbose: bool = False,
72
- h2ocolors: bool = True,
73
- height: int = 400,
74
- show_lora: bool = True,
75
- # set to True to load --base_model after client logs in,
76
- # to be able to free GPU memory when model is swapped
77
- login_mode_if_model0: bool = False,
78
- block_gradio_exit: bool = True,
79
- concurrency_count: int = 1,
80
- api_open: bool = False, # don't let API skip queue
81
- allow_api: bool = True,
82
- input_lines: int = 1,
83
-
84
- sanitize_user_prompt: bool = True,
85
- sanitize_bot_response: bool = True,
86
-
87
- extra_model_options: typing.List[str] = [],
88
- extra_lora_options: typing.List[str] = [],
89
-
90
- score_model: str = 'OpenAssistant/reward-model-deberta-v3-large-v2',
91
- auto_score: bool = True,
92
-
93
- eval_sharegpt_prompts_only: int = 0,
94
- eval_sharegpt_prompts_only_seed: int = 1234,
95
- eval_sharegpt_as_output: bool = False,
96
-
97
- hard_stop_list: typing.List[str] = [],
98
- ):
99
- is_hf = bool(os.getenv("HUGGINGFACE_SPACES"))
100
- is_gpth2oai = bool(os.getenv("GPT_H2O_AI"))
101
- is_public = is_hf or is_gpth2oai # multi-user case with fixed model and disclaimer
102
- is_low_mem = is_hf # assumes run on 24GB consumer GPU
103
- admin_pass = os.getenv("ADMIN_PASS")
104
- # will sometimes appear in UI or sometimes actual generation, but maybe better than empty result
105
- # but becomes unrecoverable sometimes if raise, so just be silent for now
106
- raise_generate_gpu_exceptions = not is_public
107
-
108
- # allow set token directly
109
- use_auth_token = os.environ.get("HUGGINGFACE_API_TOKEN", use_auth_token)
110
-
111
- if is_public:
112
- input_lines = 1 # ensure set, for ease of use
113
- temperature = 0.4
114
- top_p = 0.85
115
- top_k = 70
116
- do_sample = True
117
- if is_low_mem:
118
- base_model = 'h2oai/h2ogpt-oasst1-512-12b'
119
- load_8bit = True
120
- else:
121
- base_model = 'h2oai/h2ogpt-oasst1-512-20b'
122
- if is_low_mem:
123
- load_8bit = True
124
- if is_hf:
125
- # must override share if in spaces
126
- share = False
127
- save_dir = os.getenv('SAVE_DIR', save_dir)
128
- score_model = os.getenv('SCORE_MODEL', score_model)
129
- if score_model == 'None':
130
- score_model = ''
131
- concurrency_count = int(os.getenv('CONCURRENCY_COUNT', concurrency_count))
132
- api_open = bool(int(os.getenv('API_OPEN', api_open)))
133
- allow_api = bool(int(os.getenv('ALLOW_API', allow_api)))
134
-
135
- n_gpus = torch.cuda.device_count()
136
-
137
- # get defaults
138
- model_lower = base_model.lower()
139
- if not gradio:
140
- # force, else not single response like want to look at
141
- stream_output = False
142
- # else prompt removal can mess up output
143
- chat = False
144
-
145
- placeholder_instruction, placeholder_input, \
146
- stream_output, show_examples, \
147
- prompt_type, temperature, top_p, top_k, num_beams, \
148
- max_new_tokens, min_new_tokens, early_stopping, max_time, \
149
- repetition_penalty, num_return_sequences, \
150
- do_sample, \
151
- src_lang, tgt_lang, \
152
- examples, \
153
- task_info = \
154
- get_generate_params(model_lower, chat,
155
- stream_output, show_examples,
156
- prompt_type, temperature, top_p, top_k, num_beams,
157
- max_new_tokens, min_new_tokens, early_stopping, max_time,
158
- repetition_penalty, num_return_sequences,
159
- do_sample,
160
- )
161
-
162
- if not gradio:
163
- if eval_sharegpt_prompts_only > 0:
164
- # override default examples with shareGPT ones for human-level eval purposes only
165
- eval_filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
166
- if not os.path.isfile(eval_filename):
167
- os.system(
168
- 'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % eval_filename)
169
- import json
170
- data = json.load(open(eval_filename, 'rt'))
171
- # focus on data that starts with human, else likely chopped from other data
172
- turn_start = 0 # odd in general
173
- data = [x for x in data if len(x['conversations']) > turn_start + 1 and
174
- x['conversations'][turn_start]['from'] == 'human' and
175
- x['conversations'][turn_start + 1]['from'] == 'gpt']
176
- np.random.seed(eval_sharegpt_prompts_only_seed)
177
- example1 = examples[-1] # pick reference example
178
- examples = []
179
- responses = []
180
- for i in list(np.random.randint(0, len(data), size=eval_sharegpt_prompts_only)):
181
- assert data[i]['conversations'][turn_start]['from'] == 'human'
182
- instruction = data[i]['conversations'][turn_start]['value']
183
- assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
184
- output = data[i]['conversations'][turn_start + 1]['value']
185
- examplenew = example1.copy()
186
- assert not chat, "Non-gradio eval must use chat=False (uses nochat instruct)"
187
- examplenew[eval_func_param_names.index('instruction_nochat')] = instruction
188
- examplenew[eval_func_param_names.index('iinput_nochat')] = '' # no input
189
- examplenew[eval_func_param_names.index('context')] = get_context(chat_context, prompt_type)
190
- examples.append(examplenew)
191
- responses.append(output)
192
-
193
- num_examples = len(examples)
194
- scoring_path = 'scoring'
195
- os.makedirs(scoring_path, exist_ok=True)
196
- if eval_sharegpt_as_output:
197
- used_base_model = 'gpt35'
198
- used_lora_weights = ''
199
- else:
200
- used_base_model = str(base_model.split('/')[-1])
201
- used_lora_weights = str(lora_weights.split('/')[-1])
202
- eval_filename = "df_scores_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_sharegpt_prompts_only,
203
- eval_sharegpt_prompts_only_seed,
204
- eval_sharegpt_as_output,
205
- used_base_model,
206
- used_lora_weights)
207
- eval_filename = os.path.join(scoring_path, eval_filename)
208
-
209
- # torch.device("cuda") leads to cuda:x cuda:y mismatches for multi-GPU consistently
210
- context_class = NullContext() if n_gpus > 1 else torch.device("cuda")
211
-
212
- with context_class:
213
- # ensure was set right above before examples generated
214
- assert not stream_output, "stream_output=True does not make sense with example loop"
215
- import time
216
- from functools import partial
217
-
218
- # get score model
219
- smodel, stokenizer, sdevice = get_score_model(**locals())
220
-
221
- if not eval_sharegpt_as_output:
222
- model, tokenizer, device = get_model(**locals())
223
- model_state = [model, tokenizer, device, base_model]
224
- fun = partial(evaluate, model_state, debug=debug, save_dir=save_dir, is_low_mem=is_low_mem,
225
- raise_generate_gpu_exceptions=raise_generate_gpu_exceptions,
226
- chat_context=chat_context,
227
- concurrency_count=concurrency_count)
228
- else:
229
- assert eval_sharegpt_prompts_only > 0
230
-
231
- def get_response(*args, exi=0):
232
- # assumes same ordering of examples and responses
233
- yield responses[exi]
234
-
235
- fun = get_response
236
- t0 = time.time()
237
- score_dump = []
238
-
239
- import matplotlib.pyplot as plt
240
-
241
- for exi, ex in enumerate(examples):
242
- instruction = ex[eval_func_param_names.index('instruction_nochat')]
243
- iinput = ex[eval_func_param_names.index('iinput_nochat')]
244
- context = ex[eval_func_param_names.index('context')]
245
- clear_torch_cache()
246
- print("")
247
- print("START" + "=" * 100)
248
- print("Question: %s %s" % (instruction, ('input=%s' % iinput if iinput else '')))
249
- print("-" * 105)
250
- # fun yields as generator, so have to iterate over it
251
- # Also means likely do NOT want --stream_output=True, else would show all generations
252
- gener = fun(*tuple(ex), exi=exi) if eval_sharegpt_as_output else fun(*tuple(ex))
253
- for res in gener:
254
- print(res)
255
- if smodel:
256
- score_with_prompt = False
257
- if score_with_prompt:
258
- data_point = dict(instruction=instruction, input=iinput, context=context)
259
- prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
260
- prompt = prompter.generate_prompt(data_point)
261
- else:
262
- # just raw input and output
263
- if eval_sharegpt_prompts_only > 0:
264
- # only our own examples have this filled at moment
265
- assert iinput in [None, ''], iinput # should be no iinput
266
- if not (chat_context and prompt_type == 'human_bot'):
267
- assert context in [None, ''], context # should be no context
268
- prompt = instruction
269
- cutoff_len = 768 if is_low_mem else 2048
270
- inputs = stokenizer(prompt, res,
271
- return_tensors="pt",
272
- truncation=True,
273
- max_length=cutoff_len)
274
- try:
275
- score = torch.sigmoid(smodel(**inputs).logits[0]).cpu().detach().numpy()[0]
276
- except torch.cuda.OutOfMemoryError as e:
277
- print("GPU OOM 1: question: %s answer: %s exception: %s" % (prompt, res, str(e)), flush=True)
278
- traceback.print_exc()
279
- score = 0.0
280
- clear_torch_cache()
281
- except (Exception, RuntimeError) as e:
282
- if 'Expected all tensors to be on the same device' in str(e) or \
283
- 'expected scalar type Half but found Float' in str(e) or \
284
- 'probability tensor contains either' in str(e) or \
285
- 'cublasLt ran into an error!' in str(e):
286
- print("GPU error: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
287
- flush=True)
288
- traceback.print_exc()
289
- score = 0.0
290
- clear_torch_cache()
291
- else:
292
- raise
293
- print("SCORE %s: %s" % (exi, score), flush=True)
294
- score_dump.append(ex + [prompt, res, score])
295
- # dump every score in case abort
296
- df_scores = pd.DataFrame(score_dump,
297
- columns=eval_func_param_names + eval_extra_columns)
298
- df_scores.to_parquet(eval_filename, index=False)
299
- # plot histogram so far
300
- plt.figure(figsize=(10, 10))
301
- plt.hist(df_scores['score'], bins=20)
302
- score_avg = np.mean(df_scores['score'])
303
- score_median = np.median(df_scores['score'])
304
- plt.title("Score avg: %s median: %s" % (score_avg, score_median))
305
- plt.savefig(eval_filename.replace('.parquet', '.png'))
306
- plt.close()
307
-
308
- print("END" + "=" * 102)
309
- print("")
310
- t2 = time.time()
311
- print("Time taken so far: %.4f about %.4g per example" % (t2 - t0, (t2 - t0) / (1 + exi)))
312
- t1 = time.time()
313
- print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples))
314
- return eval_filename
315
-
316
- if gradio:
317
- # imported here so don't require gradio to run generate
318
- from gradio_runner import go_gradio
319
-
320
- # get default model
321
- all_kwargs = locals().copy()
322
- if all_kwargs.get('base_model') and not all_kwargs['login_mode_if_model0']:
323
- model0, tokenizer0, device = get_model(**all_kwargs)
324
- else:
325
- # if empty model, then don't load anything, just get gradio up
326
- model0, tokenizer0, device = None, None, None
327
- model_state0 = [model0, tokenizer0, device, all_kwargs['base_model']]
328
-
329
- # get score model
330
- smodel, stokenizer, sdevice = get_score_model(**all_kwargs)
331
- score_model_state0 = [smodel, stokenizer, sdevice, score_model]
332
-
333
- go_gradio(**locals())
334
-
335
-
336
- def get_device():
337
- if torch.cuda.is_available():
338
- device = "cuda"
339
- else:
340
- raise RuntimeError("only cuda supported")
341
-
342
- return device
343
-
344
-
345
- def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
346
- gpu_id=0,
347
- use_auth_token=False):
348
- """
349
- Ensure model gets on correct device
350
- :param base_model:
351
- :param model_loader:
352
- :param load_half:
353
- :param model_kwargs:
354
- :param reward_type:
355
- :param gpu_id:
356
- :param use_auth_token:
357
- :return:
358
- """
359
- with init_empty_weights():
360
- from transformers import AutoConfig
361
- config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token)
362
- model = AutoModel.from_config(
363
- config,
364
- )
365
-
366
- # NOTE: Can specify max_memory={0: max_mem, 1: max_mem}, to shard model
367
- # NOTE: Some models require avoiding sharding some layers,
368
- # then would pass no_split_module_classes and give list of those layers.
369
- device_map = infer_auto_device_map(
370
- model,
371
- dtype=torch.float16 if load_half else torch.float32,
372
- )
373
- if hasattr(model, 'model'):
374
- device_map_model = infer_auto_device_map(
375
- model.model,
376
- dtype=torch.float16 if load_half else torch.float32,
377
- )
378
- device_map.update(device_map_model)
379
- print('device_map: %s' % device_map, flush=True)
380
-
381
- if gpu_id >= 0:
382
- # FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
383
- # So avoid for now, just put on first GPU, unless score_model, put on last
384
- n_gpus = torch.cuda.device_count()
385
- if reward_type:
386
- device_map = {'': n_gpus - 1}
387
- else:
388
- device_map = {'': min(n_gpus - 1, gpu_id)}
389
- if gpu_id == -1:
390
- device_map = {'': 'cuda'}
391
-
392
- load_in_8bit = model_kwargs.get('load_in_8bit', False)
393
- model_kwargs['device_map'] = device_map
394
-
395
- if load_in_8bit or not load_half:
396
- model = model_loader.from_pretrained(
397
- base_model,
398
- **model_kwargs,
399
- )
400
- else:
401
- model = model_loader.from_pretrained(
402
- base_model,
403
- **model_kwargs,
404
- ).half()
405
- return model
406
-
407
-
408
- def get_model(
409
- load_8bit: bool = False,
410
- load_half: bool = True,
411
- infer_devices: bool = True,
412
- base_model: str = '',
413
- tokenizer_base_model: str = '',
414
- lora_weights: str = "",
415
- gpu_id: int = 0,
416
-
417
- reward_type: bool = None,
418
- local_files_only: bool = False,
419
- resume_download: bool = True,
420
- use_auth_token: Union[str, bool] = False,
421
- compile: bool = True,
422
- **kwargs,
423
- ):
424
- """
425
-
426
- :param load_8bit: load model in 8-bit, not supported by all models
427
- :param load_half: load model in 16-bit
428
- :param infer_devices: Use torch infer of optimal placement of layers on devices (for non-lora case)
429
- For non-LORA case, False will spread shards across multiple GPUs, but this can lead to cuda:x cuda:y mismatches
430
- So it is not the default
431
- :param base_model: name/path of base model
432
- :param tokenizer_base_model: name/path of tokenizer
433
- :param lora_weights: name/path
434
- :param gpu_id: which GPU (0..n_gpus-1) or allow all GPUs if relevant (-1)
435
- :param reward_type: reward type model for sequence classification
436
- :param local_files_only: use local files instead of from HF
437
- :param resume_download: resume downloads from HF
438
- :param use_auth_token: assumes user did on CLI `huggingface-cli login` to access private repo
439
- :param compile: whether to compile torch model
440
- :param kwargs:
441
- :return:
442
- """
443
- print("Get %s model" % base_model, flush=True)
444
- if lora_weights is not None and lora_weights.strip():
445
- print("Get %s lora weights" % lora_weights, flush=True)
446
- device = get_device()
447
-
448
- if 'gpt2' in base_model.lower():
449
- # RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Half
450
- load_8bit = False
451
-
452
- assert base_model.strip(), (
453
- "Please choose a base model with --base_model (CLI) or in Models Tab (gradio)"
454
- )
455
-
456
- from transformers import AutoConfig
457
- config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token)
458
- llama_type_from_config = 'llama' in str(config).lower()
459
- llama_type_from_name = "llama" in base_model.lower()
460
- llama_type = llama_type_from_config or llama_type_from_name
461
- if llama_type:
462
- print("Detected as llama type from"
463
- " config (%s) or name (%s)" % (llama_type_from_config, llama_type_from_name), flush=True)
464
-
465
- model_loader, tokenizer_loader = get_loaders(llama_type=llama_type, model_name=base_model, reward_type=reward_type)
466
- if not tokenizer_base_model:
467
- tokenizer_base_model = base_model
468
-
469
- if tokenizer_loader is not None and not isinstance(tokenizer_loader, str):
470
- tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
471
- local_files_only=local_files_only,
472
- resume_download=resume_download,
473
- use_auth_token=use_auth_token,
474
- )
475
- else:
476
- tokenizer = tokenizer_loader
477
-
478
- if isinstance(tokenizer, str):
479
- # already a pipeline, tokenizer_loader is string for task
480
- model = model_loader(tokenizer,
481
- model=base_model,
482
- device=0 if device == "cuda" else -1,
483
- torch_dtype=torch.float16)
484
- else:
485
- assert device == "cuda", "Unsupported device %s" % device
486
- model_kwargs = dict(local_files_only=local_files_only,
487
- torch_dtype=torch.float16,
488
- resume_download=resume_download,
489
- use_auth_token=use_auth_token)
490
- if 'mbart-' not in base_model.lower():
491
- model_kwargs.update(dict(load_in_8bit=load_8bit,
492
- device_map={"": 0} if load_8bit else "auto",
493
- ))
494
- if 'OpenAssistant/reward-model'.lower() in base_model.lower():
495
- # could put on other GPUs
496
- model_kwargs['device_map'] = {"": 0}
497
- model_kwargs.pop('torch_dtype', None)
498
-
499
- if not lora_weights:
500
- with torch.device("cuda"):
501
- if infer_devices:
502
- model = get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
503
- gpu_id=gpu_id, use_auth_token=use_auth_token)
504
- else:
505
- if load_half and not load_8bit:
506
- model = model_loader.from_pretrained(
507
- base_model,
508
- **model_kwargs).half()
509
- else:
510
- model = model_loader.from_pretrained(
511
- base_model,
512
- **model_kwargs)
513
- elif load_8bit:
514
- model = model_loader.from_pretrained(
515
- base_model,
516
- **model_kwargs
517
- )
518
- model = PeftModel.from_pretrained(
519
- model,
520
- lora_weights,
521
- torch_dtype=torch.float16,
522
- local_files_only=local_files_only,
523
- resume_download=resume_download,
524
- use_auth_token=use_auth_token,
525
- device_map={"": 0}, # seems to be required
526
- )
527
- else:
528
- with torch.device("cuda"):
529
- model = model_loader.from_pretrained(
530
- base_model,
531
- **model_kwargs
532
- )
533
- model = PeftModel.from_pretrained(
534
- model,
535
- lora_weights,
536
- torch_dtype=torch.float16,
537
- local_files_only=local_files_only,
538
- resume_download=resume_download,
539
- use_auth_token=use_auth_token,
540
- device_map="auto",
541
- )
542
- if load_half:
543
- model.half()
544
-
545
- # unwind broken decapoda-research config
546
- if llama_type:
547
- model.config.pad_token_id = tokenizer.pad_token_id = 0 # unk
548
- model.config.bos_token_id = 1
549
- model.config.eos_token_id = 2
550
- if 'gpt2' in base_model.lower():
551
- # add special tokens that otherwise all share the same id
552
- tokenizer.add_special_tokens({'bos_token': '<bos>',
553
- 'eos_token': '<eos>',
554
- 'pad_token': '<pad>'})
555
-
556
- if not isinstance(tokenizer, str):
557
- model.eval()
558
- if torch.__version__ >= "2" and sys.platform != "win32" and compile:
559
- model = torch.compile(model)
560
-
561
- return model, tokenizer, device
562
-
563
-
564
- def get_score_model(**kwargs):
565
- # score model
566
- if kwargs.get('score_model') is not None and kwargs.get('score_model').strip():
567
- score_all_kwargs = kwargs.copy()
568
- score_all_kwargs['load_8bit'] = False
569
- score_all_kwargs['load_half'] = False
570
- score_all_kwargs['base_model'] = kwargs.get('score_model').strip()
571
- score_all_kwargs['tokenizer_base_model'] = ''
572
- score_all_kwargs['lora_weights'] = ''
573
- score_all_kwargs['llama_type'] = False
574
- score_all_kwargs['compile'] = False
575
- smodel, stokenizer, sdevice = get_model(**score_all_kwargs)
576
- else:
577
- smodel, stokenizer, sdevice = None, None, None
578
- return smodel, stokenizer, sdevice
579
-
580
-
581
- eval_func_param_names = ['instruction',
582
- 'iinput',
583
- 'context',
584
- 'stream_output',
585
- 'prompt_type',
586
- 'temperature',
587
- 'top_p',
588
- 'top_k',
589
- 'num_beams',
590
- 'max_new_tokens',
591
- 'min_new_tokens',
592
- 'early_stopping',
593
- 'max_time',
594
- 'repetition_penalty',
595
- 'num_return_sequences',
596
- 'do_sample',
597
- 'chat',
598
- 'instruction_nochat',
599
- 'iinput_nochat',
600
- ]
601
-
602
-
603
- def evaluate(
604
- model_state,
605
- # START NOTE: Examples must have same order of parameters
606
- instruction,
607
- iinput,
608
- context,
609
- stream_output,
610
- prompt_type,
611
- temperature,
612
- top_p,
613
- top_k,
614
- num_beams,
615
- max_new_tokens,
616
- min_new_tokens,
617
- early_stopping,
618
- max_time,
619
- repetition_penalty,
620
- num_return_sequences,
621
- do_sample,
622
- chat,
623
- instruction_nochat,
624
- iinput_nochat,
625
- # END NOTE: Examples must have same order of parameters
626
- src_lang=None,
627
- tgt_lang=None,
628
- debug=False,
629
- concurrency_count=None,
630
- save_dir=None,
631
- hard_stop_list=None,
632
- sanitize_bot_response=True,
633
- model_state0=None,
634
- is_low_mem=None,
635
- raise_generate_gpu_exceptions=None,
636
- chat_context=None,
637
- ):
638
- # ensure passed these
639
- assert concurrency_count is not None
640
- assert is_low_mem is not None
641
- assert raise_generate_gpu_exceptions is not None
642
- assert chat_context is not None
643
-
644
- if debug:
645
- locals_dict = locals().copy()
646
- locals_dict.pop('model_state', None)
647
- locals_dict.pop('model_state0', None)
648
- print(locals_dict)
649
-
650
- no_model_msg = "Please choose a base model with --base_model (CLI) or in Models Tab (gradio).\nThen start New Conversation"
651
-
652
- if model_state0 is None:
653
- # e.g. for no gradio case, set dummy value, else should be set
654
- model_state0 = [None, None, None, None]
655
-
656
- if model_state is not None and len(model_state) == 4 and not isinstance(model_state[0], str):
657
- # try to free-up original model (i.e. list was passed as reference)
658
- if model_state0 is not None and model_state0[0] is not None:
659
- model_state0[0].cpu()
660
- model_state0[0] = None
661
- # try to free-up original tokenizer (i.e. list was passed as reference)
662
- if model_state0 is not None and model_state0[1] is not None:
663
- model_state0[1] = None
664
- clear_torch_cache()
665
- model, tokenizer, device, base_model = model_state
666
- elif model_state0 is not None and len(model_state0) == 4 and model_state0[0] is not None:
667
- assert isinstance(model_state[0], str)
668
- model, tokenizer, device, base_model = model_state0
669
- else:
670
- raise AssertionError(no_model_msg)
671
-
672
- if base_model is None:
673
- raise AssertionError(no_model_msg)
674
-
675
- assert base_model.strip(), no_model_msg
676
- assert model, "Model is missing"
677
- assert tokenizer, "Tokenizer is missing"
678
-
679
- # choose chat or non-chat mode
680
- if not chat:
681
- instruction = instruction_nochat
682
- iinput = iinput_nochat
683
-
684
- if not context:
685
- # get hidden context if have one
686
- context = get_context(chat_context, prompt_type)
687
-
688
- data_point = dict(context=context, instruction=instruction, input=iinput)
689
- prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
690
- prompt = prompter.generate_prompt(data_point)
691
-
692
- if hard_stop_list is None:
693
- # acts like undo on user entry and bot response
694
- hard_stop_list = []
695
-
696
- if isinstance(tokenizer, str):
697
- # pipeline
698
- if tokenizer == "summarization":
699
- key = 'summary_text'
700
- else:
701
- raise RuntimeError("No such task type %s" % tokenizer)
702
- # NOTE: uses max_length only
703
- yield model(prompt, max_length=max_new_tokens)[0][key]
704
-
705
- if 'mbart-' in base_model.lower():
706
- assert src_lang is not None
707
- tokenizer.src_lang = languages_covered()[src_lang]
708
-
709
- if chat:
710
- # override, ignore user change
711
- num_return_sequences = 1
712
- if prompt_type in ['human_bot', 'instruct_vicuna', 'instruct_with_end']:
713
- if prompt_type == 'human_bot':
714
- # encounters = [prompt.count(human) + 1, prompt.count(bot) + 1]
715
- # stopping only starts once output is beyond prompt
716
- # 1 human is enough to trigger, but need 2 bots, because very first view back will be bot we added
717
- stop_words = [human, bot, '\n' + human, '\n' + bot]
718
- encounters = [1, 2]
719
- elif prompt_type == 'instruct_vicuna':
720
- # even below is not enough, generic strings and many ways to encode
721
- stop_words = [
722
- '### Human:',
723
- """
724
- ### Human:""",
725
- """
726
- ### Human:
727
- """,
728
- '### Assistant:',
729
- """
730
- ### Assistant:""",
731
- """
732
- ### Assistant:
733
- """,
734
- ]
735
- encounters = [1, 2]
736
- else:
737
- # some instruct prompts have this as end, doesn't hurt to stop on it since not common otherwise
738
- stop_words = ['### End']
739
- encounters = [1]
740
- stop_words_ids = [
741
- tokenizer(stop_word, return_tensors='pt')['input_ids'].squeeze() for stop_word in stop_words]
742
- # handle single token case
743
- stop_words_ids = [x if len(x.shape) > 0 else torch.tensor([x]) for x in stop_words_ids]
744
- stop_words_ids = [x for x in stop_words_ids if x.shape[0] > 0]
745
- # avoid padding in front of tokens
746
- if tokenizer.pad_token:
747
- stop_words_ids = [x[1:] if x[0] == tokenizer.pad_token_id and len(x) > 1 else x for x in stop_words_ids]
748
- # handle fake \n added
749
- stop_words_ids = [x[1:] if y[0] == '\n' else x for x, y in zip(stop_words_ids, stop_words)]
750
- # build stopper
751
- stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters)])
752
- else:
753
- stopping_criteria = StoppingCriteriaList()
754
-
755
- # help to avoid errors like:
756
- # RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
757
- # RuntimeError: expected scalar type Half but found Float
758
- # with - 256
759
- max_length_tokenize = 768 - 256 if is_low_mem else 2048 - 256
760
- cutoff_len = max_length_tokenize * 4 # if reaches limit, then can't generate new tokens
761
- output_smallest = 30 * 4
762
- prompt = prompt[-cutoff_len - output_smallest:]
763
- inputs = tokenizer(prompt,
764
- return_tensors="pt",
765
- truncation=True,
766
- max_length=max_length_tokenize)
767
- if debug and len(inputs["input_ids"]) > 0:
768
- print('input_ids length', len(inputs["input_ids"][0]), flush=True)
769
- input_ids = inputs["input_ids"].to(device)
770
- generation_config = GenerationConfig(
771
- temperature=float(temperature),
772
- top_p=float(top_p),
773
- top_k=top_k,
774
- num_beams=num_beams,
775
- do_sample=do_sample,
776
- repetition_penalty=float(repetition_penalty),
777
- num_return_sequences=num_return_sequences,
778
- renormalize_logits=True,
779
- remove_invalid_values=True,
780
- )
781
-
782
- gen_kwargs = dict(input_ids=input_ids,
783
- generation_config=generation_config,
784
- return_dict_in_generate=True,
785
- output_scores=True,
786
- max_new_tokens=max_new_tokens, # prompt + new
787
- min_new_tokens=min_new_tokens, # prompt + new
788
- early_stopping=early_stopping, # False, True, "never"
789
- max_time=max_time,
790
- stopping_criteria=stopping_criteria,
791
- )
792
- if 'gpt2' in base_model.lower():
793
- gen_kwargs.update(dict(bos_token_id=tokenizer.bos_token_id, pad_token_id=tokenizer.eos_token_id))
794
- elif 'mbart-' in base_model.lower():
795
- assert tgt_lang is not None
796
- tgt_lang = languages_covered()[tgt_lang]
797
- gen_kwargs.update(dict(forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang]))
798
- else:
799
- gen_kwargs.update(dict(pad_token_id=tokenizer.eos_token_id))
800
-
801
- decoder = functools.partial(tokenizer.decode,
802
- skip_special_tokens=True,
803
- clean_up_tokenization_spaces=True,
804
- )
805
- decoder_raw = functools.partial(tokenizer.decode,
806
- skip_special_tokens=False,
807
- clean_up_tokenization_spaces=True,
808
- )
809
-
810
- with torch.no_grad():
811
- # decoded tokenized prompt can deviate from prompt due to special characters
812
- inputs_decoded = decoder(input_ids[0])
813
- inputs_decoded_raw = decoder_raw(input_ids[0])
814
- if inputs_decoded == prompt:
815
- # normal
816
- pass
817
- elif inputs_decoded.lstrip() == prompt.lstrip():
818
- # sometimes extra space in front, make prompt same for prompt removal
819
- prompt = inputs_decoded
820
- elif inputs_decoded_raw == prompt:
821
- # some models specify special tokens that are part of normal prompt, so can't skip them
822
- inputs_decoded_raw = inputs_decoded
823
- decoder = decoder_raw
824
- else:
825
- print("WARNING: Special characters in prompt", flush=True)
826
- if stream_output:
827
- #skip_prompt = prompt_type != 'plain'
828
- skip_prompt = False
829
- streamer = TextIteratorStreamer(tokenizer, skip_prompt=skip_prompt)
830
- gen_kwargs.update(dict(streamer=streamer))
831
- if debug:
832
- KThread.show_threads()
833
- target_func = generate_with_exceptions
834
- if concurrency_count == 1:
835
- # otherwise can't do this
836
- KThread.kill_threads(target_func.__name__, debug=debug)
837
- target = wrapped_partial(generate_with_exceptions, model.generate, prompt, inputs_decoded,
838
- raise_generate_gpu_exceptions, **gen_kwargs)
839
- thread = KThread(target=target)
840
- thread.start()
841
- outputs = ""
842
- for new_text in streamer:
843
- outputs += new_text
844
- yield prompter.get_response(outputs, prompt=inputs_decoded,
845
- sanitize_bot_response=sanitize_bot_response)
846
- else:
847
- outputs = model.generate(**gen_kwargs)
848
- outputs = [decoder(s) for s in outputs.sequences]
849
- yield prompter.get_response(outputs, prompt=inputs_decoded,
850
- sanitize_bot_response=sanitize_bot_response)
851
- if save_dir and outputs and len(outputs) >= 1:
852
- decoded_output = prompt + outputs[0]
853
- save_generate_output(output=decoded_output, base_model=base_model, save_dir=save_dir)
854
-
855
-
856
- def generate_with_exceptions(func, prompt, inputs_decoded, raise_generate_gpu_exceptions, **kwargs):
857
- try:
858
- func(**kwargs)
859
- except torch.cuda.OutOfMemoryError as e:
860
- print("GPU OOM 2: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
861
- flush=True)
862
- if kwargs['input_ids'] is not None:
863
- kwargs['input_ids'].cpu()
864
- kwargs['input_ids'] = None
865
- traceback.print_exc()
866
- clear_torch_cache()
867
- return
868
- except (Exception, RuntimeError) as e:
869
- if 'Expected all tensors to be on the same device' in str(e) or \
870
- 'expected scalar type Half but found Float' in str(e) or \
871
- 'probability tensor contains either' in str(e) or \
872
- 'cublasLt ran into an error!' in str(e) or \
873
- 'mat1 and mat2 shapes cannot be multiplied' in str(e):
874
- print(
875
- "GPU Error: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
876
- flush=True)
877
- traceback.print_exc()
878
- clear_torch_cache()
879
- if raise_generate_gpu_exceptions:
880
- raise
881
- return
882
- else:
883
- clear_torch_cache()
884
- raise
885
-
886
-
887
- def get_generate_params(model_lower, chat,
888
- stream_output, show_examples,
889
- prompt_type, temperature, top_p, top_k, num_beams,
890
- max_new_tokens, min_new_tokens, early_stopping, max_time,
891
- repetition_penalty, num_return_sequences,
892
- do_sample):
893
- use_defaults = False
894
- use_default_examples = True
895
- examples = []
896
- task_info = f"{prompt_type}"
897
- if model_lower:
898
- print(f"Using Model {model_lower}", flush=True)
899
- else:
900
- print("No model defined yet", flush=True)
901
-
902
- min_new_tokens = min_new_tokens if min_new_tokens is not None else 0
903
- early_stopping = early_stopping if early_stopping is not None else False
904
- max_time_defaults = 60 * 3
905
- max_time = max_time if max_time is not None else max_time_defaults
906
-
907
- if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
908
- prompt_type = inv_prompt_type_to_model_lower[model_lower]
909
-
910
- # examples at first don't include chat, instruction_nochat, iinput_nochat, added at end
911
- if show_examples is None:
912
- if chat:
913
- show_examples = False
914
- else:
915
- show_examples = True
916
-
917
- summarize_example1 = """Jeff: Can I train a 🤗 Transformers model on Amazon SageMaker?
918
- Philipp: Sure you can use the new Hugging Face Deep Learning Container.
919
- Jeff: ok.
920
- Jeff: and how can I get started?
921
- Jeff: where can I find documentation?
922
- Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-partnership-amazon-sagemaker-and-hugging-face"""
923
-
924
- if 'bart-large-cnn-samsum' in model_lower or 'flan-t5-base-samsum' in model_lower:
925
- placeholder_instruction = summarize_example1
926
- placeholder_input = ""
927
- use_defaults = True
928
- use_default_examples = False
929
- examples += [
930
- [placeholder_instruction, "", "", stream_output, 'plain', 1.0, 1.0, 50, 1, 128, 0, False, max_time_defaults,
931
- 1.0, 1,
932
- False]]
933
- task_info = "Summarization"
934
- elif 't5-' in model_lower or 't5' == model_lower or 'flan-' in model_lower:
935
- placeholder_instruction = "The square root of x is the cube root of y. What is y to the power of 2, if x = 4?"
936
- placeholder_input = ""
937
- use_defaults = True
938
- use_default_examples = True
939
- task_info = "Multi-Task: Q/A, translation, Chain-of-Thought, Logical Reasoning, Summarization, etc. Best to use task prefix as trained on, e.g. `translate English to German: ` (space after colon)"
940
- elif 'mbart-' in model_lower:
941
- placeholder_instruction = "The girl has long hair."
942
- placeholder_input = ""
943
- use_defaults = True
944
- use_default_examples = False
945
- examples += [
946
- [placeholder_instruction, "", "", stream_output, 'plain', 1.0, 1.0, 50, 1, 128, 0, False, max_time_defaults,
947
- 1.0, 1,
948
- False]]
949
- elif 'gpt2' in model_lower:
950
- placeholder_instruction = "The sky is"
951
- placeholder_input = ""
952
- prompt_type = prompt_type or 'plain'
953
- use_default_examples = True # some will be odd "continuations" but can be ok
954
- examples += [
955
- [placeholder_instruction, "", "", stream_output, 'plain', 1.0, 1.0, 50, 1, 128, 0, False, max_time_defaults,
956
- 1.0, 1,
957
- False]]
958
- task_info = "Auto-complete phrase, code, etc."
959
- use_defaults = True
960
- else:
961
- if chat:
962
- placeholder_instruction = "Enter a question or imperative."
963
- else:
964
- placeholder_instruction = "Give detailed answer for whether Einstein or Newton is smarter."
965
- placeholder_input = ""
966
- if model_lower:
967
- prompt_type = prompt_type or 'human_bot'
968
- else:
969
- prompt_type = ''
970
- examples += [[summarize_example1, 'Summarize' if prompt_type not in ['plain', 'instruct_simple'] else '', "",
971
- stream_output, prompt_type or 'plain', 0.1, 0.75, 40, 4, 256, 0, False, max_time_defaults, 1.0, 1,
972
- False]]
973
- task_info = "No task"
974
- if prompt_type == 'instruct':
975
- task_info = "Answer question or follow imperative as instruction with optional input."
976
- elif prompt_type == 'plain':
977
- task_info = "Auto-complete phrase, code, etc."
978
- elif prompt_type == 'human_bot':
979
- if chat:
980
- task_info = "Chat (Shift-Enter to give question/imperative, input concatenated with instruction)"
981
- else:
982
- task_info = "Ask question/imperative (input concatenated with instruction)"
983
-
984
- # revert to plain if still nothing
985
- prompt_type = prompt_type or 'plain'
986
- if use_defaults:
987
- temperature = 1.0 if temperature is None else temperature
988
- top_p = 1.0 if top_p is None else top_p
989
- top_k = 40 if top_k is None else top_k
990
- num_beams = num_beams or 1
991
- max_new_tokens = max_new_tokens or 128
992
- repetition_penalty = repetition_penalty or 1.07
993
- num_return_sequences = min(num_beams, num_return_sequences or 1)
994
- do_sample = False if do_sample is None else do_sample
995
- else:
996
- temperature = 0.4 if temperature is None else temperature
997
- top_p = 0.85 if top_p is None else top_p
998
- top_k = 70 if top_k is None else top_k
999
- if chat:
1000
- num_beams = num_beams or 1
1001
- else:
1002
- num_beams = num_beams or 4
1003
- max_new_tokens = max_new_tokens or 256
1004
- repetition_penalty = repetition_penalty or 1.07
1005
- num_return_sequences = min(num_beams, num_return_sequences or 1)
1006
- do_sample = True if do_sample is None else do_sample
1007
- # doesn't include chat, instruction_nochat, iinput_nochat, added later
1008
- params_list = ["", stream_output, prompt_type, temperature, top_p, top_k, num_beams, max_new_tokens, min_new_tokens,
1009
- early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
1010
-
1011
- if use_default_examples:
1012
- examples += [
1013
- ["Translate English to French", "Good morning"] + params_list,
1014
- ["Give detailed answer for whether Einstein or Newton is smarter.", ''] + params_list,
1015
- ["Explain in detailed list, all the best practices for coding in python.", ''] + params_list,
1016
- [
1017
- "Create a markdown table with 3 rows for the primary colors, and 2 columns, with color name and hex codes.",
1018
- ''] + params_list,
1019
- ['Translate to German: My name is Arthur', ''] + params_list,
1020
- ["Please answer to the following question. Who is going to be the next Ballon d'or?", ''] + params_list,
1021
- ['Can Geoffrey Hinton have a conversation with George Washington? Give the rationale before answering.',
1022
- ''] + params_list,
1023
- ['Please answer the following question. What is the boiling point of Nitrogen?', ''] + params_list,
1024
- ['Answer the following yes/no question. Can you write a whole Haiku in a single tweet?', ''] + params_list,
1025
- ["Simplify the following expression: (False or False and True). Explain your answer.", ''] + params_list,
1026
- [
1027
- "Premise: At my age you will probably have learnt one lesson. Hypothesis: It's not certain how many lessons you'll learn by your thirties. Does the premise entail the hypothesis?",
1028
- ''] + params_list,
1029
- ['The square root of x is the cube root of y. What is y to the power of 2, if x = 4?', ''] + params_list,
1030
- [
1031
- 'Answer the following question by reasoning step by step. The cafeteria had 23 apples. If they used 20 for lunch, and bought 6 more, how many apple do they have?',
1032
- ''] + params_list,
1033
- ["""def area_of_rectangle(a: float, b: float):
1034
- \"\"\"Return the area of the rectangle.\"\"\"""", ''] + params_list,
1035
- ["""# a function in native python:
1036
- def mean(a):
1037
- return sum(a)/len(a)
1038
-
1039
- # the same function using numpy:
1040
- import numpy as np
1041
- def mean(a):""", ''] + params_list,
1042
- ["""X = np.random.randn(100, 100)
1043
- y = np.random.randint(0, 1, 100)
1044
-
1045
- # fit random forest classifier with 20 estimators""", ''] + params_list,
1046
- ]
1047
-
1048
- src_lang = "English"
1049
- tgt_lang = "Russian"
1050
-
1051
- # move to correct position
1052
- for example in examples:
1053
- example += [chat, '', '']
1054
- # adjust examples if non-chat mode
1055
- if not chat:
1056
- example[eval_func_param_names.index('instruction_nochat')] = example[
1057
- eval_func_param_names.index('instruction')]
1058
- example[eval_func_param_names.index('instruction')] = ''
1059
-
1060
- example[eval_func_param_names.index('iinput_nochat')] = example[eval_func_param_names.index('iinput')]
1061
- example[eval_func_param_names.index('iinput')] = ''
1062
-
1063
- return placeholder_instruction, placeholder_input, \
1064
- stream_output, show_examples, \
1065
- prompt_type, temperature, top_p, top_k, num_beams, \
1066
- max_new_tokens, min_new_tokens, early_stopping, max_time, \
1067
- repetition_penalty, num_return_sequences, \
1068
- do_sample, \
1069
- src_lang, tgt_lang, \
1070
- examples, \
1071
- task_info
1072
-
1073
-
1074
- def languages_covered():
1075
- # https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt#languages-covered
1076
- covered = """Arabic (ar_AR), Czech (cs_CZ), German (de_DE), English (en_XX), Spanish (es_XX), Estonian (et_EE), Finnish (fi_FI), French (fr_XX), Gujarati (gu_IN), Hindi (hi_IN), Italian (it_IT), Japanese (ja_XX), Kazakh (kk_KZ), Korean (ko_KR), Lithuanian (lt_LT), Latvian (lv_LV), Burmese (my_MM), Nepali (ne_NP), Dutch (nl_XX), Romanian (ro_RO), Russian (ru_RU), Sinhala (si_LK), Turkish (tr_TR), Vietnamese (vi_VN), Chinese (zh_CN), Afrikaans (af_ZA), Azerbaijani (az_AZ), Bengali (bn_IN), Persian (fa_IR), Hebrew (he_IL), Croatian (hr_HR), Indonesian (id_ID), Georgian (ka_GE), Khmer (km_KH), Macedonian (mk_MK), Malayalam (ml_IN), Mongolian (mn_MN), Marathi (mr_IN), Polish (pl_PL), Pashto (ps_AF), Portuguese (pt_XX), Swedish (sv_SE), Swahili (sw_KE), Tamil (ta_IN), Telugu (te_IN), Thai (th_TH), Tagalog (tl_XX), Ukrainian (uk_UA), Urdu (ur_PK), Xhosa (xh_ZA), Galician (gl_ES), Slovene (sl_SI)"""
1077
- covered = covered.split(', ')
1078
- covered = {x.split(' ')[0]: x.split(' ')[1].replace(')', '').replace('(', '') for x in covered}
1079
- return covered
1080
-
1081
-
1082
- def get_context(chat_context, prompt_type):
1083
- if chat_context and prompt_type == 'human_bot':
1084
- context0 = """<bot>: I am an intelligent, helpful, truthful, and fair assistant named h2oGPT, who will give accurate, balanced, and reliable responses. I will not respond with I don't know or I don't understand.
1085
- <human>: I am a human person seeking useful assistance and request all questions be answered completely, and typically expect detailed responses. Give answers in numbered list format if several distinct but related items are being listed."""
1086
- else:
1087
- context0 = ''
1088
- return context0
1089
-
1090
-
1091
- def test_test_prompt(prompt_type='instruct', data_point=0):
1092
- example_data_point = example_data_points[data_point]
1093
- example_data_point.pop('output', None)
1094
- return generate_prompt(example_data_point, prompt_type, False, False)
1095
-
1096
-
1097
- def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_len):
1098
- question = question[-cutoff_len:]
1099
- answer = answer[-cutoff_len:]
1100
-
1101
- inputs = stokenizer(question, answer,
1102
- return_tensors="pt",
1103
- truncation=True,
1104
- max_length=max_length_tokenize).to(smodel.device)
1105
- try:
1106
- score = torch.sigmoid(smodel(**inputs).logits[0]).cpu().detach().numpy()[0]
1107
- except torch.cuda.OutOfMemoryError as e:
1108
- print("GPU OOM 3: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
1109
- del inputs
1110
- traceback.print_exc()
1111
- clear_torch_cache()
1112
- return 'Response Score: GPU OOM'
1113
- except (Exception, RuntimeError) as e:
1114
- if 'Expected all tensors to be on the same device' in str(e) or \
1115
- 'expected scalar type Half but found Float' in str(e) or \
1116
- 'probability tensor contains either' in str(e) or \
1117
- 'cublasLt ran into an error!' in str(e):
1118
- print("GPU Error: question: %s answer: %s exception: %s" % (question, answer, str(e)),
1119
- flush=True)
1120
- traceback.print_exc()
1121
- clear_torch_cache()
1122
- return 'Response Score: GPU Error'
1123
- else:
1124
- raise
1125
- os.environ['TOKENIZERS_PARALLELISM'] = 'true'
1126
- return score
1127
-
1128
-
1129
- if __name__ == "__main__":
1130
- print("""
1131
- WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 --master_port=1234 generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca_6B
1132
- python generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights='lora-alpaca_6B'
1133
- python generate.py --base_model='EleutherAI/gpt-neox-20b' --lora_weights='lora-alpaca_20B'
1134
-
1135
- # generate without lora weights, no prompt
1136
- python generate.py --base_model='EleutherAI/gpt-neox-20b' --prompt_type='plain'
1137
- python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='dai_faq'
1138
-
1139
- python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='dai_faq' --lora_weights='lora_20B_daifaq'
1140
- # OpenChatKit settings:
1141
- python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='human_bot' --debug=True --num_beams=1 --temperature=0.6 --top_k=40 --top_p=1.0
1142
-
1143
- python generate.py --base_model='distilgpt2' --prompt_type='plain' --debug=True --num_beams=1 --temperature=0.6 --top_k=40 --top_p=1.0 --share=False
1144
- python generate.py --base_model='t5-large' --prompt_type='simple_instruct'
1145
- python generate.py --base_model='philschmid/bart-large-cnn-samsum'
1146
- python generate.py --base_model='philschmid/flan-t5-base-samsum'
1147
- python generate.py --base_model='facebook/mbart-large-50-many-to-many-mmt'
1148
-
1149
- python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='human_bot' --lora_weights='GPT-NeoXT-Chat-Base-20B.merged.json.8_epochs.57b2892c53df5b8cefac45f84d019cace803ef26.28'
1150
-
1151
- must have 4 x 48GB GPUs and run without 8-bit in order for sharding to work with infer_devices=False
1152
- can also pass --prompt_type='human_bot' and model can somewhat handle instructions without being instruct tuned
1153
- python generate.py --base_model=decapoda-research/llama-65b-hf --load_8bit=False --infer_devices=False --prompt_type='human_bot'
1154
-
1155
- python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6.9b
1156
-
1157
- """, flush=True)
1158
- fire.Fire(main)
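
The __main__ block above documents CLI invocations via fire. As a hedged illustration (not part of this commit), the same entry point can also be driven directly from Python; the sketch below assumes the renamed module is importable as generate, that a CUDA GPU is available, and that the chosen base model is only an example.

# Hypothetical sketch: programmatic use of the fire-based entry point above.
# Keyword names match main()'s signature; with gradio=False the non-gradio
# example/eval loop runs and the path of the resulting scores parquet is returned.
from generate import main

eval_file = main(
    base_model='h2oai/h2ogpt-oasst1-512-12b',  # example model; any base model supported by get_loaders
    load_8bit=True,            # helps fit a 24GB consumer GPU
    gradio=False,              # skip the UI and run the evaluation path instead
    prompt_type='human_bot',
)
print(eval_file)

This mirrors what fire.Fire(main) does when the equivalent CLI flags shown above are passed.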
app.py ADDED
@@ -0,0 +1 @@
+ generate.py
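
The replacement app.py added here is a single line containing generate.py, which appears to act as a pointer to the renamed module. As a hedged sketch (not what the commit actually ships), a plain-Python shim with the same effect might look like:

# Hypothetical app.py shim: import the renamed module and expose the same
# fire-based CLI entry point, so running app.py still launches generate.main.
import fire

from generate import main

if __name__ == "__main__":
    fire.Fire(main)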
generate.py ADDED
@@ -0,0 +1,1158 @@
1
+ import functools
2
+ import sys
3
+ import os
4
+ import traceback
5
+ import typing
6
+
7
+ from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, KThread, wrapped_partial
8
+
9
+ SEED = 1236
10
+ set_seed(SEED)
11
+
12
+ os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
13
+ from typing import Union
14
+ import numpy as np
15
+ import pandas as pd
16
+
17
+ import fire
18
+ import torch
19
+ from peft import PeftModel
20
+ from transformers import GenerationConfig, StoppingCriteriaList, AutoModel, TextIteratorStreamer
21
+ from accelerate import init_empty_weights, infer_auto_device_map
22
+
23
+ from prompter import Prompter
24
+
25
+ from finetune import get_loaders, example_data_points, generate_prompt, human, bot, inv_prompt_type_to_model_lower
26
+ from stopping import StoppingCriteriaSub
27
+
28
+ eval_extra_columns = ['prompt', 'response', 'score']
29
+
30
+
31
+ def main(
32
+ load_8bit: bool = False,
33
+ load_half: bool = True,
34
+ infer_devices: bool = True, # really if to "control" devices now
35
+ base_model: str = '',
36
+ tokenizer_base_model: str = '',
37
+ lora_weights: str = "",
38
+ gpu_id: int = 0, # if infer_devices = True and gpu_id != -1
39
+
40
+ prompt_type: Union[int, str] = None,
41
+ # input to generation
42
+ temperature: float = None,
43
+ top_p: float = None,
44
+ top_k: int = None,
45
+ num_beams: int = None,
46
+ repetition_penalty: float = None,
47
+ num_return_sequences: int = None,
48
+ do_sample: bool = None,
49
+ max_new_tokens: int = None,
50
+ min_new_tokens: int = None,
51
+ early_stopping: Union[bool, str] = None,
52
+ max_time: float = None,
53
+
54
+ debug: bool = False,
55
+ save_dir: str = None,
56
+ share: bool = True,
57
+ local_files_only: bool = False,
58
+ resume_download: bool = True,
59
+ use_auth_token: Union[str, bool] = False, # True requires CLI did huggingface-cli login before running
60
+
61
+ src_lang: str = "English",
62
+ tgt_lang: str = "Russian",
63
+
64
+ gradio: bool = True,
65
+ gradio_avoid_processing_markdown: bool = False,
66
+ chat: bool = True,
67
+ chat_history: int = 4096, # character length of chat context/history
68
+ chat_context: bool = False, # use default context if human_bot
69
+ stream_output: bool = True,
70
+ show_examples: bool = None,
71
+ verbose: bool = False,
72
+ h2ocolors: bool = True,
73
+ height: int = 400,
74
+ show_lora: bool = True,
75
+ # set to True to load --base_model after client logs in,
76
+ # to be able to free GPU memory when model is swapped
77
+ login_mode_if_model0: bool = False,
78
+ block_gradio_exit: bool = True,
79
+ concurrency_count: int = 1,
80
+ api_open: bool = False, # don't let API skip queue
81
+ allow_api: bool = True,
82
+ input_lines: int = 1,
83
+
84
+ sanitize_user_prompt: bool = True,
85
+ sanitize_bot_response: bool = True,
86
+
87
+ extra_model_options: typing.List[str] = [],
88
+ extra_lora_options: typing.List[str] = [],
89
+
90
+ score_model: str = 'OpenAssistant/reward-model-deberta-v3-large-v2',
91
+ auto_score: bool = True,
92
+
93
+ eval_sharegpt_prompts_only: int = 0,
94
+ eval_sharegpt_prompts_only_seed: int = 1234,
95
+ eval_sharegpt_as_output: bool = False,
96
+
97
+ hard_stop_list: typing.List[str] = [],
98
+ ):
99
+ is_hf = bool(os.getenv("HUGGINGFACE_SPACES"))
100
+ is_gpth2oai = bool(os.getenv("GPT_H2O_AI"))
101
+ is_public = is_hf or is_gpth2oai # multi-user case with fixed model and disclaimer
102
+ is_low_mem = is_hf # assumes run on 24GB consumer GPU
103
+ admin_pass = os.getenv("ADMIN_PASS")
104
+ # will sometimes appear in UI or sometimes actual generation, but maybe better than empty result
105
+ # but becomes unrecoverable sometimes if raise, so just be silent for now
106
+ raise_generate_gpu_exceptions = not is_public
107
+
108
+ # allow set token directly
109
+ use_auth_token = os.environ.get("HUGGINGFACE_API_TOKEN", use_auth_token)
110
+
111
+ if is_public:
112
+ input_lines = 1 # ensure set, for ease of use
113
+ temperature = 0.4
114
+ top_p = 0.85
115
+ top_k = 70
116
+ do_sample = True
117
+ if is_low_mem:
118
+ base_model = 'h2oai/h2ogpt-oasst1-512-12b'
119
+ load_8bit = True
120
+ else:
121
+ base_model = 'h2oai/h2ogpt-oasst1-512-20b'
122
+ if is_low_mem:
123
+ load_8bit = True
124
+ if is_hf:
125
+ # must override share if in spaces
126
+ share = False
127
+ save_dir = os.getenv('SAVE_DIR', save_dir)
128
+ score_model = os.getenv('SCORE_MODEL', score_model)
129
+ if score_model == 'None':
130
+ score_model = ''
131
+ concurrency_count = int(os.getenv('CONCURRENCY_COUNT', concurrency_count))
132
+ api_open = bool(int(os.getenv('API_OPEN', api_open)))
133
+ allow_api = bool(int(os.getenv('ALLOW_API', allow_api)))
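+ # Deployment knobs can also come from the environment, with the CLI values as fallbacks
+ # (SAVE_DIR, SCORE_MODEL, CONCURRENCY_COUNT, API_OPEN, ALLOW_API above). A hypothetical launch:
+ # SAVE_DIR=./saved CONCURRENCY_COUNT=2 ALLOW_API=0 python generate.py --base_model=h2oai/h2ogpt-oasst1-512-12b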
134
+
135
+ n_gpus = torch.cuda.device_count()
136
+
137
+ # get defaults
138
+ model_lower = base_model.lower()
139
+ if not gradio:
140
+ # force streaming off, else the output is not the single response we want to look at
141
+ stream_output = False
142
+ # else prompt removal can mess up output
143
+ chat = False
144
+
145
+ placeholder_instruction, placeholder_input, \
146
+ stream_output, show_examples, \
147
+ prompt_type, temperature, top_p, top_k, num_beams, \
148
+ max_new_tokens, min_new_tokens, early_stopping, max_time, \
149
+ repetition_penalty, num_return_sequences, \
150
+ do_sample, \
151
+ src_lang, tgt_lang, \
152
+ examples, \
153
+ task_info = \
154
+ get_generate_params(model_lower, chat,
155
+ stream_output, show_examples,
156
+ prompt_type, temperature, top_p, top_k, num_beams,
157
+ max_new_tokens, min_new_tokens, early_stopping, max_time,
158
+ repetition_penalty, num_return_sequences,
159
+ do_sample,
160
+ )
161
+
162
+ if not gradio:
163
+ if eval_sharegpt_prompts_only > 0:
164
+ # override default examples with shareGPT ones for human-level eval purposes only
165
+ eval_filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
166
+ if not os.path.isfile(eval_filename):
167
+ os.system(
168
+ 'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % eval_filename)
169
+ import json
170
+ data = json.load(open(eval_filename, 'rt'))
171
+ # focus on data that starts with human, else likely chopped from other data
172
+ turn_start = 0 # odd in general
173
+ data = [x for x in data if len(x['conversations']) > turn_start + 1 and
174
+ x['conversations'][turn_start]['from'] == 'human' and
175
+ x['conversations'][turn_start + 1]['from'] == 'gpt']
176
+ np.random.seed(eval_sharegpt_prompts_only_seed)
177
+ example1 = examples[-1] # pick reference example
178
+ examples = []
179
+ responses = []
180
+ for i in list(np.random.randint(0, len(data), size=eval_sharegpt_prompts_only)):
181
+ assert data[i]['conversations'][turn_start]['from'] == 'human'
182
+ instruction = data[i]['conversations'][turn_start]['value']
183
+ assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
184
+ output = data[i]['conversations'][turn_start + 1]['value']
185
+ examplenew = example1.copy()
186
+ assert not chat, "Non-gradio mode must use chat=False (uses the nochat instruct fields)"
187
+ examplenew[eval_func_param_names.index('instruction_nochat')] = instruction
188
+ examplenew[eval_func_param_names.index('iinput_nochat')] = '' # no input
189
+ examplenew[eval_func_param_names.index('context')] = get_context(chat_context, prompt_type)
190
+ examples.append(examplenew)
191
+ responses.append(output)
192
+
193
+ num_examples = len(examples)
194
+ scoring_path = 'scoring'
195
+ os.makedirs(scoring_path, exist_ok=True)
196
+ if eval_sharegpt_as_output:
197
+ used_base_model = 'gpt35'
198
+ used_lora_weights = ''
199
+ else:
200
+ used_base_model = str(base_model.split('/')[-1])
201
+ used_lora_weights = str(lora_weights.split('/')[-1])
202
+ eval_filename = "df_scores_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_sharegpt_prompts_only,
203
+ eval_sharegpt_prompts_only_seed,
204
+ eval_sharegpt_as_output,
205
+ used_base_model,
206
+ used_lora_weights)
207
+ eval_filename = os.path.join(scoring_path, eval_filename)
208
+
209
+ # torch.device("cuda") consistently leads to cuda:x vs. cuda:y mismatches in the multi-GPU case
210
+ context_class = NullContext() if n_gpus > 1 else torch.device("cuda")
211
+
212
+ with context_class:
213
+ # ensure was set right above before examples generated
214
+ assert not stream_output, "stream_output=True does not make sense with example loop"
215
+ import time
216
+ from functools import partial
217
+
218
+ # get score model
219
+ smodel, stokenizer, sdevice = get_score_model(**locals())
220
+
221
+ if not eval_sharegpt_as_output:
222
+ model, tokenizer, device = get_model(**locals())
223
+ model_state = [model, tokenizer, device, base_model]
224
+ fun = partial(evaluate, model_state, debug=debug, save_dir=save_dir, is_low_mem=is_low_mem,
225
+ raise_generate_gpu_exceptions=raise_generate_gpu_exceptions,
226
+ chat_context=chat_context,
227
+ concurrency_count=concurrency_count)
228
+ else:
229
+ assert eval_sharegpt_prompts_only > 0
230
+
231
+ def get_response(*args, exi=0):
232
+ # assumes same ordering of examples and responses
233
+ yield responses[exi]
234
+
235
+ fun = get_response
236
+ t0 = time.time()
237
+ score_dump = []
238
+
239
+ import matplotlib.pyplot as plt
240
+
241
+ for exi, ex in enumerate(examples):
242
+ instruction = ex[eval_func_param_names.index('instruction_nochat')]
243
+ iinput = ex[eval_func_param_names.index('iinput_nochat')]
244
+ context = ex[eval_func_param_names.index('context')]
245
+ clear_torch_cache()
246
+ print("")
247
+ print("START" + "=" * 100)
248
+ print("Question: %s %s" % (instruction, ('input=%s' % iinput if iinput else '')))
249
+ print("-" * 105)
250
+ # fun yields as generator, so have to iterate over it
251
+ # Also means --stream_output=True is likely undesirable here, else every partial generation would be shown
252
+ gener = fun(*tuple(ex), exi=exi) if eval_sharegpt_as_output else fun(*tuple(ex))
253
+ for res in gener:
254
+ print(res)
255
+ if smodel:
256
+ score_with_prompt = False
257
+ if score_with_prompt:
258
+ data_point = dict(instruction=instruction, input=iinput, context=context)
259
+ prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
260
+ prompt = prompter.generate_prompt(data_point)
261
+ else:
262
+ # just raw input and output
263
+ if eval_sharegpt_prompts_only > 0:
264
+ # only our own examples have this filled at the moment
265
+ assert iinput in [None, ''], iinput # should be no iinput
266
+ if not (chat_context and prompt_type == 'human_bot'):
267
+ assert context in [None, ''], context # should be no context
268
+ prompt = instruction
269
+ cutoff_len = 768 if is_low_mem else 2048
270
+ inputs = stokenizer(prompt, res,
271
+ return_tensors="pt",
272
+ truncation=True,
273
+ max_length=cutoff_len)
274
+ try:
275
+ score = torch.sigmoid(smodel(**inputs).logits[0]).cpu().detach().numpy()[0]
276
+ except torch.cuda.OutOfMemoryError as e:
277
+ print("GPU OOM 1: question: %s answer: %s exception: %s" % (prompt, res, str(e)), flush=True)
278
+ traceback.print_exc()
279
+ score = 0.0
280
+ clear_torch_cache()
281
+ except (Exception, RuntimeError) as e:
282
+ if 'Expected all tensors to be on the same device' in str(e) or \
283
+ 'expected scalar type Half but found Float' in str(e) or \
284
+ 'probability tensor contains either' in str(e) or \
285
+ 'cublasLt ran into an error!' in str(e):
286
+ print("GPU error: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
287
+ flush=True)
288
+ traceback.print_exc()
289
+ score = 0.0
290
+ clear_torch_cache()
291
+ else:
292
+ raise
293
+ print("SCORE %s: %s" % (exi, score), flush=True)
294
+ score_dump.append(ex + [prompt, res, score])
295
+ # dump every score in case abort
296
+ df_scores = pd.DataFrame(score_dump,
297
+ columns=eval_func_param_names + eval_extra_columns)
298
+ df_scores.to_parquet(eval_filename, index=False)
299
+ # plot histogram so far
300
+ plt.figure(figsize=(10, 10))
301
+ plt.hist(df_scores['score'], bins=20)
302
+ score_avg = np.mean(df_scores['score'])
303
+ score_median = np.median(df_scores['score'])
304
+ plt.title("Score avg: %s median: %s" % (score_avg, score_median))
305
+ plt.savefig(eval_filename.replace('.parquet', '.png'))
306
+ plt.close()
307
+
308
+ print("END" + "=" * 102)
309
+ print("")
310
+ t2 = time.time()
311
+ print("Time taken so far: %.4f about %.4g per example" % (t2 - t0, (t2 - t0) / (1 + exi)))
312
+ t1 = time.time()
313
+ print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples))
314
+ return eval_filename
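+ # The per-example results land in scoring/df_scores_*.parquet; a quick way to inspect them
+ # afterwards (sketch, using the pandas import above):
+ # df = pd.read_parquet(eval_filename)
+ # print(df[['prompt', 'response', 'score']].describe())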
315
+
316
+ if gradio:
317
+ # imported here so don't require gradio to run generate
318
+ from gradio_runner import go_gradio
319
+
320
+ # get default model
321
+ all_kwargs = locals().copy()
322
+ if all_kwargs.get('base_model') and not all_kwargs['login_mode_if_model0']:
323
+ model0, tokenizer0, device = get_model(**all_kwargs)
324
+ else:
325
+ # if empty model, then don't load anything, just get gradio up
326
+ model0, tokenizer0, device = None, None, None
327
+ model_state0 = [model0, tokenizer0, device, all_kwargs['base_model']]
328
+
329
+ # get score model
330
+ smodel, stokenizer, sdevice = get_score_model(**all_kwargs)
331
+ score_model_state0 = [smodel, stokenizer, sdevice, score_model]
332
+
333
+ go_gradio(**locals())
334
+
335
+
336
+ def get_device():
337
+ if torch.cuda.is_available():
338
+ device = "cuda"
339
+ else:
340
+ raise RuntimeError("only cuda supported")
341
+
342
+ return device
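+ # Minimal usage sketch (assumes at least one visible CUDA device, since CPU is not supported here):
+ # device = get_device() # -> "cuda"
+ # input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)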
343
+
344
+
345
+ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
346
+ gpu_id=0,
347
+ use_auth_token=False):
348
+ """
349
+ Ensure model gets on correct device
350
+ :param base_model:
351
+ :param model_loader:
352
+ :param load_half:
353
+ :param model_kwargs:
354
+ :param reward_type:
355
+ :param gpu_id:
356
+ :param use_auth_token:
357
+ :return:
358
+ """
359
+ with init_empty_weights():
360
+ from transformers import AutoConfig
361
+ config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token)
362
+ model = AutoModel.from_config(
363
+ config,
364
+ )
365
+
366
+ # NOTE: Can specify max_memory={0: max_mem, 1: max_mem}, to shard model
367
+ # NOTE: Some models require avoiding sharding some layers,
368
+ # then would pass no_split_module_classes and give list of those layers.
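+ # For example (hypothetical, not enabled here), sharding across two 24GB GPUs could look like:
+ # device_map = infer_auto_device_map(model, max_memory={0: "20GiB", 1: "20GiB"},
+ # no_split_module_classes=["GPTNeoXLayer"],
+ # dtype=torch.float16 if load_half else torch.float32)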
369
+ device_map = infer_auto_device_map(
370
+ model,
371
+ dtype=torch.float16 if load_half else torch.float32,
372
+ )
373
+ if hasattr(model, 'model'):
374
+ device_map_model = infer_auto_device_map(
375
+ model.model,
376
+ dtype=torch.float16 if load_half else torch.float32,
377
+ )
378
+ device_map.update(device_map_model)
379
+ print('device_map: %s' % device_map, flush=True)
380
+
381
+ if gpu_id >= 0:
382
+ # FIXME: If the model is really distributed across devices, we tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
383
+ # So avoid for now, just put on first GPU, unless score_model, put on last
384
+ n_gpus = torch.cuda.device_count()
385
+ if reward_type:
386
+ device_map = {'': n_gpus - 1}
387
+ else:
388
+ device_map = {'': min(n_gpus - 1, gpu_id)}
389
+ if gpu_id == -1:
390
+ device_map = {'': 'cuda'}
391
+
392
+ load_in_8bit = model_kwargs.get('load_in_8bit', False)
393
+ model_kwargs['device_map'] = device_map
394
+
395
+ if load_in_8bit or not load_half:
396
+ model = model_loader.from_pretrained(
397
+ base_model,
398
+ **model_kwargs,
399
+ )
400
+ else:
401
+ model = model_loader.from_pretrained(
402
+ base_model,
403
+ **model_kwargs,
404
+ ).half()
405
+ return model
406
+
407
+
408
+ def get_model(
409
+ load_8bit: bool = False,
410
+ load_half: bool = True,
411
+ infer_devices: bool = True,
412
+ base_model: str = '',
413
+ tokenizer_base_model: str = '',
414
+ lora_weights: str = "",
415
+ gpu_id: int = 0,
416
+
417
+ reward_type: bool = None,
418
+ local_files_only: bool = False,
419
+ resume_download: bool = True,
420
+ use_auth_token: Union[str, bool] = False,
421
+ compile: bool = True,
422
+ **kwargs,
423
+ ):
424
+ """
425
+
426
+ :param load_8bit: load model in 8-bit, not supported by all models
427
+ :param load_half: load model in 16-bit
428
+ :param infer_devices: whether to infer an optimal placement of layers across devices (for the non-LORA case)
429
+ For non-LORA case, False will spread shards across multiple GPUs, but this can lead to cuda:x cuda:y mismatches
430
+ So it is not the default
431
+ :param base_model: name/path of base model
432
+ :param tokenizer_base_model: name/path of tokenizer
433
+ :param lora_weights: name/path
434
+ :param gpu_id: which GPU (0..n_gpus-1) or allow all GPUs if relevant (-1)
435
+ :param reward_type: reward type model for sequence classification
436
+ :param local_files_only: use local files instead of from HF
437
+ :param resume_download: resume downloads from HF
438
+ :param use_auth_token: assumes the user ran `huggingface-cli login` on the CLI to access any private repo
439
+ :param compile: whether to compile torch model
440
+ :param kwargs:
441
+ :return:
442
+ """
443
+ print("Get %s model" % base_model, flush=True)
444
+ if lora_weights is not None and lora_weights.strip():
445
+ print("Get %s lora weights" % lora_weights, flush=True)
446
+ device = get_device()
447
+
448
+ if 'gpt2' in base_model.lower():
449
+ # RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Half
450
+ load_8bit = False
451
+
452
+ assert base_model.strip(), (
453
+ "Please choose a base model with --base_model (CLI) or in Models Tab (gradio)"
454
+ )
455
+
456
+ from transformers import AutoConfig
457
+ config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token)
458
+ llama_type_from_config = 'llama' in str(config).lower()
459
+ llama_type_from_name = "llama" in base_model.lower()
460
+ llama_type = llama_type_from_config or llama_type_from_name
461
+ if llama_type:
462
+ print("Detected as llama type from"
463
+ " config (%s) or name (%s)" % (llama_type_from_config, llama_type_from_name), flush=True)
464
+
465
+ model_loader, tokenizer_loader = get_loaders(llama_type=llama_type, model_name=base_model, reward_type=reward_type)
466
+ if not tokenizer_base_model:
467
+ tokenizer_base_model = base_model
468
+
469
+ if tokenizer_loader is not None and not isinstance(tokenizer_loader, str):
470
+ tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
471
+ local_files_only=local_files_only,
472
+ resume_download=resume_download,
473
+ use_auth_token=use_auth_token,
474
+ )
475
+ else:
476
+ tokenizer = tokenizer_loader
477
+
478
+ if isinstance(tokenizer, str):
479
+ # already a pipeline, tokenizer_loader is string for task
480
+ model = model_loader(tokenizer,
481
+ model=base_model,
482
+ device=0 if device == "cuda" else -1,
483
+ torch_dtype=torch.float16)
484
+ else:
485
+ assert device == "cuda", "Unsupported device %s" % device
486
+ model_kwargs = dict(local_files_only=local_files_only,
487
+ torch_dtype=torch.float16,
488
+ resume_download=resume_download,
489
+ use_auth_token=use_auth_token)
490
+ if 'mbart-' not in base_model.lower():
491
+ model_kwargs.update(dict(load_in_8bit=load_8bit,
492
+ device_map={"": 0} if load_8bit else "auto",
493
+ ))
494
+ if 'OpenAssistant/reward-model'.lower() in base_model.lower():
495
+ # could put on other GPUs
496
+ model_kwargs['device_map'] = {"": 0}
497
+ model_kwargs.pop('torch_dtype', None)
498
+
499
+ if not lora_weights:
500
+ with torch.device("cuda"):
501
+ if infer_devices:
502
+ model = get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
503
+ gpu_id=gpu_id, use_auth_token=use_auth_token)
504
+ else:
505
+ if load_half and not load_8bit:
506
+ model = model_loader.from_pretrained(
507
+ base_model,
508
+ **model_kwargs).half()
509
+ else:
510
+ model = model_loader.from_pretrained(
511
+ base_model,
512
+ **model_kwargs)
513
+ elif load_8bit:
514
+ model = model_loader.from_pretrained(
515
+ base_model,
516
+ **model_kwargs
517
+ )
518
+ model = PeftModel.from_pretrained(
519
+ model,
520
+ lora_weights,
521
+ torch_dtype=torch.float16,
522
+ local_files_only=local_files_only,
523
+ resume_download=resume_download,
524
+ use_auth_token=use_auth_token,
525
+ device_map={"": 0}, # seems to be required
526
+ )
527
+ else:
528
+ with torch.device("cuda"):
529
+ model = model_loader.from_pretrained(
530
+ base_model,
531
+ **model_kwargs
532
+ )
533
+ model = PeftModel.from_pretrained(
534
+ model,
535
+ lora_weights,
536
+ torch_dtype=torch.float16,
537
+ local_files_only=local_files_only,
538
+ resume_download=resume_download,
539
+ use_auth_token=use_auth_token,
540
+ device_map="auto",
541
+ )
542
+ if load_half:
543
+ model.half()
544
+
545
+ # unwind broken decapoda-research config
546
+ if llama_type:
547
+ model.config.pad_token_id = tokenizer.pad_token_id = 0 # unk
548
+ model.config.bos_token_id = 1
549
+ model.config.eos_token_id = 2
550
+ if 'gpt2' in base_model.lower():
551
+ # add special tokens that otherwise all share the same id
552
+ tokenizer.add_special_tokens({'bos_token': '<bos>',
553
+ 'eos_token': '<eos>',
554
+ 'pad_token': '<pad>'})
555
+
556
+ if not isinstance(tokenizer, str):
557
+ model.eval()
558
+ if torch.__version__ >= "2" and sys.platform != "win32" and compile:
559
+ model = torch.compile(model)
560
+
561
+ return model, tokenizer, device
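+ # Usage sketch (assumes the h2oai/h2ogpt-oasst1-512-12b weights can be downloaded and fit in memory):
+ # model, tokenizer, device = get_model(base_model='h2oai/h2ogpt-oasst1-512-12b',
+ # load_8bit=True, infer_devices=True, gpu_id=0)
+ # In main() this is driven via get_model(**all_kwargs) with the CLI arguments.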
562
+
563
+
564
+ def get_score_model(**kwargs):
565
+ # score model
566
+ if kwargs.get('score_model') is not None and kwargs.get('score_model').strip():
567
+ score_all_kwargs = kwargs.copy()
568
+ score_all_kwargs['load_8bit'] = False
569
+ score_all_kwargs['load_half'] = False
570
+ score_all_kwargs['base_model'] = kwargs.get('score_model').strip()
571
+ score_all_kwargs['tokenizer_base_model'] = ''
572
+ score_all_kwargs['lora_weights'] = ''
573
+ score_all_kwargs['llama_type'] = False
574
+ score_all_kwargs['compile'] = False
575
+ smodel, stokenizer, sdevice = get_model(**score_all_kwargs)
576
+ else:
577
+ smodel, stokenizer, sdevice = None, None, None
578
+ return smodel, stokenizer, sdevice
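+ # In main() this is invoked as get_score_model(**all_kwargs); only the 'score_model' key selects the
+ # reward model, while the remaining keyword arguments are passed through to get_model() with
+ # 8-bit/half precision and torch.compile disabled.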
579
+
580
+
581
+ eval_func_param_names = ['instruction',
582
+ 'iinput',
583
+ 'context',
584
+ 'stream_output',
585
+ 'prompt_type',
586
+ 'temperature',
587
+ 'top_p',
588
+ 'top_k',
589
+ 'num_beams',
590
+ 'max_new_tokens',
591
+ 'min_new_tokens',
592
+ 'early_stopping',
593
+ 'max_time',
594
+ 'repetition_penalty',
595
+ 'num_return_sequences',
596
+ 'do_sample',
597
+ 'chat',
598
+ 'instruction_nochat',
599
+ 'iinput_nochat',
600
+ ]
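+ # These names must stay aligned, in order, with the positional parameters of evaluate() below and with
+ # every row appended to `examples` in get_generate_params(); lookups rely on that ordering, e.g.
+ # ex[eval_func_param_names.index('instruction_nochat')] # nochat instruction of an example row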
601
+
602
+
603
+ def evaluate(
604
+ model_state,
605
+ # START NOTE: Examples must have same order of parameters
606
+ instruction,
607
+ iinput,
608
+ context,
609
+ stream_output,
610
+ prompt_type,
611
+ temperature,
612
+ top_p,
613
+ top_k,
614
+ num_beams,
615
+ max_new_tokens,
616
+ min_new_tokens,
617
+ early_stopping,
618
+ max_time,
619
+ repetition_penalty,
620
+ num_return_sequences,
621
+ do_sample,
622
+ chat,
623
+ instruction_nochat,
624
+ iinput_nochat,
625
+ # END NOTE: Examples must have same order of parameters
626
+ src_lang=None,
627
+ tgt_lang=None,
628
+ debug=False,
629
+ concurrency_count=None,
630
+ save_dir=None,
631
+ hard_stop_list=None,
632
+ sanitize_bot_response=True,
633
+ model_state0=None,
634
+ is_low_mem=None,
635
+ raise_generate_gpu_exceptions=None,
636
+ chat_context=None,
637
+ ):
638
+ # ensure passed these
639
+ assert concurrency_count is not None
640
+ assert is_low_mem is not None
641
+ assert raise_generate_gpu_exceptions is not None
642
+ assert chat_context is not None
643
+
644
+ if debug:
645
+ locals_dict = locals().copy()
646
+ locals_dict.pop('model_state', None)
647
+ locals_dict.pop('model_state0', None)
648
+ print(locals_dict)
649
+
650
+ no_model_msg = "Please choose a base model with --base_model (CLI) or in Models Tab (gradio).\nThen start New Conversation"
651
+
652
+ if model_state0 is None:
653
+ # e.g. for the no-gradio case, set a dummy value; otherwise it should already be set
654
+ model_state0 = [None, None, None, None]
655
+
656
+ if model_state is not None and len(model_state) == 4 and not isinstance(model_state[0], str):
657
+ # try to free-up original model (i.e. list was passed as reference)
658
+ if model_state0 is not None and model_state0[0] is not None:
659
+ model_state0[0].cpu()
660
+ model_state0[0] = None
661
+ # try to free-up original tokenizer (i.e. list was passed as reference)
662
+ if model_state0 is not None and model_state0[1] is not None:
663
+ model_state0[1] = None
664
+ clear_torch_cache()
665
+ model, tokenizer, device, base_model = model_state
666
+ elif model_state0 is not None and len(model_state0) == 4 and model_state0[0] is not None:
667
+ assert isinstance(model_state[0], str)
668
+ model, tokenizer, device, base_model = model_state0
669
+ else:
670
+ raise AssertionError(no_model_msg)
671
+
672
+ if base_model is None:
673
+ raise AssertionError(no_model_msg)
674
+
675
+ assert base_model.strip(), no_model_msg
676
+ assert model, "Model is missing"
677
+ assert tokenizer, "Tokenizer is missing"
678
+
679
+ # choose chat or non-chat mode
680
+ if not chat:
681
+ instruction = instruction_nochat
682
+ iinput = iinput_nochat
683
+
684
+ if not context:
685
+ # get hidden context if have one
686
+ context = get_context(chat_context, prompt_type)
687
+
688
+ data_point = dict(context=context, instruction=instruction, input=iinput)
689
+ prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
690
+ prompt = prompter.generate_prompt(data_point)
691
+
692
+ if hard_stop_list is None:
693
+ # acts like undo on user entry and bot response
694
+ hard_stop_list = []
695
+
696
+ if isinstance(tokenizer, str):
697
+ # pipeline
698
+ if tokenizer == "summarization":
699
+ key = 'summary_text'
700
+ else:
701
+ raise RuntimeError("No such task type %s" % tokenizer)
702
+ # NOTE: uses max_length only
703
+ yield model(prompt, max_length=max_new_tokens)[0][key]
704
+
705
+ if 'mbart-' in base_model.lower():
706
+ assert src_lang is not None
707
+ tokenizer.src_lang = languages_covered()[src_lang]
708
+
709
+ if chat:
710
+ # override, ignore user change
711
+ num_return_sequences = 1
712
+ if prompt_type in ['human_bot', 'instruct_vicuna', 'instruct_with_end']:
713
+ if prompt_type == 'human_bot':
714
+ # encounters = [prompt.count(human) + 1, prompt.count(bot) + 1]
715
+ # stopping only starts once output is beyond prompt
716
+ # 1 human occurrence is enough to trigger, but 2 bot occurrences are needed, because the very first one seen is the bot marker we ourselves appended
717
+ stop_words = [human, bot, '\n' + human, '\n' + bot]
718
+ encounters = [1, 2]
719
+ elif prompt_type == 'instruct_vicuna':
720
+ # even the list below is not enough; these are generic strings and there are many ways to encode them
721
+ stop_words = [
722
+ '### Human:',
723
+ """
724
+ ### Human:""",
725
+ """
726
+ ### Human:
727
+ """,
728
+ '### Assistant:',
729
+ """
730
+ ### Assistant:""",
731
+ """
732
+ ### Assistant:
733
+ """,
734
+ ]
735
+ encounters = [1, 2]
736
+ else:
737
+ # some instruct prompts have this as end, doesn't hurt to stop on it since not common otherwise
738
+ stop_words = ['### End']
739
+ encounters = [1]
740
+ stop_words_ids = [
741
+ tokenizer(stop_word, return_tensors='pt')['input_ids'].squeeze() for stop_word in stop_words]
742
+ # handle single token case
743
+ stop_words_ids = [x if len(x.shape) > 0 else torch.tensor([x]) for x in stop_words_ids]
744
+ stop_words_ids = [x for x in stop_words_ids if x.shape[0] > 0]
745
+ # avoid padding in front of tokens
746
+ if tokenizer.pad_token:
747
+ stop_words_ids = [x[1:] if x[0] == tokenizer.pad_token_id and len(x) > 1 else x for x in stop_words_ids]
748
+ # handle fake \n added
749
+ stop_words_ids = [x[1:] if y[0] == '\n' else x for x, y in zip(stop_words_ids, stop_words)]
750
+ # build stopper
751
+ stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters)])
752
+ else:
753
+ stopping_criteria = StoppingCriteriaList()
754
+
755
+ # help to avoid errors like:
756
+ # RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
757
+ # RuntimeError: expected scalar type Half but found Float
758
+ # hence the extra -256 margin below
759
+ max_length_tokenize = 768 - 256 if is_low_mem else 2048 - 256
760
+ cutoff_len = max_length_tokenize * 4 # rough character limit; if the prompt reaches it, no room is left to generate new tokens
761
+ output_smallest = 30 * 4
762
+ prompt = prompt[-cutoff_len - output_smallest:]
763
+ inputs = tokenizer(prompt,
764
+ return_tensors="pt",
765
+ truncation=True,
766
+ max_length=max_length_tokenize)
767
+ if debug and len(inputs["input_ids"]) > 0:
768
+ print('input_ids length', len(inputs["input_ids"][0]), flush=True)
769
+ input_ids = inputs["input_ids"].to(device)
770
+ generation_config = GenerationConfig(
771
+ temperature=float(temperature),
772
+ top_p=float(top_p),
773
+ top_k=top_k,
774
+ num_beams=num_beams,
775
+ do_sample=do_sample,
776
+ repetition_penalty=float(repetition_penalty),
777
+ num_return_sequences=num_return_sequences,
778
+ renormalize_logits=True,
779
+ remove_invalid_values=True,
780
+ )
781
+
782
+ gen_kwargs = dict(input_ids=input_ids,
783
+ generation_config=generation_config,
784
+ return_dict_in_generate=True,
785
+ output_scores=True,
786
+ max_new_tokens=max_new_tokens, # counts new tokens only, not the prompt
787
+ min_new_tokens=min_new_tokens, # counts new tokens only, not the prompt
788
+ early_stopping=early_stopping, # False, True, "never"
789
+ max_time=max_time,
790
+ stopping_criteria=stopping_criteria,
791
+ )
792
+ if 'gpt2' in base_model.lower():
793
+ gen_kwargs.update(dict(bos_token_id=tokenizer.bos_token_id, pad_token_id=tokenizer.eos_token_id))
794
+ elif 'mbart-' in base_model.lower():
795
+ assert tgt_lang is not None
796
+ tgt_lang = languages_covered()[tgt_lang]
797
+ gen_kwargs.update(dict(forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang]))
798
+ else:
799
+ gen_kwargs.update(dict(pad_token_id=tokenizer.eos_token_id))
800
+
801
+ decoder = functools.partial(tokenizer.decode,
802
+ skip_special_tokens=True,
803
+ clean_up_tokenization_spaces=True,
804
+ )
805
+ decoder_raw = functools.partial(tokenizer.decode,
806
+ skip_special_tokens=False,
807
+ clean_up_tokenization_spaces=True,
808
+ )
809
+
810
+ with torch.no_grad():
811
+ # decoded tokenized prompt can deviate from prompt due to special characters
812
+ inputs_decoded = decoder(input_ids[0])
813
+ inputs_decoded_raw = decoder_raw(input_ids[0])
814
+ if inputs_decoded == prompt:
815
+ # normal
816
+ pass
817
+ elif inputs_decoded.lstrip() == prompt.lstrip():
818
+ # sometimes extra space in front, make prompt same for prompt removal
819
+ prompt = inputs_decoded
820
+ elif inputs_decoded_raw == prompt:
821
+ # some models specify special tokens that are part of normal prompt, so can't skip them
822
+ inputs_decoded_raw = inputs_decoded
823
+ decoder = decoder_raw
824
+ else:
825
+ print("WARNING: Special characters in prompt", flush=True)
826
+ if stream_output:
827
+ # skip_prompt = prompt_type != 'plain'
828
+ skip_prompt = False
829
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=skip_prompt)
830
+ gen_kwargs.update(dict(streamer=streamer))
831
+ if debug:
832
+ KThread.show_threads()
833
+ target_func = generate_with_exceptions
834
+ if concurrency_count == 1:
835
+ # otherwise can't do this
836
+ KThread.kill_threads(target_func.__name__, debug=debug)
837
+ target = wrapped_partial(generate_with_exceptions, model.generate, prompt, inputs_decoded,
838
+ raise_generate_gpu_exceptions, **gen_kwargs)
839
+ thread = KThread(target=target)
840
+ thread.start()
841
+ outputs = ""
842
+ for new_text in streamer:
843
+ outputs += new_text
844
+ yield prompter.get_response(outputs, prompt=inputs_decoded,
845
+ sanitize_bot_response=sanitize_bot_response)
846
+ else:
847
+ outputs = model.generate(**gen_kwargs)
848
+ outputs = [decoder(s) for s in outputs.sequences]
849
+ yield prompter.get_response(outputs, prompt=inputs_decoded,
850
+ sanitize_bot_response=sanitize_bot_response)
851
+ if save_dir and outputs and len(outputs) >= 1:
852
+ decoded_output = prompt + outputs[0]
853
+ save_generate_output(output=decoded_output, base_model=base_model, save_dir=save_dir)
854
+
855
+
856
+ def generate_with_exceptions(func, prompt, inputs_decoded, raise_generate_gpu_exceptions, **kwargs):
857
+ try:
858
+ func(**kwargs)
859
+ except torch.cuda.OutOfMemoryError as e:
860
+ print("GPU OOM 2: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
861
+ flush=True)
862
+ if kwargs['input_ids'] is not None:
863
+ kwargs['input_ids'].cpu()
864
+ kwargs['input_ids'] = None
865
+ traceback.print_exc()
866
+ clear_torch_cache()
867
+ return
868
+ except (Exception, RuntimeError) as e:
869
+ if 'Expected all tensors to be on the same device' in str(e) or \
870
+ 'expected scalar type Half but found Float' in str(e) or \
871
+ 'probability tensor contains either' in str(e) or \
872
+ 'cublasLt ran into an error!' in str(e) or \
873
+ 'mat1 and mat2 shapes cannot be multiplied' in str(e):
874
+ print(
875
+ "GPU Error: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
876
+ flush=True)
877
+ traceback.print_exc()
878
+ clear_torch_cache()
879
+ if raise_generate_gpu_exceptions:
880
+ raise
881
+ return
882
+ else:
883
+ clear_torch_cache()
884
+ raise
885
+
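+ # Typical wiring, as used in evaluate() above when stream_output=True (sketch):
+ # target = wrapped_partial(generate_with_exceptions, model.generate, prompt, inputs_decoded,
+ # raise_generate_gpu_exceptions, **gen_kwargs)
+ # KThread(target=target).start() # generated text is then consumed from the TextIteratorStreamer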
886
+
887
+ def get_generate_params(model_lower, chat,
888
+ stream_output, show_examples,
889
+ prompt_type, temperature, top_p, top_k, num_beams,
890
+ max_new_tokens, min_new_tokens, early_stopping, max_time,
891
+ repetition_penalty, num_return_sequences,
892
+ do_sample):
893
+ use_defaults = False
894
+ use_default_examples = True
895
+ examples = []
896
+ task_info = f"{prompt_type}"
897
+ if model_lower:
898
+ print(f"Using Model {model_lower}", flush=True)
899
+ else:
900
+ print("No model defined yet", flush=True)
901
+
902
+ min_new_tokens = min_new_tokens if min_new_tokens is not None else 0
903
+ early_stopping = early_stopping if early_stopping is not None else False
904
+ max_time_defaults = 60 * 3
905
+ max_time = max_time if max_time is not None else max_time_defaults
906
+
907
+ if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
908
+ prompt_type = inv_prompt_type_to_model_lower[model_lower]
909
+
910
+ # examples at first don't include chat, instruction_nochat, iinput_nochat, added at end
911
+ if show_examples is None:
912
+ if chat:
913
+ show_examples = False
914
+ else:
915
+ show_examples = True
916
+
917
+ summarize_example1 = """Jeff: Can I train a 🤗 Transformers model on Amazon SageMaker?
918
+ Philipp: Sure you can use the new Hugging Face Deep Learning Container.
919
+ Jeff: ok.
920
+ Jeff: and how can I get started?
921
+ Jeff: where can I find documentation?
922
+ Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-partnership-amazon-sagemaker-and-hugging-face"""
923
+
924
+ if 'bart-large-cnn-samsum' in model_lower or 'flan-t5-base-samsum' in model_lower:
925
+ placeholder_instruction = summarize_example1
926
+ placeholder_input = ""
927
+ use_defaults = True
928
+ use_default_examples = False
929
+ examples += [
930
+ [placeholder_instruction, "", "", stream_output, 'plain', 1.0, 1.0, 50, 1, 128, 0, False, max_time_defaults,
931
+ 1.0, 1,
932
+ False]]
933
+ task_info = "Summarization"
934
+ elif 't5-' in model_lower or 't5' == model_lower or 'flan-' in model_lower:
935
+ placeholder_instruction = "The square root of x is the cube root of y. What is y to the power of 2, if x = 4?"
936
+ placeholder_input = ""
937
+ use_defaults = True
938
+ use_default_examples = True
939
+ task_info = "Multi-Task: Q/A, translation, Chain-of-Thought, Logical Reasoning, Summarization, etc. Best to use task prefix as trained on, e.g. `translate English to German: ` (space after colon)"
940
+ elif 'mbart-' in model_lower:
941
+ placeholder_instruction = "The girl has long hair."
942
+ placeholder_input = ""
943
+ use_defaults = True
944
+ use_default_examples = False
945
+ examples += [
946
+ [placeholder_instruction, "", "", stream_output, 'plain', 1.0, 1.0, 50, 1, 128, 0, False, max_time_defaults,
947
+ 1.0, 1,
948
+ False]]
949
+ elif 'gpt2' in model_lower:
950
+ placeholder_instruction = "The sky is"
951
+ placeholder_input = ""
952
+ prompt_type = prompt_type or 'plain'
953
+ use_default_examples = True # some will be odd "continuations" but can be ok
954
+ examples += [
955
+ [placeholder_instruction, "", "", stream_output, 'plain', 1.0, 1.0, 50, 1, 128, 0, False, max_time_defaults,
956
+ 1.0, 1,
957
+ False]]
958
+ task_info = "Auto-complete phrase, code, etc."
959
+ use_defaults = True
960
+ else:
961
+ if chat:
962
+ placeholder_instruction = "Enter a question or imperative."
963
+ else:
964
+ placeholder_instruction = "Give detailed answer for whether Einstein or Newton is smarter."
965
+ placeholder_input = ""
966
+ if model_lower:
967
+ prompt_type = prompt_type or 'human_bot'
968
+ else:
969
+ prompt_type = ''
970
+ examples += [[summarize_example1, 'Summarize' if prompt_type not in ['plain', 'instruct_simple'] else '', "",
971
+ stream_output, prompt_type or 'plain', 0.1, 0.75, 40, 4, 256, 0, False, max_time_defaults, 1.0, 1,
972
+ False]]
973
+ task_info = "No task"
974
+ if prompt_type == 'instruct':
975
+ task_info = "Answer question or follow imperative as instruction with optionally input."
976
+ elif prompt_type == 'plain':
977
+ task_info = "Auto-complete phrase, code, etc."
978
+ elif prompt_type == 'human_bot':
979
+ if chat:
980
+ task_info = "Chat (Shift-Enter to give question/imperative, input concatenated with instruction)"
981
+ else:
982
+ task_info = "Ask question/imperative (input concatenated with instruction)"
983
+
984
+ # revert to plain if still nothing
985
+ prompt_type = prompt_type or 'plain'
986
+ if use_defaults:
987
+ temperature = 1.0 if temperature is None else temperature
988
+ top_p = 1.0 if top_p is None else top_p
989
+ top_k = 40 if top_k is None else top_k
990
+ num_beams = num_beams or 1
991
+ max_new_tokens = max_new_tokens or 128
992
+ repetition_penalty = repetition_penalty or 1.07
993
+ num_return_sequences = min(num_beams, num_return_sequences or 1)
994
+ do_sample = False if do_sample is None else do_sample
995
+ else:
996
+ temperature = 0.4 if temperature is None else temperature
997
+ top_p = 0.85 if top_p is None else top_p
998
+ top_k = 70 if top_k is None else top_k
999
+ if chat:
1000
+ num_beams = num_beams or 1
1001
+ else:
1002
+ num_beams = num_beams or 4
1003
+ max_new_tokens = max_new_tokens or 256
1004
+ repetition_penalty = repetition_penalty or 1.07
1005
+ num_return_sequences = min(num_beams, num_return_sequences or 1)
1006
+ do_sample = True if do_sample is None else do_sample
1007
+ # doesn't include chat, instruction_nochat, iinput_nochat, added later
1008
+ params_list = ["", stream_output, prompt_type, temperature, top_p, top_k, num_beams, max_new_tokens, min_new_tokens,
1009
+ early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
1010
+
1011
+ if use_default_examples:
1012
+ examples += [
1013
+ ["Translate English to French", "Good morning"] + params_list,
1014
+ ["Give detailed answer for whether Einstein or Newton is smarter.", ''] + params_list,
1015
+ ["Explain in detailed list, all the best practices for coding in python.", ''] + params_list,
1016
+ [
1017
+ "Create a markdown table with 3 rows for the primary colors, and 2 columns, with color name and hex codes.",
1018
+ ''] + params_list,
1019
+ ['Translate to German: My name is Arthur', ''] + params_list,
1020
+ ["Please answer to the following question. Who is going to be the next Ballon d'or?", ''] + params_list,
1021
+ ['Can Geoffrey Hinton have a conversation with George Washington? Give the rationale before answering.',
1022
+ ''] + params_list,
1023
+ ['Please answer the following question. What is the boiling point of Nitrogen?', ''] + params_list,
1024
+ ['Answer the following yes/no question. Can you write a whole Haiku in a single tweet?', ''] + params_list,
1025
+ ["Simplify the following expression: (False or False and True). Explain your answer.", ''] + params_list,
1026
+ [
1027
+ "Premise: At my age you will probably have learnt one lesson. Hypothesis: It's not certain how many lessons you'll learn by your thirties. Does the premise entail the hypothesis?",
1028
+ ''] + params_list,
1029
+ ['The square root of x is the cube root of y. What is y to the power of 2, if x = 4?', ''] + params_list,
1030
+ [
1031
+ 'Answer the following question by reasoning step by step. The cafeteria had 23 apples. If they used 20 for lunch, and bought 6 more, how many apple do they have?',
1032
+ ''] + params_list,
1033
+ ["""def area_of_rectangle(a: float, b: float):
1034
+ \"\"\"Return the area of the rectangle.\"\"\"""", ''] + params_list,
1035
+ ["""# a function in native python:
1036
+ def mean(a):
1037
+ return sum(a)/len(a)
1038
+
1039
+ # the same function using numpy:
1040
+ import numpy as np
1041
+ def mean(a):""", ''] + params_list,
1042
+ ["""X = np.random.randn(100, 100)
1043
+ y = np.random.randint(0, 1, 100)
1044
+
1045
+ # fit random forest classifier with 20 estimators""", ''] + params_list,
1046
+ ]
1047
+
1048
+ src_lang = "English"
1049
+ tgt_lang = "Russian"
1050
+
1051
+ # move to correct position
1052
+ for example in examples:
1053
+ example += [chat, '', '']
1054
+ # adjust examples if non-chat mode
1055
+ if not chat:
1056
+ example[eval_func_param_names.index('instruction_nochat')] = example[
1057
+ eval_func_param_names.index('instruction')]
1058
+ example[eval_func_param_names.index('instruction')] = ''
1059
+
1060
+ example[eval_func_param_names.index('iinput_nochat')] = example[eval_func_param_names.index('iinput')]
1061
+ example[eval_func_param_names.index('iinput')] = ''
1062
+
1063
+ return placeholder_instruction, placeholder_input, \
1064
+ stream_output, show_examples, \
1065
+ prompt_type, temperature, top_p, top_k, num_beams, \
1066
+ max_new_tokens, min_new_tokens, early_stopping, max_time, \
1067
+ repetition_penalty, num_return_sequences, \
1068
+ do_sample, \
1069
+ src_lang, tgt_lang, \
1070
+ examples, \
1071
+ task_info
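+ # Example call (mirrors the non-gradio path in main(); all None values fall back to the defaults above):
+ # params = get_generate_params('h2oai/h2ogpt-oasst1-512-12b', chat=False, stream_output=False,
+ # show_examples=None, prompt_type=None, temperature=None, top_p=None,
+ # top_k=None, num_beams=None, max_new_tokens=None, min_new_tokens=None,
+ # early_stopping=None, max_time=None, repetition_penalty=None,
+ # num_return_sequences=None, do_sample=None)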
1072
+
1073
+
1074
+ def languages_covered():
1075
+ # https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt#languages-covered
1076
+ covered = """Arabic (ar_AR), Czech (cs_CZ), German (de_DE), English (en_XX), Spanish (es_XX), Estonian (et_EE), Finnish (fi_FI), French (fr_XX), Gujarati (gu_IN), Hindi (hi_IN), Italian (it_IT), Japanese (ja_XX), Kazakh (kk_KZ), Korean (ko_KR), Lithuanian (lt_LT), Latvian (lv_LV), Burmese (my_MM), Nepali (ne_NP), Dutch (nl_XX), Romanian (ro_RO), Russian (ru_RU), Sinhala (si_LK), Turkish (tr_TR), Vietnamese (vi_VN), Chinese (zh_CN), Afrikaans (af_ZA), Azerbaijani (az_AZ), Bengali (bn_IN), Persian (fa_IR), Hebrew (he_IL), Croatian (hr_HR), Indonesian (id_ID), Georgian (ka_GE), Khmer (km_KH), Macedonian (mk_MK), Malayalam (ml_IN), Mongolian (mn_MN), Marathi (mr_IN), Polish (pl_PL), Pashto (ps_AF), Portuguese (pt_XX), Swedish (sv_SE), Swahili (sw_KE), Tamil (ta_IN), Telugu (te_IN), Thai (th_TH), Tagalog (tl_XX), Ukrainian (uk_UA), Urdu (ur_PK), Xhosa (xh_ZA), Galician (gl_ES), Slovene (sl_SI)"""
1077
+ covered = covered.split(', ')
1078
+ covered = {x.split(' ')[0]: x.split(' ')[1].replace(')', '').replace('(', '') for x in covered}
1079
+ return covered
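+ # e.g. languages_covered()['Russian'] == 'ru_RU' and languages_covered()['English'] == 'en_XX',
+ # which is how the src_lang/tgt_lang names are mapped to mBART language codes in evaluate().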
1080
+
1081
+
1082
+ def get_context(chat_context, prompt_type):
1083
+ if chat_context and prompt_type == 'human_bot':
1084
+ context0 = """<bot>: I am an intelligent, helpful, truthful, and fair assistant named h2oGPT, who will give accurate, balanced, and reliable responses. I will not respond with I don't know or I don't understand.
1085
+ <human>: I am a human person seeking useful assistance and request all questions be answered completely, and typically expect detailed responses. Give answers in numbered list format if several distinct but related items are being listed."""
1086
+ else:
1087
+ context0 = ''
1088
+ return context0
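+ # e.g. get_context(True, 'human_bot') returns the hidden <bot>:/<human>: preamble above, while
+ # get_context(False, 'human_bot') and get_context(True, 'instruct') both return '' (no injected context).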
1089
+
1090
+
1091
+ def test_test_prompt(prompt_type='instruct', data_point=0):
1092
+ example_data_point = example_data_points[data_point]
1093
+ example_data_point.pop('output', None)
1094
+ return generate_prompt(example_data_point, prompt_type, False, False)
1095
+
1096
+
1097
+ def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_len):
1098
+ question = question[-cutoff_len:]
1099
+ answer = answer[-cutoff_len:]
1100
+
1101
+ inputs = stokenizer(question, answer,
1102
+ return_tensors="pt",
1103
+ truncation=True,
1104
+ max_length=max_length_tokenize).to(smodel.device)
1105
+ try:
1106
+ score = torch.sigmoid(smodel(**inputs).logits[0]).cpu().detach().numpy()[0]
1107
+ except torch.cuda.OutOfMemoryError as e:
1108
+ print("GPU OOM 3: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
1109
+ del inputs
1110
+ traceback.print_exc()
1111
+ clear_torch_cache()
1112
+ return 'Response Score: GPU OOM'
1113
+ except (Exception, RuntimeError) as e:
1114
+ if 'Expected all tensors to be on the same device' in str(e) or \
1115
+ 'expected scalar type Half but found Float' in str(e) or \
1116
+ 'probability tensor contains either' in str(e) or \
1117
+ 'cublasLt ran into an error!' in str(e):
1118
+ print("GPU Error: question: %s answer: %s exception: %s" % (question, answer, str(e)),
1119
+ flush=True)
1120
+ traceback.print_exc()
1121
+ clear_torch_cache()
1122
+ return 'Response Score: GPU Error'
1123
+ else:
1124
+ raise
1125
+ os.environ['TOKENIZERS_PARALLELISM'] = 'true'
1126
+ return score
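+ # Usage sketch (assumes the reward model loaded by get_score_model() and a CUDA device):
+ # smodel, stokenizer, sdevice = get_score_model(score_model='OpenAssistant/reward-model-deberta-v3-large-v2')
+ # score_qa(smodel, stokenizer, 2048, "Why is the sky blue?", "Because of Rayleigh scattering.", 2048 * 4)
+ # # -> a float in [0, 1] from the sigmoid over the reward model logits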
1127
+
1128
+
1129
+ if __name__ == "__main__":
1130
+ print("""
1131
+ WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 --master_port=1234 generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca_6B
1132
+ python generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights='lora-alpaca_6B'
1133
+ python generate.py --base_model='EleutherAI/gpt-neox-20b' --lora_weights='lora-alpaca_20B'
1134
+
1135
+ # generate without lora weights, no prompt
1136
+ python generate.py --base_model='EleutherAI/gpt-neox-20b' --prompt_type='plain'
1137
+ python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='dai_faq'
1138
+
1139
+ python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='dai_faq' --lora_weights='lora_20B_daifaq'
1140
+ # OpenChatKit settings:
1141
+ python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='human_bot' --debug=True --num_beams=1 --temperature=0.6 --top_k=40 --top_p=1.0
1142
+
1143
+ python generate.py --base_model='distilgpt2' --prompt_type='plain' --debug=True --num_beams=1 --temperature=0.6 --top_k=40 --top_p=1.0 --share=False
1144
+ python generate.py --base_model='t5-large' --prompt_type='simple_instruct'
1145
+ python generate.py --base_model='philschmid/bart-large-cnn-samsum'
1146
+ python generate.py --base_model='philschmid/flan-t5-base-samsum'
1147
+ python generate.py --base_model='facebook/mbart-large-50-many-to-many-mmt'
1148
+
1149
+ python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='human_bot' --lora_weights='GPT-NeoXT-Chat-Base-20B.merged.json.8_epochs.57b2892c53df5b8cefac45f84d019cace803ef26.28'
1150
+
1151
+ must have 4 x 48GB GPUs and run without 8-bit in order for sharding to work with infer_devices=False
1152
+ can also pass --prompt_type='human_bot'; the model can somewhat handle instructions without being instruct-tuned
1153
+ python generate.py --base_model=decapoda-research/llama-65b-hf --load_8bit=False --infer_devices=False --prompt_type='human_bot'
1154
+
1155
+ python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6.9b
1156
+
1157
+ """, flush=True)
1158
+ fire.Fire(main)