pseudotensor committed
Commit 31f9cfa
Parent: 1265a5f

Update with h2oGPT hash dba6431da758fe9d822c9659f144ee64ea80f111

Files changed (3)
  1. generate.py +42 -24
  2. stopping.py +2 -2
  3. utils.py +1 -1
generate.py CHANGED
@@ -6,6 +6,7 @@ import typing
 from threading import Thread
 
 import filelock
+import psutil
 
 from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, wrapped_partial
 
@@ -135,7 +136,19 @@ def main(
     api_open = bool(int(os.getenv('API_OPEN', api_open)))
     allow_api = bool(int(os.getenv('ALLOW_API', allow_api)))
 
-    n_gpus = torch.cuda.device_count()
+    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
+    if n_gpus == 0:
+        gpu_id = None
+        load_8bit = False
+        load_half = False
+        infer_devices = False
+        torch.backends.cudnn.benchmark = True
+        torch.backends.cudnn.enabled = False
+        torch.set_default_dtype(torch.float32)
+        if psutil.virtual_memory().available < 94*1024**3:
+            # 12B uses ~94GB
+            # 6.9B uses ~47GB
+            base_model = 'h2oai/h2ogpt-oig-oasst1-512-6.9b'
 
     # get defaults
     model_lower = base_model.lower()
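
The hunk above is the heart of the commit: when no GPU is visible, 8-bit, half-precision, and device inference are all disabled, and if free RAM is under ~94 GB the default model drops to the 6.9B variant. A standalone sketch of the same detection logic, where pick_base_model is a hypothetical helper and the 12B default id is a placeholder:

import psutil
import torch

def pick_base_model(default='h2oai/h2ogpt-oasst1-512-12b'):  # default id is a placeholder
    # device_count() returns 0 without CUDA, but guard explicitly as the commit does.
    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
    if n_gpus == 0 and psutil.virtual_memory().available < 94 * 1024 ** 3:
        # Per the commit's comments: CPU inference of 12B needs ~94GB RAM, 6.9B ~47GB.
        return 'h2oai/h2ogpt-oig-oasst1-512-6.9b', n_gpus
    return default, n_gpus
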
@@ -210,7 +223,7 @@ def main(
     eval_filename = os.path.join(scoring_path, eval_filename)
 
     # torch.device("cuda") leads to cuda:x cuda:y mismatches for multi-GPU consistently
-    context_class = NullContext() if n_gpus > 1 else torch.device("cuda")
+    context_class = NullContext() if n_gpus > 1 or n_gpus == 0 else torch.device("cuda")
 
     with context_class:
         # ensure was set right above before examples generated
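
NullContext (imported from utils at the top of generate.py) is a do-nothing context manager, so CPU and multi-GPU runs now skip the torch.device("cuda") default-device pinning that causes the cuda:x/cuda:y mismatches noted in the comment. A minimal sketch of such a context; the real utils.py implementation may differ, and contextlib.nullcontext is the stdlib equivalent:

class NullContext:
    # No-op context manager: entering it changes nothing, so model
    # loading follows torch's normal device placement.
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        return False  # do not suppress exceptions
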
@@ -340,7 +353,7 @@ def get_device():
     if torch.cuda.is_available():
         device = "cuda"
     else:
-        raise RuntimeError("only cuda supported")
+        device = "cpu"
 
     return device
 
@@ -381,16 +394,21 @@ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward
     device_map.update(device_map_model)
     print('device_map: %s' % device_map, flush=True)
 
-    if gpu_id >= 0:
-        # FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
-        # So avoid for now, just put on first GPU, unless score_model, put on last
-        n_gpus = torch.cuda.device_count()
-        if reward_type:
-            device_map = {'': n_gpus - 1}
-        else:
-            device_map = {'': min(n_gpus - 1, gpu_id)}
-    if gpu_id == -1:
-        device_map = {'': 'cuda'}
+    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
+
+    if n_gpus > 0:
+        if gpu_id >= 0:
+            # FIXME: If really distributes model, tend to get things like: ValueError: gpt_neox.embed_in.weight doesn't have any device set.
+            # So avoid for now, just put on first GPU, unless score_model, put on last
+            if reward_type:
+                device_map = {'': n_gpus - 1}
+            else:
+                device_map = {'': min(n_gpus - 1, gpu_id)}
+        if gpu_id == -1:
+            device_map = {'': 'cuda'}
+    else:
+        device_map = {'': 'cpu'}
+        model_kwargs['load_in_8bit'] = False
 
     load_in_8bit = model_kwargs.get('load_in_8bit', False)
     model_kwargs['device_map'] = device_map
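
device_map is the Hugging Face Accelerate placement dict: the empty-string key addresses the whole module tree, so {'': 0} pins the entire model to GPU 0, {'': n_gpus - 1} to the last GPU, and the new {'': 'cpu'} branch keeps everything on CPU (where bitsandbytes 8-bit loading is unavailable, hence load_in_8bit is forced off). A hedged sketch with a placeholder model id:

from transformers import AutoModelForCausalLM  # requires accelerate for device_map

# '' (the root module) maps everything at once; per-submodule keys are
# also possible, e.g. {'transformer.h.0': 0, 'transformer.h.1': 1, ...}.
model = AutoModelForCausalLM.from_pretrained(
    'some-org/some-causal-lm',   # placeholder id
    device_map={'': 0},          # whole model on GPU 0; use {'': 'cpu'} without GPUs
)
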
@@ -483,24 +501,24 @@ def get_model(
         model = model_loader(tokenizer,
                              model=base_model,
                              device=0 if device == "cuda" else -1,
-                             torch_dtype=torch.float16)
+                             torch_dtype=torch.float16 if device == 'cuda' else torch.float32)
     else:
-        assert device == "cuda", "Unsupported device %s" % device
+        assert device in ["cuda", "cpu"], "Unsupported device %s" % device
         model_kwargs = dict(local_files_only=local_files_only,
-                            torch_dtype=torch.float16,
+                            torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
                             resume_download=resume_download,
                             use_auth_token=use_auth_token)
         if 'mbart-' not in base_model.lower():
             model_kwargs.update(dict(load_in_8bit=load_8bit,
-                                     device_map={"": 0} if load_8bit else "auto",
+                                     device_map={"": 0} if load_8bit and device == 'cuda' else "auto",
                                      ))
         if 'OpenAssistant/reward-model'.lower() in base_model.lower():
             # could put on other GPUs
-            model_kwargs['device_map'] = {"": 0}
+            model_kwargs['device_map'] = {"": 0} if device == 'cuda' else {"": 'cpu'}
             model_kwargs.pop('torch_dtype', None)
 
         if not lora_weights:
-            with torch.device("cuda"):
+            with torch.device(device):
                 if infer_devices:
                     model = get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
                                                gpu_id=gpu_id, use_auth_token=use_auth_token)
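
The repeated torch.float16 if device == 'cuda' else torch.float32 exists because most CPU kernels have no (or very slow) half-precision paths, while fp16 halves memory use and speeds up inference on GPU. If the pattern keeps spreading it could be factored into a helper; a hypothetical one, not part of the commit:

import torch

def dtype_for(device: str) -> torch.dtype:
    # fp16 on GPU for memory/speed; fp32 on CPU, where half-precision
    # kernels are largely missing or slow.
    return torch.float16 if device == 'cuda' else torch.float32
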
@@ -521,14 +539,14 @@ def get_model(
                 model = PeftModel.from_pretrained(
                     model,
                     lora_weights,
-                    torch_dtype=torch.float16,
+                    torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
                     local_files_only=local_files_only,
                     resume_download=resume_download,
                     use_auth_token=use_auth_token,
-                    device_map={"": 0},  # seems to be required
+                    device_map={"": 0} if device == 'cuda' else {"": 'cpu'},  # seems to be required
                 )
         else:
-            with torch.device("cuda"):
+            with torch.device(device):
                 model = model_loader.from_pretrained(
                     base_model,
                     **model_kwargs
@@ -536,7 +554,7 @@ def get_model(
                 model = PeftModel.from_pretrained(
                     model,
                     lora_weights,
-                    torch_dtype=torch.float16,
+                    torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
                     local_files_only=local_files_only,
                     resume_download=resume_download,
                     use_auth_token=use_auth_token,
@@ -751,7 +769,7 @@ def evaluate(
         # handle fake \n added
         stop_words_ids = [x[1:] if y[0] == '\n' else x for x, y in zip(stop_words_ids, stop_words)]
         # build stopper
-        stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters)])
+        stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters, device=device)])
     else:
         stopping_criteria = StoppingCriteriaList()
 
stopping.py CHANGED
@@ -9,11 +9,11 @@ from transformers import StoppingCriteria
 
 class StoppingCriteriaSub(StoppingCriteria):
 
-    def __init__(self, stops=[], encounters=[]):
+    def __init__(self, stops=[], encounters=[], device="cuda"):
         super().__init__()
         assert len(stops) % len(encounters) == 0, "Number of stops and encounters must match"
         self.encounters = encounters
-        self.stops = [stop.to("cuda") for stop in stops]
+        self.stops = [stop.to(device) for stop in stops]
         self.num_stops = [0] * len(stops)
 
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
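
With the new device argument, the stop-word tensors are moved to the same device as the generated input_ids, so the comparison in __call__ no longer assumes CUDA. Rough usage, assuming tokenizer, model, and inputs already exist (the stop phrase is illustrative):

import torch
from transformers import StoppingCriteriaList
from stopping import StoppingCriteriaSub

device = "cuda" if torch.cuda.is_available() else "cpu"
stop_words = ["<human>:"]  # illustrative stop phrase
stop_words_ids = [torch.tensor(tokenizer(w, add_special_tokens=False)["input_ids"])
                  for w in stop_words]
stopping_criteria = StoppingCriteriaList(
    [StoppingCriteriaSub(stops=stop_words_ids, encounters=[1], device=device)])
outputs = model.generate(**inputs, stopping_criteria=stopping_criteria)
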
utils.py CHANGED
@@ -46,7 +46,7 @@ def flatten_list(lis):
 
 def clear_torch_cache():
     import torch
-    if torch.cuda.is_available:
+    if torch.cuda.is_available():
         torch.cuda.empty_cache()
         torch.cuda.ipc_collect()
     gc.collect()
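
This one-character-class fix matters: the bare attribute torch.cuda.is_available is a function object, which is always truthy, so the guard always passed and torch.cuda.empty_cache() could raise on CPU-only machines. A quick demonstration:

import torch

print(bool(torch.cuda.is_available))    # always True: a function object is truthy
print(bool(torch.cuda.is_available()))  # True only if CUDA is actually usable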