pseudotensor committed
Commit 5cf48e0
1 Parent(s): efe0924

More restrictions for HF spaces to stabilize against GPU OOM and hide unusable options

Files changed (3):
  1. app.py +71 -17
  2. requirements.txt +4 -0
  3. utils.py +50 -0
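
Editor's note: the app.py changes below repeatedly apply one pattern: check the HUGGINGFACE_SPACES environment variable and either hide a Gradio control or cap its range so a single request cannot push the shared Space GPU into OOM. A minimal sketch of that pattern, with illustrative component names rather than the app's own:

import os
import gradio as gr

# True when the app runs inside a Hugging Face Space (env var checked throughout app.py)
IS_HF_SPACES = bool(os.environ.get("HUGGINGFACE_SPACES"))

with gr.Blocks() as demo:
    # hide options that are unusable in the shared multi-user Space
    prompt_type = gr.Dropdown(["plain", "instruct"], label="Prompt Type",
                              visible=not IS_HF_SPACES)
    # cap expensive settings so one request cannot exhaust the shared GPU
    max_beams = 2 if IS_HF_SPACES else 8
    num_beams = gr.Slider(minimum=1, maximum=max_beams, step=1, value=1, label="Beams")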
app.py CHANGED
@@ -5,7 +5,7 @@ import os
 import traceback
 import typing
 
-from utils import set_seed, flatten_list, clear_torch_cache
+from utils import set_seed, flatten_list, clear_torch_cache, system_info_print
 
 SEED = 1236
 set_seed(SEED)
@@ -210,6 +210,16 @@ def main(
             traceback.print_exc()
             score = 0.0
             clear_torch_cache()
+        except RuntimeError as e:
+            if 'Expected all tensors to be on the same device' in str(
+                    e) or 'expected scalar type Half but found Float' in str(e):
+                print("GPU error: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
+                      flush=True)
+                traceback.print_exc()
+                score = 0.0
+                clear_torch_cache()
+            else:
+                raise
         print("SCORE %s: %s" % (exi, score), flush=True)
         score_dump.append(ex + [prompt, res, score])
         # dump every score in case abort
@@ -515,7 +525,7 @@ def go_gradio(**kwargs):
     if os.environ.get("HUGGINGFACE_SPACES"):
         description += """<p><b> DISCLAIMERS: </b><ul><i><li>The data used to train this model include The Pile and other sources. These may contain objectionable content, so the model may reproduce that material. Use application and responses at own risk.</i></li>"""
         if kwargs['load_8bit']:
-            description += """<i><li> Model is loaded in 8-bit and 768 token context length to fit on HF GPUs, so model may perform worse than 16-bit with 2048 token limit.</i></li>"""
+            description += """<i><li> Model is loaded in 8-bit and HF spaces version has other limitations in order to fit on HF GPUs, so UX can be worse than native app.</i></li>"""
         description += """<i><li>Model loading and unloading disabled on HF SPACES to avoid GPU OOM for multi-user environment.</i></li></ul></p>"""
 
     if kwargs['verbose']:
@@ -668,7 +678,8 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
     stream_output = gr.components.Checkbox(label="Stream output",
                                            value=kwargs['stream_output'])
     prompt_type = gr.Dropdown(prompt_types_strings,
-                              value=kwargs['prompt_type'], label="Prompt Type")
+                              value=kwargs['prompt_type'], label="Prompt Type",
+                              visible=not os.environ.get("HUGGINGFACE_SPACES"))
     temperature = gr.Slider(minimum=0, maximum=3,
                             value=kwargs['temperature'],
                             label="Temperature",
@@ -681,35 +692,43 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                     value=kwargs['top_k'], label="Top k",
                     info='Num. tokens to sample from'
                     )
-    num_beams = gr.Slider(minimum=1, maximum=8, step=1,
-                          value=kwargs['num_beams'], label="Beams",
-                          info="Number of searches for optimal overall probability. Uses more GPU memory/compute")
+    max_beams = 8 if not os.environ.get("HUGGINGFACE_SPACES") else 2
+    num_beams = gr.Slider(minimum=1, maximum=max_beams, step=1,
+                          value=min(max_beams, kwargs['num_beams']), label="Beams",
+                          info="Number of searches for optimal overall probability. "
+                               "Uses more GPU memory/compute")
+    max_max_new_tokens = 2048 if not os.environ.get("HUGGINGFACE_SPACES") else kwargs['max_new_tokens']
     max_new_tokens = gr.Slider(
-        minimum=1, maximum=2048, step=1,
-        value=kwargs['max_new_tokens'], label="Max output length"
+        minimum=1, maximum=max_max_new_tokens, step=1,
+        value=min(max_max_new_tokens, kwargs['max_new_tokens']), label="Max output length",
     )
     min_new_tokens = gr.Slider(
-        minimum=0, maximum=2048, step=1,
-        value=kwargs['min_new_tokens'], label="Min output length"
+        minimum=0, maximum=max_max_new_tokens, step=1,
+        value=min(max_max_new_tokens, kwargs['min_new_tokens']), label="Min output length",
    )
     early_stopping = gr.Checkbox(label="EarlyStopping", info="Stop early in beam search",
                                  value=kwargs['early_stopping'])
-    max_time = gr.Slider(minimum=0, maximum=60 * 5, step=1,
-                         value=kwargs['max_time'], label="Max. time",
+    max_max_time = 60 * 5 if not os.environ.get("HUGGINGFACE_SPACES") else 60
+    max_time = gr.Slider(minimum=0, maximum=max_max_time, step=1,
+                         value=min(max_max_time, kwargs['max_time']), label="Max. time",
                          info="Max. time to search optimal output.")
     repetition_penalty = gr.Slider(minimum=0.01, maximum=3.0,
                                    value=kwargs['repetition_penalty'],
                                    label="Repetition Penalty")
     num_return_sequences = gr.Slider(minimum=1, maximum=10, step=1,
                                      value=kwargs['num_return_sequences'],
-                                     label="Number Returns", info="Must be <= num_beams")
+                                     label="Number Returns", info="Must be <= num_beams",
+                                     visible=not os.environ.get("HUGGINGFACE_SPACES"))
     do_sample = gr.Checkbox(label="Sample", info="Sample, for diverse output(s)",
                             value=kwargs['do_sample'])
     if kwargs['chat']:
         iinput = gr.Textbox(lines=4, label="Input",
-                            placeholder=kwargs['placeholder_input'])
+                            placeholder=kwargs['placeholder_input'],
+                            visible=not os.environ.get("HUGGINGFACE_SPACES"))
+        # nominally empty for chat mode
         context = gr.Textbox(lines=1, label="Context",
-                             info="Ignored in chat mode.")  # nominally empty for chat mode
+                             info="Ignored in chat mode.",
+                             visible=not os.environ.get("HUGGINGFACE_SPACES"))
 
 with gr.TabItem("Models"):
     with gr.Row():
@@ -731,6 +750,12 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
         with gr.Column(scale=1):
             add_model_button = gr.Button("Add new model name")
             add_lora_button = gr.Button("Add new LORA name", visible=kwargs['show_lora'])
+with gr.TabItem("System"):
+    with gr.Row():
+        with gr.Column():
+            system_text = gr.Textbox(label='System Info')
+            system_btn = gr.Button(value='Get System Info')
+
 
 inputs_list = get_inputs_list(locals(), kwargs['model_lower'])
 from functools import partial
@@ -801,6 +826,14 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
             traceback.print_exc()
             clear_torch_cache()
             return 'Response Score: GPU OOM'
+        except RuntimeError as e:
+            if 'Expected all tensors to be on the same device' in str(e) or 'expected scalar type Half but found Float' in str(e):
+                print("GPU Error: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
+                traceback.print_exc()
+                clear_torch_cache()
+                return 'Response Score: GPU Error'
+            else:
+                raise
         os.environ['TOKENIZERS_PARALLELISM'] = 'true'
         return 'Response Score: {:.1%}'.format(score)
     else:
@@ -987,7 +1020,10 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
     prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
     chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
     if not os.environ.get("HUGGINGFACE_SPACES"):
-        load_model_event = load_model_button.click(**load_model_args).then(**prompt_update_args).then(**chatbot_update_args).then(clear_torch_cache)
+        load_model_event = load_model_button.click(**load_model_args) \
+            .then(**prompt_update_args) \
+            .then(**chatbot_update_args) \
+            .then(clear_torch_cache)
 
     def dropdown_model_list(list0, x):
         new_state = [list0[0] + [x]]
@@ -1015,6 +1051,12 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
     callback.setup(inputs_list + [text_output], "flagged_data_points")
     flag_btn.click(lambda *args: callback.flag(args), inputs_list + [text_output], None, preprocess=False,
                    api_name='flag')
+
+    def get_system_info():
+        return gr.Textbox.update(value=system_info_print())
+
+    system_event = system_btn.click(get_system_info, outputs=system_text, api_name='system_info')
+
     if kwargs['chat']:
 
         # don't pass text_output, don't want to clear output, just stop it
@@ -1275,13 +1317,25 @@ def evaluate(
         try:
             model.generate(**kwargs)
         except torch.cuda.OutOfMemoryError as e:
-            print("GPU OOM: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)), flush=True)
+            print("GPU OOM: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
+                  flush=True)
             if kwargs['input_ids'] is not None:
                 kwargs['input_ids'].cpu()
             kwargs['input_ids'] = None
             traceback.print_exc()
             clear_torch_cache()
             return
+        except RuntimeError as e:
+            if 'Expected all tensors to be on the same device' in str(
+                    e) or 'expected scalar type Half but found Float' in str(e):
+                print(
+                    "GPU Error: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
+                    flush=True)
+                traceback.print_exc()
+                clear_torch_cache()
+                return
+            else:
+                raise
 
         for output in CallbackToGenerator(generate, callback=None, **gen_kwargs):
             decoded_output = decoder(output)
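
Editor's note: the three new except RuntimeError blocks above share one shape: swallow only the known device/dtype mismatch errors, log and clear the CUDA cache, and re-raise anything else. A condensed sketch of that pattern (generate_safely and its arguments are illustrative, not functions in app.py):

import traceback
import torch

def generate_safely(model, **gen_kwargs):
    try:
        return model.generate(**gen_kwargs)
    except torch.cuda.OutOfMemoryError:
        # out of memory: log, free cached blocks, and give up on this request only
        traceback.print_exc()
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        return None
    except RuntimeError as e:
        # only swallow the specific device/dtype mismatch errors; re-raise everything else
        if ('Expected all tensors to be on the same device' in str(e)
                or 'expected scalar type Half but found Float' in str(e)):
            traceback.print_exc()
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
            return None
        raise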
requirements.txt CHANGED
@@ -23,6 +23,10 @@ git+https://github.com/huggingface/peft.git@098962fa6515f2e4fe83a757f5995d3ffbb1
 transformers==4.28.1
 tokenizers==0.13.3
 
+# optional for generate
+pynvml==11.5.0
+psutil==5.9.4
+
 # optional for finetune
 tensorboard==2.12.1
 neptune==1.1.1
utils.py CHANGED
@@ -1,7 +1,9 @@
 import os
 import gc
 import random
+import time
 import numpy as np
+import pandas as pd
 import torch
 
 
@@ -37,3 +39,51 @@ def clear_torch_cache():
     torch.cuda.empty_cache()
     torch.cuda.ipc_collect()
     gc.collect()
+
+
+def system_info():
+    import psutil
+
+    system = {}
+    # https://stackoverflow.com/questions/48951136/plot-multiple-graphs-in-one-plot-using-tensorboard
+    # https://arshren.medium.com/monitoring-your-devices-in-python-5191d672f749
+    temps = psutil.sensors_temperatures(fahrenheit=False)
+    if 'coretemp' in temps:
+        coretemp = temps['coretemp']
+        temp_dict = {k.label: k.current for k in coretemp}
+        for k, v in temp_dict.items():
+            system['CPU_C/%s' % k] = v
+
+    # https://github.com/gpuopenanalytics/pynvml/blob/master/help_query_gpu.txt
+    from pynvml.smi import nvidia_smi
+    nvsmi = nvidia_smi.getInstance()
+
+    gpu_power_dict = {'W_gpu%d' % i: x['power_readings']['power_draw'] for i, x in
+                      enumerate(nvsmi.DeviceQuery('power.draw')['gpu'])}
+    for k, v in gpu_power_dict.items():
+        system['GPU_W/%s' % k] = v
+
+    gpu_temp_dict = {'C_gpu%d' % i: x['temperature']['gpu_temp'] for i, x in
+                     enumerate(nvsmi.DeviceQuery('temperature.gpu')['gpu'])}
+    for k, v in gpu_temp_dict.items():
+        system['GPU_C/%s' % k] = v
+
+    gpu_memory_free_dict = {'MiB_gpu%d' % i: x['fb_memory_usage']['free'] for i, x in
+                            enumerate(nvsmi.DeviceQuery('memory.free')['gpu'])}
+    gpu_memory_total_dict = {'MiB_gpu%d' % i: x['fb_memory_usage']['total'] for i, x in
+                             enumerate(nvsmi.DeviceQuery('memory.total')['gpu'])}
+    gpu_memory_frac_dict = {k: gpu_memory_free_dict[k] / gpu_memory_total_dict[k] for k in gpu_memory_total_dict}
+    for k, v in gpu_memory_frac_dict.items():
+        system[f'GPU_M/%s' % k] = v
+
+    return system
+
+
+def system_info_print():
+    try:
+        df = pd.DataFrame.from_dict(system_info(), orient='index')
+        # avoid slamming GPUs
+        time.sleep(1)
+        return df.to_markdown()
+    except Exception as e:
+        return "Error: %s" % str(e)