Spaces:
Runtime error
Runtime error
pseudotensor
committed on
Commit
·
5cf48e0
1
Parent(s):
efe0924
More restrictions for HF spaces to stabilize against GPU OOM and hide unusable options
Browse files
- app.py +71 -17
- requirements.txt +4 -0
- utils.py +50 -0
app.py
CHANGED
@@ -5,7 +5,7 @@ import os
|
|
5 |
import traceback
|
6 |
import typing
|
7 |
|
8 |
-
from utils import set_seed, flatten_list, clear_torch_cache
|
9 |
|
10 |
SEED = 1236
|
11 |
set_seed(SEED)
|
@@ -210,6 +210,16 @@ def main(
|
|
210 |
traceback.print_exc()
|
211 |
score = 0.0
|
212 |
clear_torch_cache()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
213 |
print("SCORE %s: %s" % (exi, score), flush=True)
|
214 |
score_dump.append(ex + [prompt, res, score])
|
215 |
# dump every score in case abort
|
@@ -515,7 +525,7 @@ def go_gradio(**kwargs):
|
|
515 |
if os.environ.get("HUGGINGFACE_SPACES"):
|
516 |
description += """<p><b> DISCLAIMERS: </b><ul><i><li>The data used to train this model include The Pile and other sources. These may contain objectionable content, so the model may reproduce that material. Use application and responses at own risk.</i></li>"""
|
517 |
if kwargs['load_8bit']:
|
518 |
-
description += """<i><li> Model is loaded in 8-bit and
|
519 |
description += """<i><li>Model loading and unloading disabled on HF SPACES to avoid GPU OOM for multi-user environment.</i></li></ul></p>"""
|
520 |
|
521 |
if kwargs['verbose']:
|
@@ -668,7 +678,8 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
|
|
668 |
stream_output = gr.components.Checkbox(label="Stream output",
|
669 |
value=kwargs['stream_output'])
|
670 |
prompt_type = gr.Dropdown(prompt_types_strings,
|
671 |
-
value=kwargs['prompt_type'], label="Prompt Type"
|
|
|
672 |
temperature = gr.Slider(minimum=0, maximum=3,
|
673 |
value=kwargs['temperature'],
|
674 |
label="Temperature",
|
@@ -681,35 +692,43 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
|
|
681 |
value=kwargs['top_k'], label="Top k",
|
682 |
info='Num. tokens to sample from'
|
683 |
)
|
684 |
-
|
685 |
-
|
686 |
-
|
|
|
|
|
|
|
687 |
max_new_tokens = gr.Slider(
|
688 |
-
minimum=1, maximum=
|
689 |
-
value=kwargs['max_new_tokens'], label="Max output length"
|
690 |
)
|
691 |
min_new_tokens = gr.Slider(
|
692 |
-
minimum=0, maximum=
|
693 |
-
value=kwargs['min_new_tokens'], label="Min output length"
|
694 |
)
|
695 |
early_stopping = gr.Checkbox(label="EarlyStopping", info="Stop early in beam search",
|
696 |
value=kwargs['early_stopping'])
|
697 |
-
|
698 |
-
|
|
|
699 |
info="Max. time to search optimal output.")
|
700 |
repetition_penalty = gr.Slider(minimum=0.01, maximum=3.0,
|
701 |
value=kwargs['repetition_penalty'],
|
702 |
label="Repetition Penalty")
|
703 |
num_return_sequences = gr.Slider(minimum=1, maximum=10, step=1,
|
704 |
value=kwargs['num_return_sequences'],
|
705 |
-
label="Number Returns", info="Must be <= num_beams"
|
|
|
706 |
do_sample = gr.Checkbox(label="Sample", info="Sample, for diverse output(s)",
|
707 |
value=kwargs['do_sample'])
|
708 |
if kwargs['chat']:
|
709 |
iinput = gr.Textbox(lines=4, label="Input",
|
710 |
-
placeholder=kwargs['placeholder_input']
|
|
|
|
|
711 |
context = gr.Textbox(lines=1, label="Context",
|
712 |
-
info="Ignored in chat mode."
|
|
|
713 |
|
714 |
with gr.TabItem("Models"):
|
715 |
with gr.Row():
|
@@ -731,6 +750,12 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
|
|
731 |
with gr.Column(scale=1):
|
732 |
add_model_button = gr.Button("Add new model name")
|
733 |
add_lora_button = gr.Button("Add new LORA name", visible=kwargs['show_lora'])
|
|
|
|
|
|
|
|
|
|
|
|
|
734 |
|
735 |
inputs_list = get_inputs_list(locals(), kwargs['model_lower'])
|
736 |
from functools import partial
|
@@ -801,6 +826,14 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
|
|
801 |
traceback.print_exc()
|
802 |
clear_torch_cache()
|
803 |
return 'Response Score: GPU OOM'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
804 |
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
|
805 |
return 'Response Score: {:.1%}'.format(score)
|
806 |
else:
|
@@ -987,7 +1020,10 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
|
|
987 |
prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
|
988 |
chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
|
989 |
if not os.environ.get("HUGGINGFACE_SPACES"):
|
990 |
-
load_model_event = load_model_button.click(**load_model_args)
|
|
|
|
|
|
|
991 |
|
992 |
def dropdown_model_list(list0, x):
|
993 |
new_state = [list0[0] + [x]]
|
@@ -1015,6 +1051,12 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
|
|
1015 |
callback.setup(inputs_list + [text_output], "flagged_data_points")
|
1016 |
flag_btn.click(lambda *args: callback.flag(args), inputs_list + [text_output], None, preprocess=False,
|
1017 |
api_name='flag')
|
|
|
|
|
|
|
|
|
|
|
|
|
1018 |
if kwargs['chat']:
|
1019 |
|
1020 |
# don't pass text_output, don't want to clear output, just stop it
|
@@ -1275,13 +1317,25 @@ def evaluate(
|
|
1275 |
try:
|
1276 |
model.generate(**kwargs)
|
1277 |
except torch.cuda.OutOfMemoryError as e:
|
1278 |
-
print("GPU OOM: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
|
|
|
1279 |
if kwargs['input_ids'] is not None:
|
1280 |
kwargs['input_ids'].cpu()
|
1281 |
kwargs['input_ids'] = None
|
1282 |
traceback.print_exc()
|
1283 |
clear_torch_cache()
|
1284 |
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1285 |
|
1286 |
for output in CallbackToGenerator(generate, callback=None, **gen_kwargs):
|
1287 |
decoded_output = decoder(output)
|
|
|
5 |
import traceback
|
6 |
import typing
|
7 |
|
8 |
+
from utils import set_seed, flatten_list, clear_torch_cache, system_info_print
|
9 |
|
10 |
SEED = 1236
|
11 |
set_seed(SEED)
|
|
|
210 |
traceback.print_exc()
|
211 |
score = 0.0
|
212 |
clear_torch_cache()
|
213 |
+
except RuntimeError as e:
|
214 |
+
if 'Expected all tensors to be on the same device' in str(
|
215 |
+
e) or 'expected scalar type Half but found Float' in str(e):
|
216 |
+
print("GPU error: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
|
217 |
+
flush=True)
|
218 |
+
traceback.print_exc()
|
219 |
+
score = 0.0
|
220 |
+
clear_torch_cache()
|
221 |
+
else:
|
222 |
+
raise
|
223 |
print("SCORE %s: %s" % (exi, score), flush=True)
|
224 |
score_dump.append(ex + [prompt, res, score])
|
225 |
# dump every score in case abort
|
|
|
525 |
if os.environ.get("HUGGINGFACE_SPACES"):
|
526 |
description += """<p><b> DISCLAIMERS: </b><ul><i><li>The data used to train this model include The Pile and other sources. These may contain objectionable content, so the model may reproduce that material. Use application and responses at own risk.</i></li>"""
|
527 |
if kwargs['load_8bit']:
|
528 |
+
description += """<i><li> Model is loaded in 8-bit and HF spaces version has other limitations in order to fit on HF GPUs, so UX can be worse than native app.</i></li>"""
|
529 |
description += """<i><li>Model loading and unloading disabled on HF SPACES to avoid GPU OOM for multi-user environment.</i></li></ul></p>"""
|
530 |
|
531 |
if kwargs['verbose']:
|
|
|
678 |
stream_output = gr.components.Checkbox(label="Stream output",
|
679 |
value=kwargs['stream_output'])
|
680 |
prompt_type = gr.Dropdown(prompt_types_strings,
|
681 |
+
value=kwargs['prompt_type'], label="Prompt Type",
|
682 |
+
visible=not os.environ.get("HUGGINGFACE_SPACES"))
|
683 |
temperature = gr.Slider(minimum=0, maximum=3,
|
684 |
value=kwargs['temperature'],
|
685 |
label="Temperature",
|
|
|
692 |
value=kwargs['top_k'], label="Top k",
|
693 |
info='Num. tokens to sample from'
|
694 |
)
|
695 |
+
max_beams = 8 if not os.environ.get("HUGGINGFACE_SPACES") else 2
|
696 |
+
num_beams = gr.Slider(minimum=1, maximum=max_beams, step=1,
|
697 |
+
value=min(max_beams, kwargs['num_beams']), label="Beams",
|
698 |
+
info="Number of searches for optimal overall probability. "
|
699 |
+
"Uses more GPU memory/compute")
|
700 |
+
max_max_new_tokens = 2048 if not os.environ.get("HUGGINGFACE_SPACES") else kwargs['max_new_tokens']
|
701 |
max_new_tokens = gr.Slider(
|
702 |
+
minimum=1, maximum=max_max_new_tokens, step=1,
|
703 |
+
value=min(max_max_new_tokens, kwargs['max_new_tokens']), label="Max output length",
|
704 |
)
|
705 |
min_new_tokens = gr.Slider(
|
706 |
+
minimum=0, maximum=max_max_new_tokens, step=1,
|
707 |
+
value=min(max_max_new_tokens, kwargs['min_new_tokens']), label="Min output length",
|
708 |
)
|
709 |
early_stopping = gr.Checkbox(label="EarlyStopping", info="Stop early in beam search",
|
710 |
value=kwargs['early_stopping'])
|
711 |
+
max_max_time = 60 * 5 if not os.environ.get("HUGGINGFACE_SPACES") else 60
|
712 |
+
max_time = gr.Slider(minimum=0, maximum=max_max_time, step=1,
|
713 |
+
value=min(max_max_time, kwargs['max_time']), label="Max. time",
|
714 |
info="Max. time to search optimal output.")
|
715 |
repetition_penalty = gr.Slider(minimum=0.01, maximum=3.0,
|
716 |
value=kwargs['repetition_penalty'],
|
717 |
label="Repetition Penalty")
|
718 |
num_return_sequences = gr.Slider(minimum=1, maximum=10, step=1,
|
719 |
value=kwargs['num_return_sequences'],
|
720 |
+
label="Number Returns", info="Must be <= num_beams",
|
721 |
+
visible=not os.environ.get("HUGGINGFACE_SPACES"))
|
722 |
do_sample = gr.Checkbox(label="Sample", info="Sample, for diverse output(s)",
|
723 |
value=kwargs['do_sample'])
|
724 |
if kwargs['chat']:
|
725 |
iinput = gr.Textbox(lines=4, label="Input",
|
726 |
+
placeholder=kwargs['placeholder_input'],
|
727 |
+
visible=not os.environ.get("HUGGINGFACE_SPACES"))
|
728 |
+
# nominally empty for chat mode
|
729 |
context = gr.Textbox(lines=1, label="Context",
|
730 |
+
info="Ignored in chat mode.",
|
731 |
+
visible=not os.environ.get("HUGGINGFACE_SPACES"))
|
732 |
|
733 |
with gr.TabItem("Models"):
|
734 |
with gr.Row():
|
|
|
750 |
with gr.Column(scale=1):
|
751 |
add_model_button = gr.Button("Add new model name")
|
752 |
add_lora_button = gr.Button("Add new LORA name", visible=kwargs['show_lora'])
|
753 |
+
with gr.TabItem("System"):
|
754 |
+
with gr.Row():
|
755 |
+
with gr.Column():
|
756 |
+
system_text = gr.Textbox(label='System Info')
|
757 |
+
system_btn = gr.Button(value='Get System Info')
|
758 |
+
|
759 |
|
760 |
inputs_list = get_inputs_list(locals(), kwargs['model_lower'])
|
761 |
from functools import partial
|
|
|
826 |
traceback.print_exc()
|
827 |
clear_torch_cache()
|
828 |
return 'Response Score: GPU OOM'
|
829 |
+
except RuntimeError as e:
|
830 |
+
if 'Expected all tensors to be on the same device' in str(e) or 'expected scalar type Half but found Float' in str(e):
|
831 |
+
print("GPU Error: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
|
832 |
+
traceback.print_exc()
|
833 |
+
clear_torch_cache()
|
834 |
+
return 'Response Score: GPU Error'
|
835 |
+
else:
|
836 |
+
raise
|
837 |
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
|
838 |
return 'Response Score: {:.1%}'.format(score)
|
839 |
else:
|
|
|
1020 |
prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
|
1021 |
chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
|
1022 |
if not os.environ.get("HUGGINGFACE_SPACES"):
|
1023 |
+
load_model_event = load_model_button.click(**load_model_args) \
|
1024 |
+
.then(**prompt_update_args) \
|
1025 |
+
.then(**chatbot_update_args) \
|
1026 |
+
.then(clear_torch_cache)
|
1027 |
|
1028 |
def dropdown_model_list(list0, x):
|
1029 |
new_state = [list0[0] + [x]]
|
|
|
1051 |
callback.setup(inputs_list + [text_output], "flagged_data_points")
|
1052 |
flag_btn.click(lambda *args: callback.flag(args), inputs_list + [text_output], None, preprocess=False,
|
1053 |
api_name='flag')
|
1054 |
+
|
1055 |
+
def get_system_info():
|
1056 |
+
return gr.Textbox.update(value=system_info_print())
|
1057 |
+
|
1058 |
+
system_event = system_btn.click(get_system_info, outputs=system_text, api_name='system_info')
|
1059 |
+
|
1060 |
if kwargs['chat']:
|
1061 |
|
1062 |
# don't pass text_output, don't want to clear output, just stop it
|
|
|
1317 |
try:
|
1318 |
model.generate(**kwargs)
|
1319 |
except torch.cuda.OutOfMemoryError as e:
|
1320 |
+
print("GPU OOM: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
|
1321 |
+
flush=True)
|
1322 |
if kwargs['input_ids'] is not None:
|
1323 |
kwargs['input_ids'].cpu()
|
1324 |
kwargs['input_ids'] = None
|
1325 |
traceback.print_exc()
|
1326 |
clear_torch_cache()
|
1327 |
return
|
1328 |
+
except RuntimeError as e:
|
1329 |
+
if 'Expected all tensors to be on the same device' in str(
|
1330 |
+
e) or 'expected scalar type Half but found Float' in str(e):
|
1331 |
+
print(
|
1332 |
+
"GPU Error: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
|
1333 |
+
flush=True)
|
1334 |
+
traceback.print_exc()
|
1335 |
+
clear_torch_cache()
|
1336 |
+
return
|
1337 |
+
else:
|
1338 |
+
raise
|
1339 |
|
1340 |
for output in CallbackToGenerator(generate, callback=None, **gen_kwargs):
|
1341 |
decoded_output = decoder(output)
|
requirements.txt
CHANGED
@@ -23,6 +23,10 @@ git+https://github.com/huggingface/peft.git@098962fa6515f2e4fe83a757f5995d3ffbb1
|
|
23 |
transformers==4.28.1
|
24 |
tokenizers==0.13.3
|
25 |
|
|
|
|
|
|
|
|
|
26 |
# optional for finetune
|
27 |
tensorboard==2.12.1
|
28 |
neptune==1.1.1
|
|
|
23 |
transformers==4.28.1
|
24 |
tokenizers==0.13.3
|
25 |
|
26 |
+
# optional for generate
|
27 |
+
pynvml==11.5.0
|
28 |
+
psutil==5.9.4
|
29 |
+
|
30 |
# optional for finetune
|
31 |
tensorboard==2.12.1
|
32 |
neptune==1.1.1
|
utils.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1 |
import os
|
2 |
import gc
|
3 |
import random
|
|
|
4 |
import numpy as np
|
|
|
5 |
import torch
|
6 |
|
7 |
|
@@ -37,3 +39,51 @@ def clear_torch_cache():
|
|
37 |
torch.cuda.empty_cache()
|
38 |
torch.cuda.ipc_collect()
|
39 |
gc.collect()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import gc
|
3 |
import random
|
4 |
+
import time
|
5 |
import numpy as np
|
6 |
+
import pandas as pd
|
7 |
import torch
|
8 |
|
9 |
|
|
|
39 |
torch.cuda.empty_cache()
|
40 |
torch.cuda.ipc_collect()
|
41 |
gc.collect()
|
42 |
+
|
43 |
+
|
44 |
+
def system_info():
    """Collect a flat snapshot of CPU and NVIDIA GPU readings.

    Returns:
        dict: Metric name -> reading, inserted in this order:

        * ``CPU_C/<label>``    - current temperature of each ``coretemp``
          sensor (Celsius, since ``fahrenheit=False`` is requested); present
          only when psutil exposes temperature sensors on this platform.
        * ``GPU_W/W_gpu<i>``   - ``power_draw`` reported by NVML for GPU ``i``.
        * ``GPU_C/C_gpu<i>``   - ``gpu_temp`` reported by NVML for GPU ``i``.
        * ``GPU_M/MiB_gpu<i>`` - fraction of framebuffer memory that is
          *free* (free / total) on GPU ``i``; a unitless ratio despite the
          ``MiB`` in the key.

    Raises:
        ImportError: If ``psutil`` or ``pynvml`` is not installed.
        Exception: Errors raised by the NVML queries are propagated to the
            caller unchanged.
    """
    import psutil

    system = {}
    # https://stackoverflow.com/questions/48951136/plot-multiple-graphs-in-one-plot-using-tensorboard
    # https://arshren.medium.com/monitoring-your-devices-in-python-5191d672f749
    # psutil only defines sensors_temperatures() on some platforms; skip the
    # CPU section instead of failing the whole report when it is missing.
    read_temps = getattr(psutil, 'sensors_temperatures', None)
    temps = read_temps(fahrenheit=False) if read_temps is not None else {}
    if 'coretemp' in temps:
        for sensor in temps['coretemp']:
            system['CPU_C/%s' % sensor.label] = sensor.current

    # https://github.com/gpuopenanalytics/pynvml/blob/master/help_query_gpu.txt
    from pynvml.smi import nvidia_smi
    nvsmi = nvidia_smi.getInstance()

    for i, gpu in enumerate(nvsmi.DeviceQuery('power.draw')['gpu']):
        system['GPU_W/W_gpu%d' % i] = gpu['power_readings']['power_draw']

    for i, gpu in enumerate(nvsmi.DeviceQuery('temperature.gpu')['gpu']):
        system['GPU_C/C_gpu%d' % i] = gpu['temperature']['gpu_temp']

    free_by_gpu = [gpu['fb_memory_usage']['free']
                   for gpu in nvsmi.DeviceQuery('memory.free')['gpu']]
    total_by_gpu = [gpu['fb_memory_usage']['total']
                    for gpu in nvsmi.DeviceQuery('memory.total')['gpu']]
    for i, total in enumerate(total_by_gpu):
        # Reported value is the free fraction, not the used fraction.
        system['GPU_M/MiB_gpu%d' % i] = free_by_gpu[i] / total

    return system
|
80 |
+
|
81 |
+
|
82 |
+
def system_info_print():
    """Render the ``system_info()`` snapshot as a text table.

    Returns:
        str: A Markdown table of the metrics. If the optional ``tabulate``
        package needed by ``DataFrame.to_markdown`` is not installed, a
        plain-text table from ``DataFrame.to_string`` is returned instead.
        Any other failure is reported as ``"Error: <message>"`` rather than
        raised.
    """
    try:
        df = pd.DataFrame.from_dict(system_info(), orient='index')
        # avoid slamming GPUs
        time.sleep(1)
        try:
            return df.to_markdown()
        except ImportError:
            # to_markdown() relies on the optional 'tabulate' dependency;
            # degrade to a plain table instead of showing only an error.
            return df.to_string()
    except Exception as e:
        return "Error: %s" % str(e)
|