OpenSourceRonin committed 5c539b4 (parent: d1789cc)
Update app.py
app.py CHANGED

@@ -1,10 +1,6 @@
 import spaces
 import os
 import threading
-from collections import deque
-
-import plotly.graph_objs as go
-import pynvml
 
 import gradio as gr
 from huggingface_hub import snapshot_download
@@ -30,100 +26,6 @@ models = [
     },
 ]
 
-# Queues for storing historical data (saving the last 100 GPU utilization and memory usage values)
-gpu_util_history = deque(maxlen=100)
-mem_usage_history = deque(maxlen=100)
-
-
-def initialize_nvml():
-    """
-    Initialize NVML (NVIDIA Management Library).
-    """
-    pynvml.nvmlInit()
-
-
-def get_gpu_info():
-    """
-    Get GPU utilization and memory usage information.
-
-    Returns:
-        dict: A dictionary containing GPU utilization and memory usage information.
-    """
-    handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # Assuming a single GPU setup
-    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
-    memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
-
-    gpu_info = {
-        'gpu_util': utilization.gpu,
-        'mem_used': memory.used / 1024**2,  # Convert bytes to MiB
-        'mem_total': memory.total / 1024**2,  # Convert bytes to MiB
-        'mem_percent': (memory.used / memory.total) * 100
-    }
-    return gpu_info
-
-
-def _update_charts(chart_height: int = 200) -> go.Figure:
-    """
-    Update the GPU utilization and memory usage charts.
-
-    Args:
-        chart_height (int, optional): used to set the height of the chart. Defaults to 200.
-
-    Returns:
-        plotly.graph_objs.Figure: The updated figure containing the GPU and memory usage charts.
-    """
-    # obtain GPU information
-    gpu_info = get_gpu_info()
-
-    # records the latest GPU utilization and memory usage values
-    gpu_util = round(gpu_info.get('gpu_util', 0), 1)
-    mem_used = round(gpu_info.get('mem_used', 0) / 1024, 2)  # Convert MiB to GiB
-    gpu_util_history.append(gpu_util)
-    mem_usage_history.append(mem_used)
-
-    # create GPU utilization line chart
-    gpu_trace = go.Scatter(
-        y=list(gpu_util_history),
-        mode='lines+markers',
-        text=list(gpu_util_history),
-        line=dict(shape='spline', color='blue'),  # Make the line smooth and set color
-        yaxis='y1'  # Link to y-axis 1
-    )
-
-    # create memory usage line chart
-    mem_trace = go.Scatter(
-        y=list(mem_usage_history),
-        mode='lines+markers',
-        text=list(mem_usage_history),
-        line=dict(shape='spline', color='red'),  # Make the line smooth and set color
-        yaxis='y2'  # Link to y-axis 2
-    )
-
-    # set the layout of the chart
-    layout = go.Layout(
-        xaxis=dict(title=None, showticklabels=False, ticks=''),
-        yaxis=dict(
-            title='GPU Utilization (%)',
-            range=[-5, 110],
-            titlefont=dict(color='blue'),
-            tickfont=dict(color='blue'),
-        ),
-        yaxis2=dict(title='Memory Usage (GiB)',
-                    range=[0, max(24,
-                                  max(mem_usage_history) + 1)],
-                    titlefont=dict(color='red'),
-                    tickfont=dict(color='red'),
-                    overlaying='y',
-                    side='right'),
-        height=chart_height,  # set the height of the chart
-        margin=dict(l=10, r=10, t=0, b=0),  # set the margin of the chart
-        showlegend=False  # disable the legend
-    )
-
-    fig = go.Figure(data=[gpu_trace, mem_trace], layout=layout)
-    return fig
-
-
 def initialize_history():
     """
     Initializes the GPU utilization and memory usage history.
@@ -134,13 +36,6 @@ def initialize_history():
     mem_usage_history.append(round(gpu_info.get('mem_percent', 0), 1))
 
 
-def enable_gpu_info():
-    pynvml.nvmlInit()
-
-
-def disable_gpu_info():
-    pynvml.nvmlShutdown()
-
 model_choices = [f"{model['name']} ({model['bits']})" for model in models]
 display_to_model = {f"{model['name']} ({model['bits']})": model['name'] for model in models}
 
@@ -159,7 +54,8 @@ def download_models_in_background():
 download_thread = threading.Thread(target=download_models_in_background)
 download_thread.start()
 
-
+loaded_model = None
+loaded_model_name = None
 
 @spaces.GPU
 def respond(
@@ -173,12 +69,16 @@ def respond(
 ):
     model_name = display_to_model[selected_model_display_label]
 
+    global loaded_model
+    global loaded_model_name
+
     # Check if the model is already loaded
-    if model_name not
+    if model_name is not loaded_model_name:
         # Load and store the model in the cache
-
+        loaded_model = get_chat_loop_generator(model_name)
+        loaded_model_name = model_name
 
-    chat_completion =
+    chat_completion = loaded_model
 
     messages = [{"role": "system", "content": system_message}]
 
@@ -240,4 +140,3 @@ with gr.Blocks(fill_height=True) as demo:
 if __name__ == "__main__":
     share = os.getenv("SHARE_LINK", None) in ["1", "true", "True"]
     demo.launch(share=share)
-    # disable_gpu_info()
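
What replaces the dashboard plumbing is a single-slot model cache: module-level loaded_model / loaded_model_name globals, reloaded through get_chat_loop_generator only when respond() is asked for a different model. Below is a self-contained sketch of that pattern; load_generator is a hypothetical stand-in for the app's get_chat_loop_generator, and the sketch compares names with != rather than the commit's `is not`, since identity comparison of strings only works when the interpreter happens to intern them:

loaded_model = None
loaded_model_name = None


def load_generator(name):
    # Hypothetical loader: in the app this would build a chat-completion generator.
    return lambda messages: f"[{name}] reply to {messages[-1]['content']}"


def get_cached_generator(model_name):
    global loaded_model, loaded_model_name
    # Reload only when a different model is requested; `!=` is the robust
    # comparison here, as string identity is an interning accident.
    if model_name != loaded_model_name:
        loaded_model = load_generator(model_name)
        loaded_model_name = model_name
    return loaded_model


chat_completion = get_cached_generator("VPTQ demo model")
print(chat_completion([{"role": "user", "content": "hello"}]))

The trade-off of a single slot suits a ZeroGPU Space (note the @spaces.GPU decorator): switching models pays a full reload, but memory never holds two models at once.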