Update app.py
app.py CHANGED
@@ -2,40 +2,25 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import random
-import spaces
-import torch
 
-#
-import multiprocessing
-n_cores = multiprocessing.cpu_count()
-
-# Initialize model with optimized parameters
+# Initialize model
 model_path = hf_hub_download(
     repo_id="AstroMLab/AstroSage-8B-GGUF",
     filename="AstroSage-8B-Q8_0.gguf"
 )
 
-# Optimized LLaMA parameters for A100
 llm = Llama(
     model_path=model_path,
-    n_ctx=2048,
-    n_threads=n_cores,
-    n_batch=512, # Increase batch size for faster processing
-    n_gpu_layers=35, # Offload more layers to GPU
+    n_ctx=2048,
+    n_threads=4,
     chat_format="llama-3",
     seed=42,
-    f16_kv=True,
+    f16_kv=True,
     logits_all=False,
-    use_mmap=
-    use_gpu=True,
-    tensor_split=None, # Let the model handle tensor splitting
+    use_mmap=True,
+    use_gpu=True
 )
 
-# Optimize CUDA settings if available
-if torch.cuda.is_available():
-    torch.backends.cuda.matmul.allow_tf32 = True # Allow TF32 for faster matrix multiplication
-    torch.backends.cudnn.benchmark = True # Enable cudnn autotuner
-
 # Placeholder responses for when context is empty
 GREETING_MESSAGES = [
     "Greetings! I am AstroSage, your guide to the cosmos. What would you like to explore today?",
@@ -50,26 +35,21 @@ def user(user_message, history):
         history = []
     return "", history + [{"role": "user", "content": user_message}]
 
-@spaces.GPU
 def bot(history):
-    """Generate and stream the bot's response"""
+    """Generate and stream the bot's response."""
     if not history:
         history = []
-
-    # Optimize context by limiting history
-    max_history_tokens = 1024 # Reserve half of context for response
-    recent_history = history[-5:] # Keep only last 5 messages for context
-
+
     # Prepare the messages for the model
     messages = [
         {
             "role": "system",
-            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and space science."
+            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and space science. You provide accurate, scientific information while making complex concepts accessible. You're enthusiastic about space exploration and maintain a sense of wonder about the cosmos."
         }
     ]
 
-    # Add chat history
-    for message in recent_history[:-1]:
+    # Add chat history
+    for message in history[:-1]: # Exclude the last message which we just added
         messages.append({"role": message["role"], "content": message["content"]})
 
     # Add the current user message
@@ -78,18 +58,13 @@ def bot(history):
     # Start generating the response
     history.append({"role": "assistant", "content": ""})
 
-    #
+    # Stream the response
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
-        stream=True,
-        top_k=40, # Add top-k sampling
-        repeat_penalty=1.1, # Slight penalty for repetition
-        mirostat_mode=2, # Enable Mirostat sampling
-        mirostat_tau=5.0,
-        mirostat_eta=0.1,
+        stream=True
    )
 
    for chunk in response:
@@ -116,7 +91,7 @@ custom_css = """
 }
 """
 
-# Create the Gradio interface
+# Create the Gradio interface
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
     gr.Markdown(
         """
@@ -166,7 +141,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
         label="Example Questions"
     )
 
-    # Set up the message chain with
+    # Set up the message chain with streaming
     msg.submit(
         user,
         [msg, chatbot],
@@ -175,10 +150,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
     ).then(
         bot,
         chatbot,
-        chatbot,
-        queue=True, # Enable queuing for bot responses
-        batch=True, # Enable batching
-        max_batch_size=4 # Process up to 4 requests together
+        chatbot
     )
 
     # Clear button functionality
@@ -187,7 +159,6 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
     # Initial greeting
     demo.load(initial_greeting, None, chatbot, queue=False)
 
-# Launch the app
+# Launch the app
 if __name__ == "__main__":
-    #demo.queue(concurrency_count=2) # Allow 2 concurrent requests
     demo.launch()
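
For reference, a minimal sketch (not part of this commit) of how the body of the "for chunk in response:" loop typically consumes llama-cpp-python's streamed chat-completion chunks and fills the empty assistant message appended above. The actual loop body in app.py is not shown in this diff, and the delta handling below is an assumption based on the library's OpenAI-style streaming format.

# Sketch only: assumes each streamed chunk exposes choices[0]["delta"],
# which may carry a "content" fragment of the assistant reply.
for chunk in response:
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        history[-1]["content"] += delta["content"]  # grow the assistant message in place
        yield history  # each yield pushes the partial reply to the Chatbot

Because bot yields successive history states, the plain msg.submit(...).then(bot, chatbot, chatbot) chain kept by this commit is enough for streaming under Gradio's default queueing; the queue/batch keyword arguments removed here are not required for that.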