Tijmen2 committed on
Commit f4500f5
1 Parent(s): 0264b98

Update app.py

Files changed (1)
  app.py +17 -46
app.py CHANGED
@@ -2,40 +2,25 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import random
-import spaces
-import torch
 
-# Get the number of available CPU cores
-import multiprocessing
-n_cores = multiprocessing.cpu_count()
-
-# Initialize model with optimized parameters
+# Initialize model
 model_path = hf_hub_download(
     repo_id="AstroMLab/AstroSage-8B-GGUF",
     filename="AstroSage-8B-Q8_0.gguf"
 )
 
-# Optimized LLaMA parameters for A100
 llm = Llama(
     model_path=model_path,
-    n_ctx=2048,  # Keep context window reasonable
-    n_threads=n_cores,  # Use all available CPU cores
-    n_batch=512,  # Increase batch size for faster processing
-    n_gpu_layers=35,  # Offload more layers to GPU
+    n_ctx=2048,
+    n_threads=4,
     chat_format="llama-3",
     seed=42,
-    f16_kv=True,  # Use FP16 for key/value cache
+    f16_kv=True,
     logits_all=False,
-    use_mmap=False,  # Disable memory mapping for faster loading
-    use_gpu=True,
-    tensor_split=None,  # Let the model handle tensor splitting
+    use_mmap=True,
+    use_gpu=True
 )
 
-# Optimize CUDA settings if available
-if torch.cuda.is_available():
-    torch.backends.cuda.matmul.allow_tf32 = True  # Allow TF32 for faster matrix multiplication
-    torch.backends.cudnn.benchmark = True  # Enable cudnn autotuner
-
 # Placeholder responses for when context is empty
 GREETING_MESSAGES = [
     "Greetings! I am AstroSage, your guide to the cosmos. What would you like to explore today?",
@@ -50,26 +35,21 @@ def user(user_message, history):
         history = []
     return "", history + [{"role": "user", "content": user_message}]
 
-@spaces.GPU
 def bot(history):
-    """Generate and stream the bot's response with optimized parameters."""
+    """Generate and stream the bot's response."""
     if not history:
         history = []
-
-    # Optimize context by limiting history
-    max_history_tokens = 1024  # Reserve half of context for response
-    recent_history = history[-5:]  # Keep only last 5 messages for context
-
+
     # Prepare the messages for the model
     messages = [
         {
             "role": "system",
-            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and space science. Be concise and direct in your responses while maintaining accuracy."
+            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and space science. You provide accurate, scientific information while making complex concepts accessible. You're enthusiastic about space exploration and maintain a sense of wonder about the cosmos."
         }
     ]
 
-    # Add optimized chat history
-    for message in recent_history[:-1]:
+    # Add chat history
+    for message in history[:-1]:  # Exclude the last message which we just added
         messages.append({"role": message["role"], "content": message["content"]})
 
     # Add the current user message
@@ -78,18 +58,13 @@ def bot(history):
     # Start generating the response
     history.append({"role": "assistant", "content": ""})
 
-    # Optimized streaming parameters
+    # Stream the response
     response = llm.create_chat_completion(
         messages=messages,
         max_tokens=512,
         temperature=0.7,
         top_p=0.95,
-        stream=True,
-        top_k=40,  # Add top-k sampling
-        repeat_penalty=1.1,  # Slight penalty for repetition
-        mirostat_mode=2,  # Enable Mirostat sampling
-        mirostat_tau=5.0,
-        mirostat_eta=0.1,
+        stream=True
     )
 
     for chunk in response:
@@ -116,7 +91,7 @@ custom_css = """
 }
 """
 
-# Create the Gradio interface with optimized queue settings
+# Create the Gradio interface
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
     gr.Markdown(
         """
@@ -166,7 +141,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
         label="Example Questions"
     )
 
-    # Set up the message chain with optimized queuing
+    # Set up the message chain with streaming
     msg.submit(
         user,
         [msg, chatbot],
@@ -175,10 +150,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
     ).then(
         bot,
         chatbot,
-        chatbot,
-        queue=True,  # Enable queuing for bot responses
-        batch=True,  # Enable batching
-        max_batch_size=4  # Process up to 4 requests together
+        chatbot
     )
 
     # Clear button functionality
@@ -187,7 +159,6 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
     # Initial greeting
     demo.load(initial_greeting, None, chatbot, queue=False)
 
-# Launch the app with optimized settings
+# Launch the app
 if __name__ == "__main__":
-    #demo.queue(concurrency_count=2)  # Allow 2 concurrent requests
     demo.launch()
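
The body of the `for chunk in response:` loop falls outside the hunk context above. For reference, below is a minimal, self-contained sketch (not part of the commit) of how llama-cpp-python's OpenAI-style streaming chunks are typically consumed with the settings this commit keeps. The prompt string is illustrative, and only parameters visible in the diff are assumed.

    # Hypothetical standalone sketch: load the committed model and stream a reply.
    from huggingface_hub import hf_hub_download
    from llama_cpp import Llama

    model_path = hf_hub_download(
        repo_id="AstroMLab/AstroSage-8B-GGUF",
        filename="AstroSage-8B-Q8_0.gguf",
    )

    llm = Llama(
        model_path=model_path,
        n_ctx=2048,        # context window, as committed
        n_threads=4,       # CPU threads, as committed
        chat_format="llama-3",
        seed=42,
    )

    stream = llm.create_chat_completion(
        messages=[{"role": "user", "content": "What is a pulsar?"}],  # illustrative prompt
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
        stream=True,       # yields incremental OpenAI-style chunks
    )

    reply = ""
    for chunk in stream:
        # Each chunk carries a delta dict; role-only chunks have no "content" key.
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            reply += delta["content"]
            print(delta["content"], end="", flush=True)

Accumulating each delta into `reply` mirrors how `bot()` presumably grows the empty assistant message it appends to `history` before streaming begins.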