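"""Gradio demo: chat with Qwen2.5-0.5B-Instruct running on the HyperSpeed
CPU inference engine, with optional token streaming."""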
import gradio as gr
import numpy as np
import os
from huggingface_hub import hf_hub_download

from Hyperspeed import HyperSpeedEngine, convert_model
# Config
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
HYPER_PATH = "qwen.hyper"
def setup_model():
    """Download and convert the model, reusing a previously converted file if present."""
    if not os.path.exists(HYPER_PATH):
        print("Downloading model...")
        model_file = hf_hub_download(
            repo_id=MODEL_NAME,
            filename="model.safetensors",
            cache_dir="./cache"
        )
        print("Converting to HyperSpeed...")
        convert_model(model_file, HYPER_PATH)
    print("Loading engine...")
    engine = HyperSpeedEngine(HYPER_PATH, verbose=True)
    engine.load_tokenizer(MODEL_NAME)
    return engine
# Initialize
print("πŸš€ Starting HyperSpeed...")
engine = setup_model()
print("βœ“ Ready!")
def chat_respond(message, history, temperature, max_tokens, stream_enabled):
    """Generate a chatbot reply, optionally streaming tokens as they arrive."""
    # Flatten the chat history into a plain-text prompt.
    # Note: Qwen2.5-Instruct normally uses a chat template; this plain
    # User:/Assistant: format is a deliberate simplification.
    prompt = ""
    for user_msg, assistant_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    if stream_enabled:
        # Streaming: yield the updated history after each token so the
        # Chatbot component re-renders in real time. A Chatbot output
        # expects the full history, not a bare string.
        partial = ""
        for token in engine.generate(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=True
        ):
            partial += token
            yield history + [(message, partial)]
    else:
        # Non-streaming: generate the full response in one call.
        response = engine.generate(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=False
        )
        # Keep only the text after the final "Assistant:" marker.
        response = response.split("Assistant:")[-1].strip()
        yield history + [(message, response)]
def get_model_info():
    """Build a Markdown summary of the loaded model."""
    info = f"""### πŸš€ HyperSpeed Model Info

**Model:** {MODEL_NAME}
**Format:** HyperSpeed (.hyper)
**Total Tensors:** {len(engine.list_weights())}

**Features:**
- ⚑ Adaptive quantization (2/4/8-bit)
- 🎯 Cache-optimized layouts
- πŸ“¦ 2-4x compression
- 🌊 Token streaming support

**Metadata:**
```json
{engine.metadata}
```
"""
    return info
# Create Gradio interface
with gr.Blocks(title="HyperSpeed Chatbot", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# πŸš€ HyperSpeed Chat - Qwen2.5-0.5B

Ultra-fast CPU inference with adaptive quantization and token streaming!

**Yes, HyperSpeed streams tokens!** ⚑ Enable streaming below to see tokens appear in real time.
""")

    with gr.Tab("πŸ’¬ Chat"):
        chatbot = gr.Chatbot(
            label="Conversation",
            height=500,
            show_copy_button=True
        )
        msg = gr.Textbox(
            label="Message",
            placeholder="Type your message here...",
            lines=2
        )
        with gr.Row():
            submit_btn = gr.Button("Send", variant="primary")
            clear_btn = gr.Button("Clear")
        with gr.Accordion("βš™οΈ Settings", open=False):
            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher = more creative"
            )
            max_tokens = gr.Slider(
                minimum=10,
                maximum=500,
                value=150,
                step=10,
                label="Max Tokens",
                info="Maximum length of response"
            )
            stream_toggle = gr.Checkbox(
                label="Enable Streaming 🌊",
                value=True,
                info="Stream tokens in real-time"
            )
    # Chat functionality: both submit paths stream into the chatbot,
    # then clear the message box once generation finishes.
    msg.submit(
        chat_respond,
        inputs=[msg, chatbot, temperature, max_tokens, stream_toggle],
        outputs=chatbot
    ).then(
        lambda: "",
        outputs=msg
    )
    submit_btn.click(
        chat_respond,
        inputs=[msg, chatbot, temperature, max_tokens, stream_toggle],
        outputs=chatbot
    ).then(
        lambda: "",
        outputs=msg
    )
    clear_btn.click(lambda: None, None, chatbot, queue=False)
    with gr.Tab("ℹ️ Model Info"):
        info_btn = gr.Button("Show Model Info")
        info_output = gr.Markdown()
        info_btn.click(get_model_info, outputs=info_output)
    with gr.Tab("πŸ” Weight Inspector"):
        gr.Markdown("### Explore model weights")
        weight_dropdown = gr.Dropdown(
            choices=engine.list_weights()[:50],  # Show first 50 tensors
            label="Select weight tensor"
        )
        inspect_btn = gr.Button("Inspect Weight")
        weight_info = gr.Markdown()

        def inspect_weight(name):
            """Return Markdown statistics for the selected weight tensor."""
            if not name:
                return "Please select a weight"
            w = engine.get_weight(name)
            info = f"""## Weight: `{name}`

**Shape:** {w.shape}
**Size:** {w.size:,} parameters
**Dtype:** {w.dtype}

**Statistics:**
- Mean: {w.mean():.6f}
- Std: {w.std():.6f}
- Min: {w.min():.6f}
- Max: {w.max():.6f}
- Median: {np.median(w):.6f}

**Sample values (first 10):**
```
{', '.join(f'{v:.4f}' for v in w.flatten()[:10])}
```
"""
            return info

        inspect_btn.click(
            inspect_weight,
            inputs=weight_dropdown,
            outputs=weight_info
        )
    gr.Markdown("""
---
### 🎯 About HyperSpeed
HyperSpeed is a new LLM format optimized for CPU inference:
- **Adaptive Quantization**: 2/4/8-bit based on weight importance
- **Cache-Optimized**: Better CPU cache utilization
- **Fast Loading**: Lazy dequantization on-demand
- **Token Streaming**: Real-time response generation ⚑
- **Compression**: 2-4x smaller than original models
[GitHub](https://github.com/Smilyai-labs/Hyperspeed) | Install: `pip install git+https://github.com/Smilyai-labs/Hyperspeed.git`
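
**Quick start** (a minimal sketch mirroring the calls this demo makes; exact signatures may differ):

```python
from Hyperspeed import HyperSpeedEngine, convert_model

convert_model("model.safetensors", "model.hyper")  # one-time conversion
engine = HyperSpeedEngine("model.hyper")
engine.load_tokenizer("Qwen/Qwen2.5-0.5B-Instruct")
for token in engine.generate("Hello!", max_tokens=50, stream=True):
    print(token, end="", flush=True)
```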
""")
if __name__ == "__main__":
    demo.queue()  # Queue is required for streaming generator outputs
    demo.launch()