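"""Gradio demo: chat with Qwen2.5-0.5B-Instruct running on the HyperSpeed
CPU inference engine, with optional token streaming."""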
import gradio as gr
import numpy as np
import os
from huggingface_hub import hf_hub_download

from Hyperspeed import HyperSpeedEngine, convert_model
# Config
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
HYPER_PATH = "qwen.hyper"
def setup_model():
    """Download and convert the model, reusing a previously converted file if present."""
    if not os.path.exists(HYPER_PATH):
        print("Downloading model...")
        model_file = hf_hub_download(
            repo_id=MODEL_NAME,
            filename="model.safetensors",
            cache_dir="./cache"
        )
        print("Converting to HyperSpeed...")
        convert_model(model_file, HYPER_PATH)
    print("Loading engine...")
    engine = HyperSpeedEngine(HYPER_PATH, verbose=True)
    engine.load_tokenizer(MODEL_NAME)
    return engine
# Initialize
print("πŸš€ Starting HyperSpeed...")
engine = setup_model()
print("βœ“ Ready!")
def chat_respond(message, history, temperature, max_tokens, stream_enabled):
    """Generate a chatbot reply, optionally streaming tokens as they arrive."""
    # Flatten the chat history into a plain-text prompt.
    # Note: Qwen2.5-Instruct normally uses a chat template; this plain
    # User:/Assistant: format is a deliberate simplification.
    prompt = ""
    for user_msg, assistant_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    if stream_enabled:
        # Streaming: yield the updated history after each token so the
        # Chatbot component re-renders in real time. A Chatbot output
        # expects the full history, not a bare string.
        partial = ""
        for token in engine.generate(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=True
        ):
            partial += token
            yield history + [(message, partial)]
    else:
        # Non-streaming: generate the full response in one call.
        response = engine.generate(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=False
        )
        # Keep only the text after the final "Assistant:" marker.
        response = response.split("Assistant:")[-1].strip()
        yield history + [(message, response)]
def get_model_info():
    """Build a Markdown summary of the loaded model."""
    info = f"""### πŸš€ HyperSpeed Model Info

**Model:** {MODEL_NAME}
**Format:** HyperSpeed (.hyper)
**Total Tensors:** {len(engine.list_weights())}

**Features:**
- ⚑ Adaptive quantization (2/4/8-bit)
- 🎯 Cache-optimized layouts
- πŸ“¦ 2-4x compression
- 🌊 Token streaming support

**Metadata:**
```json
{engine.metadata}
```
"""
    return info
# Create Gradio interface
with gr.Blocks(title="HyperSpeed Chatbot", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# πŸš€ HyperSpeed Chat - Qwen2.5-0.5B

Ultra-fast CPU inference with adaptive quantization and token streaming!

**Yes, HyperSpeed streams tokens!** ⚑ Enable streaming below to see tokens appear in real time.
""")

    with gr.Tab("πŸ’¬ Chat"):
        chatbot = gr.Chatbot(
            label="Conversation",
            height=500,
            show_copy_button=True
        )
        msg = gr.Textbox(
            label="Message",
            placeholder="Type your message here...",
            lines=2
        )
        with gr.Row():
            submit_btn = gr.Button("Send", variant="primary")
            clear_btn = gr.Button("Clear")
        with gr.Accordion("βš™οΈ Settings", open=False):
            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher = more creative"
            )
            max_tokens = gr.Slider(
                minimum=10,
                maximum=500,
                value=150,
                step=10,
                label="Max Tokens",
                info="Maximum length of response"
            )
            stream_toggle = gr.Checkbox(
                label="Enable Streaming 🌊",
                value=True,
                info="Stream tokens in real-time"
            )
    # Chat functionality: both submit paths stream into the chatbot,
    # then clear the message box once generation finishes.
    msg.submit(
        chat_respond,
        inputs=[msg, chatbot, temperature, max_tokens, stream_toggle],
        outputs=chatbot
    ).then(
        lambda: "",
        outputs=msg
    )
    submit_btn.click(
        chat_respond,
        inputs=[msg, chatbot, temperature, max_tokens, stream_toggle],
        outputs=chatbot
    ).then(
        lambda: "",
        outputs=msg
    )
    clear_btn.click(lambda: None, None, chatbot, queue=False)
    with gr.Tab("ℹ️ Model Info"):
        info_btn = gr.Button("Show Model Info")
        info_output = gr.Markdown()
        info_btn.click(get_model_info, outputs=info_output)
    with gr.Tab("πŸ” Weight Inspector"):
        gr.Markdown("### Explore model weights")
        weight_dropdown = gr.Dropdown(
            choices=engine.list_weights()[:50],  # Show first 50 tensors
            label="Select weight tensor"
        )
        inspect_btn = gr.Button("Inspect Weight")
        weight_info = gr.Markdown()

        def inspect_weight(name):
            """Return Markdown statistics for the selected weight tensor."""
            if not name:
                return "Please select a weight"
            w = engine.get_weight(name)
            info = f"""## Weight: `{name}`

**Shape:** {w.shape}
**Size:** {w.size:,} parameters
**Dtype:** {w.dtype}

**Statistics:**
- Mean: {w.mean():.6f}
- Std: {w.std():.6f}
- Min: {w.min():.6f}
- Max: {w.max():.6f}
- Median: {np.median(w):.6f}

**Sample values (first 10):**
```
{', '.join(f'{v:.4f}' for v in w.flatten()[:10])}
```
"""
            return info

        inspect_btn.click(
            inspect_weight,
            inputs=weight_dropdown,
            outputs=weight_info
        )
    gr.Markdown("""
---
### 🎯 About HyperSpeed
HyperSpeed is a new LLM format optimized for CPU inference:
- **Adaptive Quantization**: 2/4/8-bit based on weight importance
- **Cache-Optimized**: Better CPU cache utilization
- **Fast Loading**: Lazy dequantization on-demand
- **Token Streaming**: Real-time response generation ⚑
- **Compression**: 2-4x smaller than original models
[GitHub](https://github.com/Smilyai-labs/Hyperspeed) | Install: `pip install git+https://github.com/Smilyai-labs/Hyperspeed.git`
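
**Quick start** (a minimal sketch mirroring the calls this demo makes; exact signatures may differ):

```python
from Hyperspeed import HyperSpeedEngine, convert_model

convert_model("model.safetensors", "model.hyper")  # one-time conversion
engine = HyperSpeedEngine("model.hyper")
engine.load_tokenizer("Qwen/Qwen2.5-0.5B-Instruct")
for token in engine.generate("Hello!", max_tokens=50, stream=True):
    print(token, end="", flush=True)
```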
""")
if __name__ == "__main__":
    demo.queue()  # Queue is required for streaming generator outputs
    demo.launch()