import os

import gradio as gr
import numpy as np
from huggingface_hub import hf_hub_download
from Hyperspeed import HyperSpeedEngine, convert_model

# Config
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
HYPER_PATH = "qwen.hyper"
def setup_model():
    """Download the model weights and convert them to HyperSpeed format."""
    if not os.path.exists(HYPER_PATH):
        print("Downloading model...")
        model_file = hf_hub_download(
            repo_id=MODEL_NAME,
            filename="model.safetensors",
            cache_dir="./cache",
        )
        print("Converting to HyperSpeed...")
        convert_model(model_file, HYPER_PATH)
    print("Loading engine...")
    engine = HyperSpeedEngine(HYPER_PATH, verbose=True)
    engine.load_tokenizer(MODEL_NAME)
    return engine

# Initialize
print("🚀 Starting HyperSpeed...")
engine = setup_model()
print("✅ Ready!")
def chat_respond(message, history, temperature, max_tokens, stream_enabled):
    """Chatbot response with streaming support."""
    # Format the chat history as a plain-text prompt for Qwen
    prompt = ""
    for user_msg, assistant_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    # A gr.Chatbot output expects the full (user, assistant) history, not a
    # bare string, so append the new turn and update it in place as we go.
    history = history + [(message, "")]

    if stream_enabled:
        # Streaming response: re-yield the growing history after each token
        partial = ""
        for token in engine.generate(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=True,
        ):
            partial += token
            history[-1] = (message, partial)
            yield history
    else:
        # Non-streaming response
        response = engine.generate(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=False,
        )
        # Keep only the assistant's new reply
        response = response.split("Assistant:")[-1].strip()
        history[-1] = (message, response)
        yield history
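# Note: Qwen2.5-Instruct models are normally prompted via their ChatML chat
# template (e.g. tokenizer.apply_chat_template); the plain "User:/Assistant:"
# prompt above is a simplification that works for a demo but may reduce
# response quality.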
def get_model_info():
    """Return model stats as Markdown."""
    info = f"""### 🚀 HyperSpeed Model Info
**Model:** {MODEL_NAME}
**Format:** HyperSpeed (.hyper)
**Total Tensors:** {len(engine.list_weights())}

**Features:**
- ⚡ Adaptive quantization (2/4/8-bit)
- 🎯 Cache-optimized layouts
- 📦 2-4x compression
- 🌊 Token streaming support

**Metadata:**
```json
{engine.metadata}
```
"""
    return info
# Create Gradio interface
with gr.Blocks(title="HyperSpeed Chatbot", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🚀 HyperSpeed Chat - Qwen2.5-0.5B
    Ultra-fast CPU inference with adaptive quantization and token streaming!

    **Yes, HyperSpeed streams tokens!** ⚡ Enable streaming below to see tokens appear in real time.
    """)

    with gr.Tab("💬 Chat"):
        chatbot = gr.Chatbot(
            label="Conversation",
            height=500,
            show_copy_button=True,
        )
        msg = gr.Textbox(
            label="Message",
            placeholder="Type your message here...",
            lines=2,
        )
        with gr.Row():
            submit_btn = gr.Button("Send", variant="primary")
            clear_btn = gr.Button("Clear")

        with gr.Accordion("⚙️ Settings", open=False):
            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher = more creative",
            )
            max_tokens = gr.Slider(
                minimum=10,
                maximum=500,
                value=150,
                step=10,
                label="Max Tokens",
                info="Maximum length of response",
            )
            stream_toggle = gr.Checkbox(
                label="Enable Streaming 🌊",
                value=True,
                info="Stream tokens in real time",
            )
        # Chat functionality
        msg.submit(
            chat_respond,
            inputs=[msg, chatbot, temperature, max_tokens, stream_toggle],
            outputs=chatbot,
        ).then(
            lambda: "",
            outputs=msg,
        )
        submit_btn.click(
            chat_respond,
            inputs=[msg, chatbot, temperature, max_tokens, stream_toggle],
            outputs=chatbot,
        ).then(
            lambda: "",
            outputs=msg,
        )
        clear_btn.click(lambda: None, None, chatbot, queue=False)
| with gr.Tab("βΉοΈ Model Info"): | |
| info_btn = gr.Button("Show Model Info") | |
| info_output = gr.Markdown() | |
| info_btn.click(get_model_info, outputs=info_output) | |
| with gr.Tab("π Weight Inspector"): | |
| gr.Markdown("### Explore model weights") | |
| weight_dropdown = gr.Dropdown( | |
| choices=engine.list_weights()[:50], # Show first 50 | |
| label="Select weight tensor" | |
| ) | |
| inspect_btn = gr.Button("Inspect Weight") | |
| weight_info = gr.Markdown() | |
| def inspect_weight(name): | |
| if not name: | |
| return "Please select a weight" | |
| w = engine.get_weight(name) | |
| import numpy as np | |
| info = f"""## Weight: `{name}` | |
| **Shape:** {w.shape} | |
| **Size:** {w.size:,} parameters | |
| **Dtype:** {w.dtype} | |
| **Statistics:** | |
| - Mean: {w.mean():.6f} | |
| - Std: {w.std():.6f} | |
| - Min: {w.min():.6f} | |
| - Max: {w.max():.6f} | |
| - Median: {np.median(w):.6f} | |
| **Sample values (first 10):** | |
| ``` | |
| {', '.join([f'{v:.4f}' for v in w.flatten()[:10]])} | |
| ``` | |
| """ | |
| return info | |
| inspect_btn.click( | |
| inspect_weight, | |
| inputs=weight_dropdown, | |
| outputs=weight_info | |
| ) | |
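        # Note: inspect_weight assumes engine.get_weight() returns a
        # dequantized NumPy array, so the statistics above describe the
        # dequantized values rather than the packed 2/4/8-bit storage.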
| gr.Markdown(""" | |
| --- | |
| ### π― About HyperSpeed | |
| HyperSpeed is a new LLM format optimized for CPU inference: | |
| - **Adaptive Quantization**: 2/4/8-bit based on weight importance | |
| - **Cache-Optimized**: Better CPU cache utilization | |
| - **Fast Loading**: Lazy dequantization on-demand | |
| - **Token Streaming**: Real-time response generation β‘ | |
| - **Compression**: 2-4x smaller than original models | |
| [GitHub](https://github.com/Smilyai-labs/Hyperspeed) | Install: `pip install git+https://github.com/Smilyai-labs/Hyperspeed.git` | |
| """) | |
if __name__ == "__main__":
    demo.queue()  # Enable the queue (required for streaming generator outputs)
    demo.launch()
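# Minimal non-Gradio usage sketch, assuming the same Hyperspeed API exercised
# above (kept as a comment so it never runs inside the Space; the names are
# the ones this app already uses):
#
#   from Hyperspeed import HyperSpeedEngine, convert_model
#   convert_model("model.safetensors", "qwen.hyper")
#   engine = HyperSpeedEngine("qwen.hyper")
#   engine.load_tokenizer("Qwen/Qwen2.5-0.5B-Instruct")
#   for tok in engine.generate("Hello!", max_tokens=32, stream=True):
#       print(tok, end="", flush=True)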