import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch
import spaces

# Load the model and tokenizer once at startup
print("Loading VibeThinker model...")
model = AutoModelForCausalLM.from_pretrained(
    "WeiboAI/VibeThinker-1.5B",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "WeiboAI/VibeThinker-1.5B",
    trust_remote_code=True,
)
print("Model loaded successfully!")

@spaces.GPU
def respond(message, history):
    """
    Generate a streaming response for the chatbot.

    Args:
        message: The user's current message.
        history: Previous conversation turns as a list of
            {"role": ..., "content": ...} dicts (Gradio "messages" format).
    """
    # Copy the history so Gradio's state is not mutated in place
    messages = list(history) if history else []

    # Append the current user message
    messages.append({"role": "user", "content": message})

    # Apply the model's chat template to build the prompt string
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize and move the inputs to the model's device
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Sampling settings from the official usage example
    generation_config = GenerationConfig(
        max_new_tokens=4000,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        top_k=None,
    )

    generated_ids = model.generate(
        **model_inputs,
        generation_config=generation_config,
    )

    # Trim the prompt tokens from the output, keeping only newly generated tokens
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode, dropping special tokens such as end-of-turn markers
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Simulate streaming by yielding the response one character at a time
    partial_response = ""
    for char in response:
        partial_response += char
        yield partial_response
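
# Note: the loop above only simulates streaming; the full response is generated
# before the first character is shown. A minimal sketch of true token-level
# streaming with transformers' TextIteratorStreamer follows. It is not wired
# into the app; the name `respond_streaming` and its wiring are assumptions,
# not part of the original Space.
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# @spaces.GPU
# def respond_streaming(message, history):
#     messages = list(history) if history else []
#     messages.append({"role": "user", "content": message})
#     text = tokenizer.apply_chat_template(
#         messages, tokenize=False, add_generation_prompt=True
#     )
#     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
#     # The streamer yields decoded text as generate() produces tokens
#     streamer = TextIteratorStreamer(
#         tokenizer, skip_prompt=True, skip_special_tokens=True
#     )
#     Thread(
#         target=model.generate,
#         kwargs=dict(**model_inputs, streamer=streamer, max_new_tokens=4000),
#     ).start()
#     partial = ""
#     for token_text in streamer:
#         partial += token_text
#         yield partial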

# Build the Gradio interface
with gr.Blocks(
    theme=gr.themes.Soft(),
    css="""
    .header-link { text-decoration: none; color: inherit; }
    .header-link:hover { text-decoration: underline; }
    """,
) as demo:
    gr.Markdown(
        """
        # 💭 VibeThinker Chatbot

        Chat with [WeiboAI/VibeThinker-1.5B](https://huggingface.co/WeiboAI/VibeThinker-1.5B),
        a compact 1.5B-parameter reasoning model.

        <a href="https://huggingface.co/spaces/akhaliq/anycoder" class="header-link">Built with anycoder</a>
        """
    )

    gr.ChatInterface(
        fn=respond,
        type="messages",
        title="",
        description="Ask me anything! I'm powered by VibeThinker with ZeroGPU acceleration.",
        examples=[
            "What is 2 + 2?",
            "Tell me a short joke",
            "What is the capital of France?",
            "Explain AI in one sentence",
        ],
        cache_examples=False,
        # Render <think>...</think> reasoning traces instead of stripping them
        chatbot=gr.Chatbot(allow_tags=["think"]),
    )

    gr.Markdown(
        """
        ### About VibeThinker

        VibeThinker is a 1.5B-parameter model designed for thoughtful, step-by-step
        responses. This demo samples with temperature 0.6 and top_p 0.95 to balance
        creativity and coherence.

        **Powered by ZeroGPU** for efficient GPU resource allocation.
        """
    )

if __name__ == "__main__":
    demo.launch()
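
# Local usage sketch (an assumption, not part of the original Space):
#   pip install gradio spaces transformers accelerate torch
#   python app.py
# `device_map="auto"` requires accelerate. On a ZeroGPU Space, the `spaces`
# package allocates a GPU only while a @spaces.GPU-decorated function runs.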