# akhaliq's picture
# akhaliq HF Staff
# Update app.py
# 7022707 verified
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch
import spaces
import re
# Load the VibeThinker checkpoint and its tokenizer once, at import time.
# Both are fetched from the same Hub repository, so the id is named once.
MODEL_ID = "WeiboAI/VibeThinker-1.5B"

print("Loading VibeThinker model...")

# Weights are loaded in bfloat16 with device_map="auto"; trust_remote_code=True
# permits the repository's own modelling code to be used when loading.
_model_load_options = {
    "low_cpu_mem_usage": True,
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_model_load_options)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

print("Model loaded successfully!")
@spaces.GPU
def respond(message, history):
    """
    Generate a reply for the chatbot and yield it incrementally.

    The whole completion is produced by a single ``model.generate`` call.
    The decoded text is then replayed one character at a time, each yield
    carrying the cumulative text so far, which gives a streaming effect in
    the UI.

    Args:
        message: The user's current message.
        history: Previous conversation messages; may be empty or ``None``.
            The items are handed to the tokenizer's chat template unchanged
            (the ChatInterface in this file is configured with
            ``type="messages"``, so presumably role/content dicts).

    Yields:
        str: The response text accumulated so far, growing by one character
        on every step.
    """
    # Work on a copy: appending the new user turn must not mutate the list
    # object owned by the caller.
    messages = list(history) if history else []
    messages.append({"role": "user", "content": message})

    # Render the conversation into the model's prompt format, ending with the
    # generation prompt so the model continues as the assistant.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize as a batch of one and move the tensors to the model's device.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Sampling settings, kept as a plain dict and wrapped in GenerationConfig
    # at call time.
    generation_config = dict(
        max_new_tokens=4000,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        top_k=None,
    )

    generated_ids = model.generate(
        **model_inputs,
        generation_config=GenerationConfig(**generation_config),
    )

    # generate() returns prompt + completion; drop the prompt tokens so only
    # the newly generated part is decoded.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Replay the finished text character by character for a streaming effect.
    partial_response = ""
    for char in response:
        partial_response += char
        yield partial_response
# Assemble the Gradio UI: header text, the chat widget bound to respond(),
# and an "about" footer. Static text lives in constants so the layout code
# below stays short.
CUSTOM_CSS = """
.header-link { text-decoration: none; color: inherit; }
.header-link:hover { text-decoration: underline; }
"""

HEADER_MARKDOWN = """
# 💭 VibeThinker Chatbot
Chat with [WeiboAI/VibeThinker-1.5B](https://huggingface.co/WeiboAI/VibeThinker-1.5B) - a powerful conversational AI model.
<a href="https://huggingface.co/spaces/akhaliq/anycoder" class="header-link">Built with anycoder</a>
"""

ABOUT_MARKDOWN = """
### About VibeThinker
VibeThinker is a 1.5B parameter conversational AI model designed for engaging and thoughtful conversations.
The model uses temperature sampling (0.6) for balanced creativity and coherence.
**Powered by ZeroGPU** for efficient GPU resource allocation.
"""

EXAMPLE_PROMPTS = [
    "What is 2 + 2?",
    "Tell me a short joke",
    "What is the capital of France?",
    "Explain AI in one sentence",
]

with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
    gr.Markdown(HEADER_MARKDOWN)

    # The page title is already rendered by the header above, so the chat
    # widget's own title is left empty. The Chatbot is created with
    # allow_tags=["think"] so <think> tags are permitted in its messages.
    gr.ChatInterface(
        fn=respond,
        type="messages",
        title="",
        description="Ask me anything! I'm powered by VibeThinker with ZeroGPU acceleration.",
        examples=EXAMPLE_PROMPTS,
        cache_examples=False,
        chatbot=gr.Chatbot(allow_tags=["think"]),
    )

    gr.Markdown(ABOUT_MARKDOWN)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()