import gradio as gr
import requests
import json
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# llama.cpp server endpoint (OpenAI-compatible chat completions API)
LLAMA_API_URL = "http://localhost:8000/v1/chat/completions"
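# A llama.cpp server must already be listening on this port before the app
# starts. A minimal launch sketch (the GGUF file name is an assumption;
# point -m at your local quantized model file):
#
#   llama-server -m Qwen3-14B-Q4_K_XL.gguf --host 0.0.0.0 --port 8000
#
# llama-server exposes /v1/chat/completions, which this app streams from.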
class QwenChatbot:
    def __init__(self, model="qwen3-14b-q4_k_xl"):
        self.model = model
        self.history = []
    def generate_response(self, user_input, max_new_tokens=512):
        # Soft switch: "/think" forces reasoning, "/no_think" disables it.
        # Thinking is on by default for Qwen3.
        think_mode = not user_input.endswith("/no_think")
        if user_input.endswith("/think"):
            user_input = user_input[: -len("/think")].strip()
        elif user_input.endswith("/no_think"):
            user_input = user_input[: -len("/no_think")].strip()

        # Format messages for llama.cpp
        messages = self.history + [{"role": "user", "content": user_input}]
        if not think_mode:
            # Prefilling an empty think block suppresses Qwen3's reasoning
            # trace (relies on the server continuing a trailing assistant
            # message rather than treating it as a completed turn).
            messages.append({"role": "assistant", "content": "<think>\n\n</think>\n\n"})
        # Call the llama.cpp API with Qwen3's recommended sampling settings
        # (thinking: temperature 0.6 / top_p 0.95; non-thinking: 0.7 / 0.8).
        try:
            response = requests.post(
                LLAMA_API_URL,
                json={
                    "model": self.model,
                    "messages": messages,
                    "max_tokens": max_new_tokens,
                    "temperature": 0.6 if think_mode else 0.7,
                    "top_p": 0.95 if think_mode else 0.8,
                    "top_k": 20,
                    "stream": True
                },
                stream=True,
                timeout=300,
            )
            response.raise_for_status()
full_response = "" | |
for line in response.iter_lines(): | |
if line: | |
chunk = json.loads(line.decode("utf-8").replace("data: ", "")) | |
if "choices" in chunk and chunk["choices"]: | |
content = chunk["choices"][0]["delta"].get("content", "") | |
full_response += content | |
yield full_response | |
self.history.append({"role": "user", "content": user_input}) | |
self.history.append({"role": "assistant", "content": full_response}) | |
except Exception as e: | |
logger.error(f"Error calling llama.cpp API: {e}") | |
yield f"Error: {str(e)}" | |
def chat_function(user_input, history):
    # A fresh QwenChatbot is created per call, so rebuild its message list
    # from the user/assistant pairs that ChatInterface passes in `history`;
    # otherwise every turn would start with no conversational context.
    chatbot = QwenChatbot()
    for user_msg, assistant_msg in history:
        chatbot.history.append({"role": "user", "content": user_msg})
        chatbot.history.append({"role": "assistant", "content": assistant_msg})
    yield from chatbot.generate_response(user_input)
demo = gr.ChatInterface(
    fn=chat_function,
    title="Qwen3 GGUF Chatbot (Streaming)",
    description="Chat with the Qwen3-14B GGUF model via llama.cpp. Append /think for reasoning-mode responses or /no_think for direct responses.",
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Type your message..."),
    submit_btn="Send",
    concurrency_limit=1,
)
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
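
# When deployed as a Hugging Face Space, the app itself only needs these
# dependencies in requirements.txt (json and logging are stdlib; the version
# pin is an illustrative assumption):
#
#   gradio>=4.0
#   requests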