Spaces:

broadfield-dev
/

QWEN3-GGUF

Build error

App Files Files Community

QWEN3-GGUF / _app.py

broadfield-dev

Rename app.py to _app.py

cc12a21 verified 4 months ago

raw

history blame contribute delete

2.8 kB

	import gradio as gr
	import requests
	import json
	import logging

	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# llama.cpp server endpoint
	LLAMA_API_URL = "http://localhost:8000/v1/chat/completions"

	class QwenChatbot:
	def __init__(self, model="qwen3-14b-q4_k_xl"):
	self.model = model
	self.history = []

	def generate_response(self, user_input, max_new_tokens=512):
	think_mode = user_input.endswith("/think")
	if think_mode:
	user_input = user_input.replace("/think", "").strip()
	elif user_input.endswith("/no_think"):
	user_input = user_input.replace("/no_think", "").strip()

	# Format messages for llama.cpp
	messages = self.history + [{"role": "user", "content": user_input}]
	if think_mode:
	messages.append({"role": "assistant", "content": "<think>\n\n</think>\n\n"})

	# Call llama.cpp API
	try:
	response = requests.post(
	LLAMA_API_URL,
	json={
	"model": self.model,
	"messages": messages,
	"max_tokens": max_new_tokens,
	"temperature": 0.6 if think_mode else 0.7,
	"top_p": 0.95 if think_mode else 0.8,
	"top_k": 20,
	"stream": True
	},
	stream=True
	)
	response.raise_for_status()

	full_response = ""
	for line in response.iter_lines():
	if line:
	chunk = json.loads(line.decode("utf-8").replace("data: ", ""))
	if "choices" in chunk and chunk["choices"]:
	content = chunk["choices"][0]["delta"].get("content", "")
	full_response += content
	yield full_response

	self.history.append({"role": "user", "content": user_input})
	self.history.append({"role": "assistant", "content": full_response})

	except Exception as e:
	logger.error(f"Error calling llama.cpp API: {e}")
	yield f"Error: {str(e)}"

	def chat_function(user_input, history):
	chatbot = QwenChatbot()
	for response in chatbot.generate_response(user_input):
	yield response

	demo = gr.ChatInterface(
	fn=chat_function,
	title="Qwen3 GGUF Chatbot (Streaming)",
	description="Chat with Qwen3-14B GGUF model via llama.cpp. Use /think for thoughtful responses, /no_think for direct responses.",
	chatbot=gr.Chatbot(height=500),
	textbox=gr.Textbox(placeholder="Type your message..."),
	submit_btn="Send",
	concurrency_limit=1,
	max_batch_size=1
	)

	if __name__ == "__main__":
	demo.launch(server_name="0.0.0.0", server_port=7860)