# MixtureOfInputs / app.py
# (Hugging Face Space file header — uploaded by yzhuang, commit 2d7f359
#  "Update app.py", verified, 5.48 kB; preserved here as a comment so the
#  file remains valid Python.)
"""Gradio chat demo that streams responses from a (local) OpenAI‑compatible
endpoint using the official `openai` Python SDK. The server is assumed to be
running at http://0.0.0.0:8000 with the v1 REST routes. A custom header
`X‑MIXINPUTS‑BETA` is forwarded so MoI can adjust its blending strength at
runtime.
Launch with:
python app_openai.py
"""
from __future__ import annotations
import os
import openai
import gradio as gr
# ──────────────────────────────────────────────────────────────────────────────
# OpenAI client configuration
# ──────────────────────────────────────────────────────────────────────────────
# ``openai`` still expects an API key even if the backend ignores it, so we use
# a dummy value when none is provided.  The *base_url* points to the local
# vLLM server that speaks the OpenAI REST dialect; it can now be overridden
# with the OPENAI_BASE_URL environment variable (the previous hard-coded
# address remains the default, so existing deployments are unaffected).
# -----------------------------------------------------------------------------
openai.api_key = os.getenv("OPENAI_API_KEY", "EMPTY")
openai.base_url = os.getenv("OPENAI_BASE_URL", "http://0.0.0.0:8000/v1")
# ──────────────────────────────────────────────────────────────────────────────
# Chat handler
# ──────────────────────────────────────────────────────────────────────────────
def stream_completion(message: str,
                      history: list[tuple[str, str]],
                      max_tokens: int,
                      temperature: float,
                      top_p: float,
                      beta: float):
    """Gradio callback that yields streaming assistant replies.

    The prior turns in *history* plus the new *message* are converted to an
    OpenAI-style message list (no system prompt is added) and sent to
    ``openai.chat.completions.create`` with ``stream=True``.  Each incoming
    delta is appended to an ``assistant`` buffer which is yielded back to the
    Chatbot component for real-time display.

    Parameters
    ----------
    message:
        The new user turn.
    history:
        Prior ``(user, assistant)`` tuples from the Chatbot component.
    max_tokens, temperature, top_p:
        Standard sampling controls forwarded to the backend.
    beta:
        MoI blending coefficient, forwarded via the ``X-MIXINPUTS-BETA``
        header so the backend can adjust blending strength at runtime.

    Yields
    ------
    list[tuple[str, str]]
        The updated chat history after each received chunk; on failure a
        single ``[ERROR] ...`` turn is yielded instead of raising.
    """
    # Build OpenAI-style message list from prior turns.
    messages: list[dict[str, str]] = []
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Current user input comes last.
    messages.append({"role": "user", "content": message})

    try:
        # Kick off streaming completion.
        response = openai.chat.completions.create(
            model="Qwen/Qwen3-4B",
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            stream=True,
            # Forward MoI blending coefficient to the backend.
            extra_headers={"X-MIXINPUTS-BETA": str(beta)},
        )

        assistant = ""
        for chunk in response:
            # Bug fix: OpenAI-compatible servers (including vLLM) may emit
            # chunks with an empty ``choices`` list (e.g. keep-alive or
            # final usage chunks); indexing [0] there raised IndexError.
            if not chunk.choices:
                continue
            # ``delta.content`` is None for e.g. role announcements; guard with or "".
            delta = chunk.choices[0].delta.content or ""
            assistant += delta
            yield history + [(message, assistant)]  # live update

    except Exception as err:  # pylint: disable=broad-except
        # Surface backend/connection failures in the chat window instead of
        # crashing the Gradio app.
        yield history + [(message, f"[ERROR] {err}")]
# ──────────────────────────────────────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────────────────────────────────────
# Build the Gradio UI: sampling/MoI sliders, a Chatbot, a textbox that submits
# on Enter, and a clear button.
with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
    gr.Markdown(
        "## 🎨 Mixture of Inputs (MoI) Demo \n"
        "Streaming vLLM demo with dynamic **beta** adjustment in MoI "
        # Bug fix: the arrow was mojibake ("β†’") in the original text.
        "(higher beta → less blending)."
    )

    with gr.Row():  # sliders first
        # Bug fix: the slider label was mojibake ("MoI Ξ²") in the original.
        beta = gr.Slider(0.0, 10.0, value=1.0, step=0.1, label="MoI β")
        temperature = gr.Slider(0.1, 1.0, value=0.6, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.80, step=0.05, label="Top‑p")
        max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max new tokens")

    chatbot = gr.Chatbot(height=450)
    user_box = gr.Textbox(placeholder="Type a message and press Enter…", show_label=False)
    clear_btn = gr.Button("Clear chat")

    # Pressing Enter streams the reply into the Chatbot via stream_completion.
    # NOTE(review): the textbox is not cleared after submit — confirm whether
    # that is intentional before changing it.
    user_box.submit(
        fn=stream_completion,
        inputs=[user_box, chatbot, max_tokens, temperature, top_p, beta],
        outputs=chatbot,
    )
    # Returning None resets the Chatbot; queue=False makes the clear immediate.
    clear_btn.click(lambda: None, None, chatbot, queue=False)

# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    demo.launch()