# MixtureOfInputs / app.py
# (Hugging Face Space file header — uploaded by yzhuang, commit 2d7f359
#  "Update app.py", verified, 5.48 kB; preserved here as a comment so the
#  file remains valid Python.)
"""Gradio chat demo that streams responses from a (local) OpenAI‑compatible
endpoint using the official `openai` Python SDK. The server is assumed to be
running at http://0.0.0.0:8000 with the v1 REST routes. A custom header
`X‑MIXINPUTS‑BETA` is forwarded so MoI can adjust its blending strength at
runtime.
Launch with:
python app_openai.py
"""
from __future__ import annotations
import os
import openai
import gradio as gr
# ──────────────────────────────────────────────────────────────────────────────
# OpenAI client configuration
# ──────────────────────────────────────────────────────────────────────────────
# ``openai`` still expects an API key even if the backend ignores it, so we use
# a dummy value when none is provided.  The *base_url* points to the local
# vLLM server that speaks the OpenAI REST dialect; it can now be overridden
# with the OPENAI_BASE_URL environment variable (the previous hard-coded
# address remains the default, so existing deployments are unaffected).
# -----------------------------------------------------------------------------
openai.api_key = os.getenv("OPENAI_API_KEY", "EMPTY")
openai.base_url = os.getenv("OPENAI_BASE_URL", "http://0.0.0.0:8000/v1")
# ──────────────────────────────────────────────────────────────────────────────
# Chat handler
# ──────────────────────────────────────────────────────────────────────────────
def stream_completion(message: str,
                      history: list[tuple[str, str]],
                      max_tokens: int,
                      temperature: float,
                      top_p: float,
                      beta: float):
    """Gradio callback that yields streaming assistant replies.

    The prior turns in *history* plus the new *message* are converted to an
    OpenAI-style message list (no system prompt is added) and sent to
    ``openai.chat.completions.create`` with ``stream=True``.  Each incoming
    delta is appended to an ``assistant`` buffer which is yielded back to the
    Chatbot component for real-time display.

    Parameters
    ----------
    message:
        The new user turn.
    history:
        Prior ``(user, assistant)`` tuples from the Chatbot component.
    max_tokens, temperature, top_p:
        Standard sampling controls forwarded to the backend.
    beta:
        MoI blending coefficient, forwarded via the ``X-MIXINPUTS-BETA``
        header so the backend can adjust blending strength at runtime.

    Yields
    ------
    list[tuple[str, str]]
        The updated chat history after each received chunk; on failure a
        single ``[ERROR] ...`` turn is yielded instead of raising.
    """
    # Build OpenAI-style message list from prior turns.
    messages: list[dict[str, str]] = []
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Current user input comes last.
    messages.append({"role": "user", "content": message})

    try:
        # Kick off streaming completion.
        response = openai.chat.completions.create(
            model="Qwen/Qwen3-4B",
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            stream=True,
            # Forward MoI blending coefficient to the backend.
            extra_headers={"X-MIXINPUTS-BETA": str(beta)},
        )

        assistant = ""
        for chunk in response:
            # Bug fix: OpenAI-compatible servers (including vLLM) may emit
            # chunks with an empty ``choices`` list (e.g. keep-alive or
            # final usage chunks); indexing [0] there raised IndexError.
            if not chunk.choices:
                continue
            # ``delta.content`` is None for e.g. role announcements; guard with or "".
            delta = chunk.choices[0].delta.content or ""
            assistant += delta
            yield history + [(message, assistant)]  # live update

    except Exception as err:  # pylint: disable=broad-except
        # Surface backend/connection failures in the chat window instead of
        # crashing the Gradio app.
        yield history + [(message, f"[ERROR] {err}")]
# ──────────────────────────────────────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────────────────────────────────────
# Build the Gradio UI: sampling/MoI sliders, a Chatbot, a textbox that submits
# on Enter, and a clear button.
with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
    gr.Markdown(
        "## 🎨 Mixture of Inputs (MoI) Demo \n"
        "Streaming vLLM demo with dynamic **beta** adjustment in MoI "
        # Bug fix: the arrow was mojibake ("β†’") in the original text.
        "(higher beta → less blending)."
    )

    with gr.Row():  # sliders first
        # Bug fix: the slider label was mojibake ("MoI Ξ²") in the original.
        beta = gr.Slider(0.0, 10.0, value=1.0, step=0.1, label="MoI β")
        temperature = gr.Slider(0.1, 1.0, value=0.6, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.80, step=0.05, label="Top‑p")
        max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max new tokens")

    chatbot = gr.Chatbot(height=450)
    user_box = gr.Textbox(placeholder="Type a message and press Enter…", show_label=False)
    clear_btn = gr.Button("Clear chat")

    # Pressing Enter streams the reply into the Chatbot via stream_completion.
    # NOTE(review): the textbox is not cleared after submit — confirm whether
    # that is intentional before changing it.
    user_box.submit(
        fn=stream_completion,
        inputs=[user_box, chatbot, max_tokens, temperature, top_p, beta],
        outputs=chatbot,
    )
    # Returning None resets the Chatbot; queue=False makes the clear immediate.
    clear_btn.click(lambda: None, None, chatbot, queue=False)

# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    demo.launch()