| import gradio as gr |
| from openai import OpenAI |
|
|
| |
# OpenAI-compatible client pointed at a local vLLM server.  vLLM does not
# validate the API key, so any placeholder token is accepted.
client = OpenAI(
    api_key="token-not-needed",
    base_url="http://localhost:8004/v1",
)
|
|
def predict(message, history):
    """Stream a chat completion for *message* given the Gradio chat history.

    Parameters
    ----------
    message : str
        The latest user message typed into the chat box.
    history : list
        Prior turns supplied by Gradio.  Depending on the Gradio version /
        ChatInterface configuration this is either a list of
        ``(user_text, assistant_text)`` pairs (tuple format) or a list of
        OpenAI-style ``{"role": ..., "content": ...}`` dicts (messages
        format).  Both are accepted.

    Yields
    ------
    str
        The assistant reply accumulated so far — Gradio's streaming
        contract expects the full partial text on each yield, not deltas.
    """
    messages = _history_to_openai(history)
    messages.append({"role": "user", "content": message})

    # stream=True makes the server send token deltas so the UI can update
    # incrementally instead of waiting for the full completion.
    response = client.chat.completions.create(
        model="Qwen/Qwen3-30B-A3B-Instruct-2507",
        messages=messages,
        temperature=0.7,
        stream=True,
    )

    partial_message = ""
    for chunk in response:
        delta = chunk.choices[0].delta.content
        # The final chunk (and role-only chunks) carry content=None.
        if delta is not None:
            partial_message += delta
            yield partial_message


def _history_to_openai(history):
    """Normalize Gradio chat history into a list of OpenAI message dicts.

    Handles both history layouts Gradio can emit; entries that match
    neither shape are skipped rather than crashing the request.
    """
    messages = []
    for turn in history:
        if isinstance(turn, dict):
            # Gradio "messages" format: already role/content shaped.
            role = turn.get("role")
            content = turn.get("content")
            if role in ("user", "assistant") and content is not None:
                messages.append({"role": role, "content": str(content)})
        elif len(turn) >= 2:
            # Tuple format: one (user, assistant) pair per turn.
            messages.append({"role": "user", "content": str(turn[0])})
            messages.append({"role": "assistant", "content": str(turn[1])})
    return messages
|
|
| |
# Build the chat UI; Gradio streams whatever ``predict`` yields back to
# the browser as the assistant's reply.
demo = gr.ChatInterface(
    predict,
    title="Qwen3 vLLM Chat",
    description="Interface for Qwen/Qwen3-30B-A3B-Instruct-2507 running on vLLM",
    examples=[
        "What is the capital of France?",
        "Write a Python function for quicksort.",
    ],
)
|
|
| if __name__ == "__main__": |
| demo.launch(server_name="0.0.0.0", server_port=7860, share=True) |