Spaces:

plarnholt
/

excom-ai-demo

Paused

excom-ai-demo / app.py

Peter Larnholt

Upgrade to vLLM 0.6.3.post1 and remove pyairports workarounds

e48919a about 1 month ago

3.54 kB

	"""
	HF Spaces (Docker SDK) app
	- Launches vLLM (OpenAI-compatible) on localhost:API_PORT
	- FastAPI proxies /v1/* → vLLM (so clients can use OpenAI SDK / LangChain)
	- Gradio UI at "/"
	- Defaults for A10G 24GB (Qwen 2.5 14B AWQ, 8k context)
	"""

	import os, time, threading, subprocess, requests
	from fastapi import FastAPI, Request, Response
	import gradio as gr

	# ---- Runtime configuration: every value can be overridden via the environment ----
	MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-14B-Instruct-AWQ")
	API_PORT = int(os.environ.get("API_PORT", "8000"))  # vLLM internal port
	SYSTEM_PROMPT = os.environ.get(
	    "SYSTEM_PROMPT",
	    "You are ExCom AI, a professional assistant that answers precisely and clearly.",
	)
	# Sizing knobs. The defaults are the values that used to be hard-coded, so
	# behaviour is unchanged unless these variables are set (e.g. for another GPU).
	MAX_MODEL_LEN = os.environ.get("MAX_MODEL_LEN", "8192")  # default fits A10G 24GB
	GPU_MEMORY_UTILIZATION = os.environ.get("GPU_MEMORY_UTILIZATION", "0.90")

	# Command line used to start the vLLM OpenAI-compatible API server.
	VLLM_ARGS = [
	    "python3", "-m", "vllm.entrypoints.openai.api_server",
	    "--model", MODEL_ID,
	    "--host", "0.0.0.0",
	    "--port", str(API_PORT),
	    "--served-model-name", "excom-ai",
	    "--max-model-len", MAX_MODEL_LEN,
	    "--gpu-memory-utilization", GPU_MEMORY_UTILIZATION,
	    "--trust-remote-code",
	]
	# The quantization flag is only added when the model id mentions AWQ.
	if "AWQ" in MODEL_ID.upper():
	    VLLM_ARGS += ["--quantization", "awq_marlin"]  # faster AWQ kernel if available

	def launch_vllm():
	    """Spawn the vLLM server as a child process.

	    Prints which model is being launched, then starts the command held in
	    VLLM_ARGS. The call does not wait for the process and does not keep the
	    process handle.
	    """
	    print("[vLLM] Launch:", MODEL_ID)
	    subprocess.Popen(args=VLLM_ARGS)

	def wait_vllm_ready(timeout=900, interval=3):
	    """Poll the local vLLM server until it responds or the timeout elapses.

	    Args:
	        timeout: Maximum number of seconds to keep polling.
	        interval: Seconds to sleep between two polls.

	    Returns:
	        True as soon as GET /v1/models answers with a successful status,
	        False if that does not happen within ``timeout`` seconds.
	    """
	    url = f"http://127.0.0.1:{API_PORT}/v1/models"
	    # Monotonic clock: the deadline must not shift if the wall clock is adjusted.
	    deadline = time.monotonic() + timeout
	    while time.monotonic() < deadline:
	        try:
	            if requests.get(url, timeout=3).ok:
	                print("[vLLM] Ready.")
	                return True
	        except requests.RequestException:
	            # Server is not reachable yet; this is expected during start-up.
	            pass
	        time.sleep(interval)
	    print("[vLLM] Not ready in time.")
	    return False

	# Start vLLM and a readiness poller in the background at import time. Both
	# run as daemon threads so they never keep the interpreter alive on exit.
	for _target in (launch_vllm, wait_vllm_ready):
	    threading.Thread(target=_target, daemon=True).start()

	# FastAPI application that carries the proxy routes defined below.
	app = FastAPI()

	@app.get("/health")
	def health():
	    """Report whether the local vLLM server answers GET /v1/models.

	    Returns:
	        ``{"upstream_ok": <bool>}`` when the request completes, or
	        ``{"upstream_ok": False, "error": <message>}`` when it raises.
	    """
	    models_url = f"http://127.0.0.1:{API_PORT}/v1/models"
	    try:
	        upstream = requests.get(models_url, timeout=2)
	        status = {"upstream_ok": upstream.ok}
	    except Exception as exc:
	        status = {"upstream_ok": False, "error": str(exc)}
	    return status

	@app.get("/v1/models")
	def proxy_models():
	    """Proxy GET /v1/models to the local vLLM server.

	    Returns:
	        The upstream body, status code and content type unchanged, or a
	        503 JSON error when the upstream request itself fails (for example
	        because vLLM is not accepting connections yet).
	    """
	    import json  # function-scope: only needed for the error payload

	    try:
	        r = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=30)
	    except requests.RequestException as exc:
	        # Without this guard a connection failure surfaces as an unhandled 500.
	        error = {"error": {"message": f"vLLM upstream unavailable: {exc}",
	                           "type": "upstream_unavailable"}}
	        return Response(content=json.dumps(error),
	                        media_type="application/json",
	                        status_code=503)
	    return Response(
	        content=r.content,
	        media_type=r.headers.get("content-type", "application/json"),
	        status_code=r.status_code,
	    )

	@app.post("/v1/chat/completions")
	async def proxy_chat(req: Request):
	    """Proxy POST /v1/chat/completions to the local vLLM server.

	    The raw request body is forwarded as JSON. The upstream body, status
	    code and content type are returned unchanged.

	    Args:
	        req: Incoming request whose body is forwarded verbatim.

	    Returns:
	        The upstream response, or a 503 JSON error when the upstream
	        request itself fails.
	    """
	    import asyncio  # function-scope imports: only this handler needs them
	    import json

	    body = await req.body()
	    try:
	        # requests is blocking and may take up to 600 s; run it in a worker
	        # thread so the event loop keeps serving other requests meanwhile.
	        r = await asyncio.to_thread(
	            requests.post,
	            f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
	            data=body,
	            headers={"Content-Type": "application/json"},
	            timeout=600,
	        )
	    except requests.RequestException as exc:
	        # Without this guard a connection failure surfaces as an unhandled 500.
	        error = {"error": {"message": f"vLLM upstream unavailable: {exc}",
	                           "type": "upstream_unavailable"}}
	        return Response(content=json.dumps(error),
	                        media_type="application/json",
	                        status_code=503)
	    # NOTE: r.content is the fully buffered body, so the reply is sent to the
	    # client in one piece once the upstream call has finished.
	    return Response(
	        content=r.content,
	        media_type=r.headers.get("content-type", "application/json"),
	        status_code=r.status_code,
	    )

	# -------- Gradio (messages mode) --------
	# Remembers a successful readiness check so later calls skip the polling.
	_ready = {"ok": False}


	def ensure_ready():
	    """Return True once vLLM has answered, polling for up to 60 seconds.

	    A successful check is cached in ``_ready``; a failed one is not, so the
	    next call polls again.
	    """
	    if not _ready["ok"]:
	        if not wait_vllm_ready(timeout=60):
	            return False
	        _ready["ok"] = True
	    return True

	def chat_fn(user_message: str, history: list[dict]):
	    """Answer one chat turn by calling the local vLLM chat completions API.

	    Args:
	        user_message: Text of the new user turn.
	        history: Earlier turns as message dicts, inserted between the system
	            prompt and the new user turn.

	    Returns:
	        The content of the first choice in the completion, or a
	        "model is loading" notice when vLLM is not ready yet.

	    Raises:
	        requests.HTTPError: If the upstream replies with an error status.
	    """
	    if not ensure_ready():
	        return "⏳ Model is loading… please retry shortly."

	    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
	    user_turn = {"role": "user", "content": user_message}
	    payload = {
	        "model": "excom-ai",
	        "messages": [system_turn] + history + [user_turn],
	        "temperature": 0.4,
	    }

	    endpoint = f"http://127.0.0.1:{API_PORT}/v1/chat/completions"
	    reply = requests.post(endpoint, json=payload, timeout=600)
	    reply.raise_for_status()
	    first_choice = reply.json()["choices"][0]
	    return first_choice["message"]["content"]

	# Chat UI backed by chat_fn; history is exchanged as message dicts.
	ui = gr.ChatInterface(
	    fn=chat_fn,
	    title="ExCom AI — Qwen 2.5 14B AWQ (vLLM)",
	    type="messages",
	)
	ui.queue()
	# Mount the UI at "/" on the FastAPI app, after the API routes were registered.
	app = gr.mount_gradio_app(app, ui, path="/")