"""
HF Spaces (Docker SDK) app
- Launches vLLM (OpenAI-compatible) on API_PORT (bound to 0.0.0.0; this app reaches it via 127.0.0.1)
- FastAPI proxies /v1/models and /v1/chat/completions → vLLM (so clients can use OpenAI SDK / LangChain)
- Gradio UI at "/"
- Defaults for A10G 24GB (Qwen 2.5 14B AWQ, 8k context)
"""

import os, time, threading, subprocess, requests
from fastapi import FastAPI, Request, Response
import gradio as gr

# Runtime configuration; each value can be overridden via an environment variable.
MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-14B-Instruct-AWQ")
# Port the vLLM server listens on; the routes below call it on 127.0.0.1.
API_PORT = int(os.getenv("API_PORT", "8000"))
SYSTEM_PROMPT = os.getenv(
    "SYSTEM_PROMPT",
    "You are ExCom AI, a professional assistant that answers precisely and clearly.",
)

# Command line used to start the vLLM OpenAI-compatible API server.
VLLM_ARGS = [
    "python3", "-m", "vllm.entrypoints.openai.api_server",
    "--model", MODEL_ID,
    "--host", "0.0.0.0",
    "--port", str(API_PORT),
    # Name clients must send in the "model" field of their requests.
    "--served-model-name", "excom-ai",
    # 8k context, chosen to fit an A10G 24GB.
    "--max-model-len", "8192",
    "--gpu-memory-utilization", "0.90",
    "--trust-remote-code",
    # For AWQ checkpoints, request the Marlin kernel (faster if available).
    *(["--quantization", "awq_marlin"] if "AWQ" in MODEL_ID.upper() else []),
]

def launch_vllm():
    """Spawn the vLLM API server as a background child process.

    Runs the command held in ``VLLM_ARGS`` through ``subprocess.Popen`` and
    returns right away without waiting for the server to come up. The process
    handle is not retained.
    """
    announcement = f"[vLLM] Launch: {MODEL_ID}"
    print(announcement)
    subprocess.Popen(args=VLLM_ARGS)

def wait_vllm_ready(timeout=900, interval=3):
    """Poll the local vLLM server until ``GET /v1/models`` succeeds.

    Args:
        timeout: Maximum number of seconds to keep polling.
        interval: Seconds to sleep between two attempts.

    Returns:
        True as soon as the endpoint answers with a successful status,
        False if ``timeout`` elapses first.
    """
    url = f"http://127.0.0.1:{API_PORT}/v1/models"
    # Monotonic clock: the deadline is unaffected by wall-clock adjustments
    # that may happen during a long model load.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            if requests.get(url, timeout=3).ok:
                print("[vLLM] Ready.")
                return True
        except requests.RequestException:
            # Refused connections and timeouts are expected while the server
            # is still starting; keep polling.
            pass
        time.sleep(interval)
    print("[vLLM] Not ready in time.")
    return False

# Start vLLM as soon as this module is imported. Popen inside launch_vllm does
# not block, so this thread finishes right after spawning the server process.
threading.Thread(target=launch_vllm, daemon=True).start()
# Poll for readiness in the background. This thread only logs the outcome:
# its return value is discarded and it does not update any shared state.
threading.Thread(target=wait_vllm_ready, daemon=True).start()

# FastAPI application hosting the health check, the /v1 proxy routes and,
# further down, the mounted Gradio UI.
app = FastAPI()

@app.get("/health")
def health():
    """Report whether the local vLLM server currently answers.

    Returns:
        ``{"upstream_ok": <bool>}`` reflecting the status of a
        ``GET /v1/models`` probe. If the probe itself raises, returns
        ``{"upstream_ok": False, "error": <message>}`` instead.
    """
    probe_url = f"http://127.0.0.1:{API_PORT}/v1/models"
    try:
        upstream_ok = requests.get(probe_url, timeout=2).ok
    except Exception as exc:
        # A health endpoint should report failures, not raise them.
        return {"upstream_ok": False, "error": str(exc)}
    return {"upstream_ok": upstream_ok}

@app.get("/v1/models")
def proxy_models():
    """Forward ``GET /v1/models`` to the local vLLM server.

    Returns:
        A response carrying the upstream body, status code and content type.
        If vLLM cannot be reached or does not answer in time (for example
        while the model is still loading), a 503 response with a JSON error
        body is returned instead of letting the exception escape as a 500.
    """
    import json  # local import: only needed on the error path

    try:
        upstream = requests.get(f"http://127.0.0.1:{API_PORT}/v1/models", timeout=30)
    except requests.RequestException:
        error_body = {
            "error": {
                "message": "vLLM upstream is not reachable (it may still be loading the model).",
                "type": "upstream_unavailable",
            }
        }
        return Response(
            content=json.dumps(error_body),
            media_type="application/json",
            status_code=503,
        )
    return Response(
        content=upstream.content,
        media_type=upstream.headers.get("content-type", "application/json"),
        status_code=upstream.status_code,
    )

@app.post("/v1/chat/completions")
async def proxy_chat(req: Request):
    """Forward ``POST /v1/chat/completions`` to the local vLLM server.

    The raw request body is sent upstream unchanged with a JSON content type.

    Args:
        req: Incoming request whose body is the chat-completions payload.

    Returns:
        A response carrying the upstream body, status code and content type.
        The upstream body is read completely before it is returned, so a
        streamed completion reaches the client in one piece. If vLLM cannot
        be reached a 503 is returned; if it does not answer within the
        timeout a 504 is returned. Both carry a JSON error body.
    """
    import asyncio  # local imports keep this block self-contained
    import json

    body = await req.body()
    try:
        # requests is blocking. Running it in a worker thread keeps the event
        # loop free to serve other requests while the model is generating.
        upstream = await asyncio.to_thread(
            requests.post,
            f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
            data=body,
            headers={"Content-Type": "application/json"},
            timeout=600,
        )
    except requests.RequestException as exc:
        timed_out = isinstance(exc, requests.Timeout)
        error_body = {
            "error": {
                "message": (
                    "vLLM upstream timed out."
                    if timed_out
                    else "vLLM upstream is not reachable (it may still be loading the model)."
                ),
                "type": "upstream_timeout" if timed_out else "upstream_unavailable",
            }
        }
        return Response(
            content=json.dumps(error_body),
            media_type="application/json",
            status_code=504 if timed_out else 503,
        )
    return Response(
        content=upstream.content,
        media_type=upstream.headers.get("content-type", "application/json"),
        status_code=upstream.status_code,
    )

# -------- Gradio (messages mode) --------
# Remembers a successful readiness probe so later chats skip the check.
_ready = {"ok": False}

def ensure_ready():
    """Return True once vLLM is known to be reachable.

    A previous success is remembered in ``_ready``. Otherwise the server is
    probed via ``wait_vllm_ready`` for up to 60 seconds, and False is returned
    if it is still not answering.
    """
    if not _ready["ok"]:
        if not wait_vllm_ready(timeout=60):
            return False
        _ready["ok"] = True
    return True

def chat_fn(user_message: str, history: list[dict]) -> str:
    """Gradio chat handler: send the conversation to vLLM and return the reply.

    Args:
        user_message: The text the user just submitted.
        history: Earlier turns as message dicts (``role`` / ``content``), as
            supplied by the chat UI.

    Returns:
        The assistant's reply text, or a short "model is loading" notice when
        vLLM is not ready yet.

    Raises:
        requests.RequestException: If the request to vLLM fails or vLLM
            answers with an error status.
    """
    if not ensure_ready():
        return "⏳ Model is loading… please retry shortly."
    # Forward only the chat fields. History entries may carry extra UI-side
    # keys (presumably e.g. "metadata" — depends on the Gradio version) that
    # are not part of the chat-completions message schema.
    prior_turns = [
        {key: turn[key] for key in ("role", "content") if key in turn}
        for turn in (history or [])
    ]
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        *prior_turns,
        {"role": "user", "content": user_message},
    ]
    payload = {"model": "excom-ai", "messages": messages, "temperature": 0.4}
    r = requests.post(
        f"http://127.0.0.1:{API_PORT}/v1/chat/completions",
        json=payload,
        timeout=600,
    )
    r.raise_for_status()
    return r.json()["choices"][0]["message"]["content"]

# Chat UI backed by chat_fn; history is exchanged as a list of message dicts.
ui = gr.ChatInterface(
    fn=chat_fn,
    title="ExCom AI — Qwen 2.5 14B AWQ (vLLM)",
    type="messages",
)
ui.queue()
# Serve the UI from the root path of the FastAPI app defined above.
app = gr.mount_gradio_app(app, ui, path="/")