umairali64488 committed
Commit c8cd4b6 · verified · 1 Parent(s): 47a1a1c

Upload 6 files

Files changed (6)
  1. app/__init__.py +0 -0
  2. app/config.py +39 -0
  3. app/llm_chain.py +198 -0
  4. app/llm_factory.py +54 -0
  5. app/main.py +149 -0
  6. app/prompts.py +73 -0
app/__init__.py ADDED
File without changes
app/config.py ADDED
@@ -0,0 +1,39 @@
+ """
+ config.py — Application settings via pydantic-settings.
+ Set OPENROUTER_API_KEY in .env locally or in HF Space Secrets.
+ """
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+ class Settings(BaseSettings):
+     model_config = SettingsConfigDict(env_file=".env", extra="ignore")
+
+     # ── OpenRouter ────────────────────────────────────────────────────────────
+     OPENROUTER_API_KEY: str = ""
+     BASE_URL: str = "https://openrouter.ai/api/v1"
+
+     # ── Panel LLMs (queried in parallel) ──────────────────────────────────────
+     PANEL_MODELS: list[str] = [
+         "arcee-ai/trinity-large-preview:free",
+         "stepfun/step-3.5-flash:free",
+         "nvidia/nemotron-3-nano-30b-a3b:free",
+     ]
+
+     PANEL_MODEL_LABELS: dict[str, str] = {
+         "arcee-ai/trinity-large-preview:free": "Trinity Large",
+         "stepfun/step-3.5-flash:free": "StepFun Flash",
+         "nvidia/nemotron-3-nano-30b-a3b:free": "Nemotron Nano",
+     }
+
+     # ── Judge LLM ─────────────────────────────────────────────────────────────
+     JUDGE_MODEL: str = "qwen/qwen3-30b-a3b:free"
+     JUDGE_LABEL: str = "Qwen3 Judge"
+
+     # ── Generation params ─────────────────────────────────────────────────────
+     PANEL_TEMPERATURE: float = 0.3
+     JUDGE_TEMPERATURE: float = 0.1
+     PANEL_MAX_TOKENS: int = 2048
+     JUDGE_MAX_TOKENS: int = 4096
+
+
+ settings = Settings()
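A quick way to confirm the settings load — a minimal sketch, not part of the commit; it assumes a `.env` file at the project root containing `OPENROUTER_API_KEY=<your key>`:

```python
# smoke_test_config.py — sketch only, assumes the app package is importable
from app.config import settings

print(settings.PANEL_MODELS)              # the three panel model ids
print(settings.JUDGE_MODEL)               # "qwen/qwen3-30b-a3b:free"
print(bool(settings.OPENROUTER_API_KEY))  # True once the key is set
```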
app/llm_chain.py ADDED
@@ -0,0 +1,198 @@
+ """
+ llm_chain.py — LangChain LCEL pipeline for multi-LLM code debugging.
+
+ Flow:
+     1. Build one LCEL chain per panel model (PANEL_PROMPT | llm | StrOutputParser)
+     2. Run all 3 chains concurrently with asyncio.gather (ainvoke)
+     3. Feed all results into the judge chain (JUDGE_PROMPT | llm | StrOutputParser)
+     4. Parse judge output into {reasoning, final_answer}
+     5. Return structured dict consumed by the FastAPI endpoint
+ """
+ import asyncio
+ import time
+ from typing import Any
+
+ from langchain_core.output_parsers import StrOutputParser
+
+ from app.config import settings
+ from app.llm_factory import make_panel_llm, make_judge_llm
+ from app.prompts import PANEL_PROMPT, JUDGE_PROMPT
+
+
+ # ── Helpers ───────────────────────────────────────────────────────────────────
+
+ def _split_judge_output(raw: str) -> tuple[str, str]:
+     """
+     Split judge output on the markdown headers we asked for.
+     Returns (reasoning, final_answer).
+     """
+     reasoning = ""
+     final_answer = raw.strip()
+
+     if "## ✅ Final Answer" in raw:
+         parts = raw.split("## ✅ Final Answer", 1)
+         final_answer = parts[1].strip()
+         reasoning = parts[0].replace("## 🔍 Reasoning", "").strip()
+     elif "## 🔍 Reasoning" in raw:
+         reasoning = raw.split("## 🔍 Reasoning", 1)[1].strip()
+
+     return reasoning, final_answer
+
+
+ def _error_panel(model: str, label: str, error: str, latency_ms: float) -> dict:
+     return {
+         "model": model,
+         "label": label,
+         "response": None,
+         "latency_ms": latency_ms,
+         "error": error,
+     }
+
+
+ # ── Panel chain ───────────────────────────────────────────────────────────────
+
+ def build_panel_chain(model: str, temperature: float):
+     """
+     LCEL chain for a single panel model:
+         PANEL_PROMPT | ChatOpenAI | StrOutputParser
+     Input:  {"question": str}
+     Output: str (model's raw markdown answer)
+     """
+     llm = make_panel_llm(model, temperature=temperature)
+     return PANEL_PROMPT | llm | StrOutputParser()
+
+
+ async def _run_panel(model: str, label: str, question: str, temperature: float) -> dict:
+     """
+     Invoke one panel chain and return a structured result dict.
+     Errors are caught so one failing model doesn't abort everything.
+     """
+     chain = build_panel_chain(model, temperature)
+     start = time.perf_counter()
+     try:
+         response = await chain.ainvoke({"question": question})
+         latency = round((time.perf_counter() - start) * 1000, 1)
+         return {
+             "model": model,
+             "label": label,
+             "response": response.strip(),
+             "latency_ms": latency,
+             "error": None,
+         }
+     except Exception as exc:
+         latency = round((time.perf_counter() - start) * 1000, 1)
+         return _error_panel(model, label, str(exc), latency)
+
+
+ # ── Judge chain ───────────────────────────────────────────────────────────────
+
+ def build_judge_chain():
+     """
+     LCEL chain for the Qwen3 judge:
+         JUDGE_PROMPT | ChatOpenAI | StrOutputParser
+     Input:  {"question", "label_1", "response_1", "label_2", "response_2",
+              "label_3", "response_3"}
+     Output: str (full judge markdown text)
+     """
+     llm = make_judge_llm()
+     return JUDGE_PROMPT | llm | StrOutputParser()
+
+
+ async def _run_judge(question: str, panel_results: list[dict]) -> dict:
+     """
+     Build judge inputs from panel results, invoke the judge chain,
+     and parse the structured output.
+     """
+     chain = build_judge_chain()
+     start = time.perf_counter()
+
+     # Pad missing responses so the prompt always has 3 slots
+     padded = list(panel_results)
+     while len(padded) < 3:
+         padded.append({"label": "N/A", "response": "[No response — model failed]"})
+
+     def _content(r: dict) -> str:
+         if r.get("response"):
+             return r["response"]
+         return f"[Error: {r.get('error', 'unknown')}]"
+
+     judge_input = {
+         "question": question,
+         "label_1": padded[0]["label"],
+         "response_1": _content(padded[0]),
+         "label_2": padded[1]["label"],
+         "response_2": _content(padded[1]),
+         "label_3": padded[2]["label"],
+         "response_3": _content(padded[2]),
+     }
+
+     try:
+         raw = await chain.ainvoke(judge_input)
+         latency = round((time.perf_counter() - start) * 1000, 1)
+         reasoning, final_answer = _split_judge_output(raw)
+         return {
+             "model": settings.JUDGE_MODEL,
+             "label": settings.JUDGE_LABEL,
+             "reasoning": reasoning,
+             "final_answer": final_answer,
+             "full_response": raw.strip(),
+             "latency_ms": latency,
+             "error": None,
+         }
+     except Exception as exc:
+         latency = round((time.perf_counter() - start) * 1000, 1)
+         return {
+             "model": settings.JUDGE_MODEL,
+             "label": settings.JUDGE_LABEL,
+             "reasoning": "",
+             "final_answer": None,
+             "full_response": None,
+             "latency_ms": latency,
+             "error": str(exc),
+         }
+
+
+ # ── Main entry point ──────────────────────────────────────────────────────────
+
+ async def debug_query(question: str, temperature: float = 0.3) -> dict[str, Any]:
+     """
+     Full multi-LLM pipeline:
+         Phase 1 — query 3 panel models concurrently (asyncio.gather)
+         Phase 2 — send all results to the Qwen3 judge
+
+     Returns:
+         {
+             "question": str,
+             "panel": list[dict],   # one dict per model
+             "judge": dict,         # reasoning + final_answer
+             "total_ms": float,
+         }
+     """
+     total_start = time.perf_counter()
+
+     # ── Phase 1: parallel panel calls ────────────────────────────────────────
+     panel_tasks = [
+         _run_panel(
+             model=model,
+             label=settings.PANEL_MODEL_LABELS.get(model, model.split("/")[-1]),
+             question=question,
+             temperature=temperature,
+         )
+         for model in settings.PANEL_MODELS
+     ]
+     panel_results: list[dict] = list(await asyncio.gather(*panel_tasks))
+
+     # ── Phase 2: judge ────────────────────────────────────────────────────────
+     judge_result = await _run_judge(question, panel_results)
+
+     total_ms = round((time.perf_counter() - total_start) * 1000, 1)
+
+     return {
+         "question": question,
+         "panel": panel_results,
+         "judge": judge_result,
+         "total_ms": total_ms,
+     }
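For reference, a minimal sketch of driving the pipeline from a standalone script — not part of the commit; it assumes the `app` package is importable and OPENROUTER_API_KEY is set:

```python
# run_once.py — usage sketch only
import asyncio

from app.llm_chain import debug_query


async def main() -> None:
    result = await debug_query("Why does `list.append` return None in Python?")
    for p in result["panel"]:
        # each panel dict carries either an error string or a latency
        print(p["label"], "→", p["error"] or f"{p['latency_ms']}ms")
    print(result["judge"]["final_answer"])


if __name__ == "__main__":
    asyncio.run(main())
```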
app/llm_factory.py ADDED
@@ -0,0 +1,54 @@
+ """
+ llm_factory.py — Creates LangChain ChatOpenAI instances pointing at OpenRouter.
+ All LLMs in this project are built through this single factory so config
+ stays in one place.
+ """
+ from langchain_openai import ChatOpenAI
+
+ from app.config import settings
+
+
+ def make_llm(
+     model: str,
+     temperature: float | None = None,
+     max_tokens: int | None = None,
+ ) -> ChatOpenAI:
+     """
+     Return a LangChain ChatOpenAI client configured for OpenRouter.
+
+     Args:
+         model:       OpenRouter model string, e.g. "arcee-ai/trinity-large-preview:free"
+         temperature: Override; defaults to settings.PANEL_TEMPERATURE
+         max_tokens:  Override; defaults to settings.PANEL_MAX_TOKENS
+     """
+     if not settings.OPENROUTER_API_KEY:
+         raise ValueError(
+             "OPENROUTER_API_KEY is not set. "
+             "Add it to your .env file or HF Space Secrets."
+         )
+
+     return ChatOpenAI(
+         model=model,
+         openai_api_key=settings.OPENROUTER_API_KEY,
+         openai_api_base=settings.BASE_URL,
+         temperature=temperature if temperature is not None else settings.PANEL_TEMPERATURE,
+         max_tokens=max_tokens if max_tokens is not None else settings.PANEL_MAX_TOKENS,
+         default_headers={
+             "HTTP-Referer": "https://codedebug.local",
+             "X-Title": "CodeDebug Multi-LLM Assistant",
+         },
+     )
+
+
+ def make_panel_llm(model: str, temperature: float | None = None) -> ChatOpenAI:
+     """Panel LLM with the panel token budget."""
+     return make_llm(model, temperature=temperature, max_tokens=settings.PANEL_MAX_TOKENS)
+
+
+ def make_judge_llm() -> ChatOpenAI:
+     """Judge LLM with a larger token budget and near-zero temperature."""
+     return make_llm(
+         settings.JUDGE_MODEL,
+         temperature=settings.JUDGE_TEMPERATURE,
+         max_tokens=settings.JUDGE_MAX_TOKENS,
+     )
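A minimal sketch of what the factory hands back — not part of the commit, and it assumes a valid API key plus one of the configured panel model ids:

```python
# factory_demo.py — usage sketch only
from app.llm_factory import make_panel_llm

llm = make_panel_llm("arcee-ai/trinity-large-preview:free", temperature=0.0)

# ChatOpenAI is a LangChain Runnable, so it can be piped into LCEL chains
# (PANEL_PROMPT | llm | StrOutputParser()) or invoked directly:
print(llm.invoke("Say hi in one word.").content)
```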
app/main.py ADDED
@@ -0,0 +1,149 @@
+ """
+ main.py — FastAPI app for the CodeDebug Multi-LLM Debugger.
+
+ Endpoints:
+     GET  /              → serves frontend/index.html
+     GET  /health        → health check
+     GET  /api/v1/models → list configured models
+     POST /api/v1/debug  → main pipeline (3 panel LLMs → Qwen3 judge)
+ """
+ from contextlib import asynccontextmanager
+ from pathlib import Path
+
+ from fastapi import FastAPI, HTTPException, Request
+ from fastapi.exceptions import RequestValidationError
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import FileResponse, JSONResponse
+ from pydantic import BaseModel, Field, field_validator
+
+ from app.config import settings
+ from app.llm_chain import debug_query
+
+
+ # ── Lifespan ──────────────────────────────────────────────────────────────────
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     print("⚡ Starting CodeDebug Multi-LLM Debugger...")
+     print(f"   Panel models : {settings.PANEL_MODELS}")
+     print(f"   Judge model  : {settings.JUDGE_MODEL}")
+     if not settings.OPENROUTER_API_KEY:
+         print("⚠️  WARNING: OPENROUTER_API_KEY is not set — requests will fail!")
+     else:
+         print("✅ API key loaded. Ready!")
+     yield
+     print("🛑 Shutting down")
+
+
+ # ── App ───────────────────────────────────────────────────────────────────────
+ app = FastAPI(
+     title="CodeDebug Multi-LLM Debugger",
+     version="2.0.0",
+     description="3 panel LLMs in parallel → Qwen3 judge synthesizes the final answer.",
+     lifespan=lifespan,
+ )
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+
+ # ── Validation error handler ──────────────────────────────────────────────────
+ @app.exception_handler(RequestValidationError)
+ async def validation_error_handler(request: Request, exc: RequestValidationError):
+     print(f"❌ Validation error: {exc.errors()}")
+     return JSONResponse(
+         status_code=422,
+         content={"error": str(exc.errors()), "detail": exc.errors()},
+     )
+
+
+ # ── Request schema ────────────────────────────────────────────────────────────
+ class DebugRequest(BaseModel):
+     question: str = Field(..., min_length=1, description="The debugging question or code snippet")
+     temperature: float = Field(default=0.3, ge=0.0, le=1.0, description="Sampling temperature for panel models")
+
+     @field_validator("temperature", mode="before")
+     @classmethod
+     def coerce_float(cls, v):
+         try:
+             return float(v)
+         except Exception:
+             return 0.3
+
+     @field_validator("question")
+     @classmethod
+     def strip_question(cls, v):
+         v = v.strip()
+         if not v:
+             raise ValueError("question must not be empty")
+         return v
+
+
+ # ── Frontend ──────────────────────────────────────────────────────────────────
+ FRONTEND = Path(__file__).parent.parent / "frontend"
+
+
+ @app.get("/", include_in_schema=False)
+ def serve_ui():
+     """Serve the single-page frontend."""
+     index = FRONTEND / "index.html"
+     if index.exists():
+         return FileResponse(str(index))
+     return JSONResponse(
+         status_code=200,
+         content={"message": "CodeDebug API running. See /docs for endpoints."},
+     )
+
+
+ # ── System endpoints ──────────────────────────────────────────────────────────
+ @app.get("/health", tags=["system"])
+ def health():
+     """Health check."""
+     return {
+         "status": "ok",
+         "version": "2.0.0",
+         "api_key_set": bool(settings.OPENROUTER_API_KEY),
+     }
+
+
+ @app.get("/api/v1/models", tags=["system"])
+ def list_models():
+     """List all configured models."""
+     return {
+         "panel_models": [
+             {"model": m, "label": settings.PANEL_MODEL_LABELS.get(m, m)}
+             for m in settings.PANEL_MODELS
+         ],
+         "judge_model": {
+             "model": settings.JUDGE_MODEL,
+             "label": settings.JUDGE_LABEL,
+         },
+     }
+
+
+ # ── Debug endpoint ────────────────────────────────────────────────────────────
+ @app.post("/api/v1/debug", tags=["debug"])
+ async def debug(body: DebugRequest):
+     """
+     Main endpoint.
+
+     Sends `question` to 3 panel LLMs in parallel (LangChain LCEL chains),
+     then forwards all responses to the Qwen3 judge, which returns:
+       - reasoning   : the judge's analysis of the three answers
+       - final_answer: the synthesized definitive answer
+     """
+     if not settings.OPENROUTER_API_KEY:
+         raise HTTPException(
+             status_code=503,
+             detail="OPENROUTER_API_KEY is not configured on the server.",
+         )
+
+     print(f"📨 '{body.question[:70]}...' | temp={body.temperature}")
+
+     result = await debug_query(body.question, body.temperature)
+
+     successful = sum(1 for p in result["panel"] if not p.get("error"))
+     judge_status = "ok" if not result["judge"].get("error") else "failed"
+     print(f"✅ {successful}/3 panel responses · judge {judge_status} · {result['total_ms']}ms")
+
+     return result
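For reference, a minimal client-side sketch of calling the endpoint — not part of the commit; it assumes the app is running locally (e.g. `uvicorn app.main:app --port 8000`) and that `requests` is installed:

```python
# client_demo.py — usage sketch only
import requests

resp = requests.post(
    "http://localhost:8000/api/v1/debug",
    json={"question": "Why does a mutable default argument misbehave in Python?"},
    timeout=120,  # three panel calls plus a judge call can be slow on free-tier models
)
resp.raise_for_status()
data = resp.json()

for panel in data["panel"]:
    print(panel["label"], "ok" if not panel["error"] else f"error: {panel['error']}")
print(data["judge"]["final_answer"])
```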
app/prompts.py ADDED
@@ -0,0 +1,73 @@
+ """
+ prompts.py — All LangChain ChatPromptTemplates in one place.
+ Import from here; never hardcode prompts elsewhere.
+ """
+ from langchain_core.prompts import ChatPromptTemplate
+
+
+ # ── Panel: Code Debugging Prompt ──────────────────────────────────────────────
+ PANEL_PROMPT = ChatPromptTemplate.from_messages([
+     (
+         "system",
+         """You are an expert software debugger and code assistant.
+
+ When analyzing code or debugging questions:
+ - Identify the exact bug, error, or issue
+ - Explain the ROOT CAUSE clearly
+ - Provide a CORRECTED solution with working code examples
+ - If multiple issues exist, list all of them
+ - Be concise, precise, and professional
+
+ Use markdown with fenced code blocks (```language) for all code.""",
+     ),
+     (
+         "human",
+         "{question}",
+     ),
+ ])
+
+
+ # ── Judge: Synthesis Prompt ────────────────────────────────────────────────────
+ JUDGE_PROMPT = ChatPromptTemplate.from_messages([
+     (
+         "system",
+         """You are a master software engineering judge evaluating responses from multiple AI models.
+
+ Your job:
+ 1. Critically analyze each model's response for correctness, completeness, and clarity
+ 2. Identify what each model got RIGHT and WRONG
+ 3. Synthesize the single best possible answer, combining the strongest elements
+
+ Always structure your response EXACTLY like this (keep the exact headers):
+
+ ## 🔍 Reasoning
+ [Your critical analysis: what each model got right/wrong, which answer was most accurate, gaps you noticed]
+
+ ## ✅ Final Answer
+ [The definitive synthesized answer with correct code examples in markdown fenced blocks]""",
+     ),
+     (
+         "human",
+         """USER'S DEBUGGING QUESTION:
+ {question}
+
+ ---
+
+ ### Response 1 — {label_1}
+ {response_1}
+
+ ---
+
+ ### Response 2 — {label_2}
+ {response_2}
+
+ ---
+
+ ### Response 3 — {label_3}
+ {response_3}
+
+ ---
+
+ Now analyze all three responses and provide your reasoning and the definitive final answer.""",
+     ),
+ ])
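A minimal sketch of how these templates render, using only `langchain_core` (no API key needed) — not part of the commit:

```python
# prompt_demo.py — usage sketch only
from app.prompts import PANEL_PROMPT

# format_messages substitutes the template variables and returns message objects
messages = PANEL_PROMPT.format_messages(question="Why is my loop off by one?")
for m in messages:
    print(m.type, "→", m.content[:60])
# system → You are an expert software debugger and code assistant...
# human  → Why is my loop off by one?
```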