Spaces:

ar9av
/

sql-agent-openenv

Sleeping

App Files Files Community

ar9avg committed 8 days ago

Commit

3c665d2

1 Parent(s): d796343

Initial submission: SQL Agent OpenEnv for Meta+HF hackathon

Browse files

Files changed (49) hide show

.gitignore +8 -0
Dockerfile +58 -0
README.md +13 -0
backend/api/__init__.py +0 -0
backend/api/demo.py +495 -0
backend/api/openenv.py +138 -0
backend/data/.gitkeep +0 -0
backend/data/benchmark.db +0 -0
backend/env/__init__.py +0 -0
backend/env/database.py +430 -0
backend/env/sql_env.py +594 -0
backend/env/tasks.py +345 -0
backend/gepa/__init__.py +0 -0
backend/gepa/optimizer.py +347 -0
backend/main.py +104 -0
backend/requirements.txt +9 -0
backend/rl/__init__.py +0 -0
backend/rl/environment.py +266 -0
backend/rl/error_classifier.py +98 -0
backend/rl/experience.py +208 -0
backend/rl/grader.py +99 -0
backend/rl/linucb.py +190 -0
backend/rl/repair_strategies.py +219 -0
backend/rl/types.py +161 -0
frontend/index.html +14 -0
frontend/package-lock.json +0 -0
frontend/package.json +30 -0
frontend/postcss.config.js +6 -0
frontend/src/App.tsx +179 -0
frontend/src/components/BenchmarkPanel.tsx +384 -0
frontend/src/components/ChatPanel.tsx +599 -0
frontend/src/components/ERDiagram.tsx +234 -0
frontend/src/components/Header.tsx +110 -0
frontend/src/components/LeftSidebar.tsx +157 -0
frontend/src/components/PerformanceGraph.tsx +175 -0
frontend/src/components/PromptEvolution.tsx +148 -0
frontend/src/components/ResultsTable.tsx +78 -0
frontend/src/components/RightSidebar.tsx +27 -0
frontend/src/index.css +187 -0
frontend/src/lib/api.ts +97 -0
frontend/src/lib/types.ts +131 -0
frontend/src/main.tsx +19 -0
frontend/src/store/useStore.ts +175 -0
frontend/src/vite-env.d.ts +9 -0
frontend/tailwind.config.js +20 -0
frontend/tsconfig.json +24 -0
frontend/vite.config.ts +28 -0
inference.py +230 -0
openenv.yaml +137 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,8 @@

+.venv/
+__pycache__/
+*.pyc
+backend/data/rl_weights.json
+backend/data/rl_experiences.json
+backend/data/gepa_prompt.json
+node_modules/
+frontend/dist/

Dockerfile ADDED Viewed

	@@ -0,0 +1,58 @@

+# SQL Agent OpenEnv — Docker build for Hugging Face Spaces
+#
+# Stage 1: Build React frontend
+# Stage 2: Python FastAPI app serving both the API and static UI
+#
+# HF Spaces expects the app to listen on port 7860.
+# ── Stage 1: Frontend build ───────────────────────────────────────────────────
+# Throwaway Node image: only the build output in /app/frontend/dist is copied
+# out of this stage (via COPY --from=frontend-builder).
+FROM node:20-slim AS frontend-builder
+WORKDIR /app/frontend
+# Install deps first (layer cache)
+# NOTE(review): the trailing `*` makes the lockfile optional for COPY, but
+# `npm ci` requires a lockfile — confirm package-lock.json is always committed.
+COPY frontend/package.json frontend/package-lock.json* ./
+RUN npm ci --prefer-offline --no-audit
+# Build the app
+COPY frontend/ ./
+RUN npm run build
+# ── Stage 2: Python runtime ───────────────────────────────────────────────────
+FROM python:3.11-slim
+# System deps
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc \
+    && rm -rf /var/lib/apt/lists/*
+# Hugging Face Docker Spaces run the container as UID 1000. Create that user
+# up front so the files the app writes at runtime (SQLite DB, RL weights,
+# GEPA prompts under backend/data) are not owned by root and read-only.
+RUN useradd --create-home --uid 1000 appuser
+WORKDIR /app
+# Python deps (installed system-wide as root; readable by every user)
+COPY backend/requirements.txt ./requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy backend source
+COPY --chown=appuser:appuser backend/ ./backend/
+# Copy built frontend
+COPY --from=frontend-builder /app/frontend/dist ./frontend/dist
+# Copy repo-root artefacts
+COPY inference.py openenv.yaml README.md ./
+# Ensure data dir exists (RL weights, GEPA prompts, SQLite DB) and is writable
+# by the runtime user even if the directory had to be created here as root.
+RUN mkdir -p ./backend/data && chown -R appuser:appuser ./backend/data
+# ── HF Spaces config ──────────────────────────────────────────────────────────
+EXPOSE 7860
+ENV PORT=7860 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1
+# Drop root before starting the server
+USER appuser
+# Run from backend/ so relative imports and data/ paths resolve correctly
+WORKDIR /app/backend
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]

README.md CHANGED Viewed

@@ -1,4 +1,17 @@
 ---
 title: Sql Agent Openenv
 emoji: 🏢
 colorFrom: yellow

 ---
+title: SQL Agent OpenEnv
+emoji: 🗄️
+colorFrom: blue
+colorTo: purple
+sdk: docker
+pinned: false
+tags:
+  - openenv
+  - sql
+  - reinforcement-learning
+  - contextual-bandit
+---
+---
 title: Sql Agent Openenv
 emoji: 🏢
 colorFrom: yellow

backend/api/__init__.py ADDED Viewed

File without changes

backend/api/demo.py ADDED Viewed

	@@ -0,0 +1,495 @@

+"""
+Demo API routes — streaming SSE endpoints matching the original TypeScript API.
+Routes:
+  GET  /api/init
+  POST /api/execute-query   (SSE)
+  POST /api/benchmark       (SSE)
+  GET  /api/rl-state
+  GET  /api/schema-graph
+  POST /api/feedback
+"""
+from __future__ import annotations
+import asyncio
+import json
+import time
+from typing import AsyncIterator, Optional
+from fastapi import APIRouter
+from pydantic import BaseModel
+from sse_starlette.sse import EventSourceResponse
+from env.database import (
+    ensure_seeded,
+    get_table_stats,
+    get_schema_info,
+    get_schema_graph,
+    execute_query,
+)
+from env.tasks import TASKS, get_task
+from env.sql_env import SQLAgentEnv, Action, get_env, BASE_SYSTEM_PROMPT, _clean_sql
+from rl.environment import get_bandit_state
+from rl.types import RepairAction, REPAIR_ACTION_NAMES, REPAIR_ACTION_BY_NAME
+from rl.error_classifier import classify_error, extract_offending_token
+from rl.grader import GraderInput, compute_reward, compute_episode_reward
+from rl.types import RLState, EpisodeStep, featurize, ERROR_CLASS_NAMES
+from gepa.optimizer import get_gepa, QueryResult
+router = APIRouter()
+# ─── /api/init ────────────────────────────────────────────────────
+@router.get("/init")
+async def init_db():
+    seeded = ensure_seeded()
+    tables = get_table_stats()
+    return {"tables": tables, "seeded": seeded}
+# ─── /api/execute-query ───────────────────────────────────────────
+# Request body for POST /execute-query.
+class ExecuteQueryRequest(BaseModel):
+    # Natural-language question the agent should answer with SQL.
+    question: str
+    # Task whose first question is used as the grading target.
+    task_id: str = "simple_queries"
+@router.post("/execute-query")
+async def execute_query_stream(req: ExecuteQueryRequest):
+    async def event_generator() -> AsyncIterator[dict]:
+        env = get_env()
+        obs = env.reset(req.task_id)
+        # Pick first question of task matching question text, or default
+        task = get_task(req.task_id)
+        question_obj = task.questions[0]
+        # Override question text
+        env._episode.question = req.question  # type: ignore[union-attr]
+        max_attempts = env.MAX_ATTEMPTS
+        done = False
+        all_step_rewards: list[float] = []
+        success = False
+        # Initial generate action
+        action = Action(repair_action="generate")
+        for attempt in range(1, max_attempts + 1):
+            yield {"data": json.dumps({"type": "attempt_start", "attempt": attempt})}
+            ep = env._episode  # type: ignore[union-attr]
+            ep.attempt_number = attempt
+            # Generate SQL with streaming
+            from env.sql_env import _make_client, _MODEL
+            from openai import AsyncOpenAI
+            if attempt == 1 or ep.current_sql is None:
+                system_prompt = BASE_SYSTEM_PROMPT
+                user_msg = (
+                    f"Schema:\n{obs.schema_info}\n\nQuestion: {req.question}\n\n"
+                    "Write a SQL query to answer this question."
+                )
+            else:
+                from rl.repair_strategies import RepairContext, get_repair_system_suffix, build_repair_user_message
+                from env.sql_env import REPAIR_ACTION_BY_NAME
+                # Bandit selects action
+                if ep.current_features is not None:
+                    repair_enum, scores = env._bandit.select_action(ep.current_features)
+                    ucb_scores = {
+                        REPAIR_ACTION_NAMES[RepairAction(i)]: round(scores[i], 4)
+                        for i in range(len(scores))
+                    }
+                    action = Action(repair_action=REPAIR_ACTION_NAMES[repair_enum])
+                    yield {"data": json.dumps({
+                        "type": "rl_action",
+                        "action": action.repair_action,
+                        "ucb_scores": ucb_scores,
+                    })}
+                else:
+                    repair_enum = RepairAction.REWRITE_FULL
+                    action = Action(repair_action="rewrite_full")
+                suffix = get_repair_system_suffix(repair_enum)
+                offending = extract_offending_token(ep.error_message or "")
+                ctx = RepairContext(
+                    schema=obs.schema_info,
+                    question=req.question,
+                    failing_sql=ep.current_sql or "",
+                    error_message=ep.error_message or "",
+                    offending_token=offending,
+                )
+                system_prompt = BASE_SYSTEM_PROMPT + suffix
+                user_msg = build_repair_user_message(repair_enum, ctx)
+            # Stream SQL generation
+            client = _make_client()
+            chunks: list[str] = []
+            try:
+                stream = await client.chat.completions.create(
+                    model=_MODEL,
+                    messages=[
+                        {"role": "system", "content": system_prompt},
+                        {"role": "user", "content": user_msg},
+                    ],
+                    stream=True,
+                    temperature=0.1,
+                )
+                async for chunk in stream:
+                    delta = chunk.choices[0].delta.content
+                    if delta:
+                        chunks.append(delta)
+                        yield {"data": json.dumps({"type": "sql_chunk", "chunk": delta})}
+            except Exception as e:
+                yield {"data": json.dumps({"type": "error", "error": str(e), "error_class": "other"})}
+                break
+            generated_sql = _clean_sql("".join(chunks))
+            yield {"data": json.dumps({"type": "sql_complete", "sql": generated_sql})}
+            yield {"data": json.dumps({"type": "executing"})}
+            rows, error = execute_query(generated_sql)
+            from env.tasks import grade_response
+            task_score = grade_response(
+                req.task_id, question_obj.id, generated_sql, rows, error, attempt
+            )
+            attempt_success = task_score >= 0.8
+            current_error_class = None
+            error_class_name = None
+            if error:
+                ec = classify_error(error)
+                current_error_class = ec
+                error_class_name = ERROR_CLASS_NAMES[ec]
+                error_changed = (
+                    ep.previous_error_class is not None
+                    and ep.previous_error_class != current_error_class
+                )
+                if ep.previous_error_class == current_error_class:
+                    ep.consecutive_same_error += 1
+                else:
+                    ep.consecutive_same_error = 1
+                rl_state = RLState(
+                    error_class=current_error_class,
+                    attempt_number=attempt,
+                    previous_action=ep.last_action,
+                    error_changed=error_changed,
+                    consecutive_same_error=ep.consecutive_same_error,
+                )
+                ep.current_rl_state = rl_state
+                ep.current_features = featurize(rl_state)
+                # Stream diagnosis chunk
+                try:
+                    diag_stream = await client.chat.completions.create(
+                        model=_MODEL,
+                        messages=[
+                            {"role": "system", "content": "You are a SQL debugger. Briefly explain the error in one sentence."},
+                            {"role": "user", "content": f"Error: {error}\nSQL: {generated_sql}"},
+                        ],
+                        stream=True,
+                        temperature=0.3,
+                    )
+                    async for chunk in diag_stream:
+                        delta = chunk.choices[0].delta.content
+                        if delta:
+                            yield {"data": json.dumps({"type": "diagnosis_chunk", "chunk": delta})}
+                except Exception:
+                    pass
+                yield {"data": json.dumps({"type": "error", "error": error, "error_class": error_class_name})}
+            # Grader + RL update
+            grader_in = GraderInput(
+                success=attempt_success,
+                attempt_number=attempt,
+                current_error_class=current_error_class,
+                previous_error_class=ep.previous_error_class,
+            )
+            grader_out = compute_reward(grader_in)
+            all_step_rewards.append(grader_out.reward)
+            if ep.current_rl_state and ep.current_features:
+                repair_enum_for_step = REPAIR_ACTION_BY_NAME.get(
+                    action.repair_action, RepairAction.REWRITE_FULL
+                )
+                step_obj = EpisodeStep(
+                    state=ep.current_rl_state,
+                    featurized=ep.current_features,
+                    action=repair_enum_for_step,
+                    reward=grader_out.reward,
+                    error_message=error or "",
+                    sql=generated_sql,
+                    success=attempt_success,
+                )
+                ep.steps.append(step_obj)
+                env._bandit.update(ep.current_features, repair_enum_for_step, grader_out.reward)
+                ep.last_action = repair_enum_for_step
+            ep.current_sql = generated_sql
+            ep.error_message = error
+            ep.error_class = error_class_name
+            ep.previous_error_class = current_error_class
+            yield {"data": json.dumps({
+                "type": "rl_reward",
+                "reward": grader_out.reward,
+                "breakdown": {
+                    "base": grader_out.breakdown.base,
+                    "attempt_penalty": grader_out.breakdown.attempt_penalty,
+                    "severity_bonus": grader_out.breakdown.severity_bonus,
+                    "change_bonus": grader_out.breakdown.change_bonus,
+                },
+            })}
+            if attempt_success:
+                success = True
+                yield {"data": json.dumps({
+                    "type": "success",
+                    "rows": rows,
+                    "row_count": len(rows),
+                    "sql": generated_sql,
+                })}
+                done = True
+                break
+        total_reward = compute_episode_reward(all_step_rewards, success)
+        yield {"data": json.dumps({
+            "type": "rl_episode_end",
+            "total_reward": total_reward,
+            "success": success,
+        })}
+        # Record GEPA history
+        gepa = get_gepa()
+        gepa.record_result(QueryResult(
+            question=req.question,
+            final_sql=env._episode.current_sql or "" if env._episode else "",  # type: ignore[union-attr]
+            attempts=len(all_step_rewards),
+            success=success,
+            errors=[s.error_message for s in (env._episode.steps if env._episode else []) if s.error_message],
+            timestamp=time.time(),
+        ))
+        # Finalize episode
+        env._finalize_episode(success=success)
+        if env._episode:
+            env._episode.done = True
+            env._episode.success = success
+        # Trigger GEPA if needed
+        if gepa.should_optimize():
+            try:
+                await gepa.run_optimization_cycle()
+            except Exception:
+                pass
+    return EventSourceResponse(event_generator())
+# ─── /api/benchmark ───────────────────────────────────────────────
+# Request body for POST /benchmark.
+class BenchmarkRequest(BaseModel):
+    # Task whose questions are all run through the agent.
+    task_id: str = "simple_queries"
+@router.post("/benchmark")
+async def run_benchmark(req: BenchmarkRequest):
+    async def event_generator() -> AsyncIterator[dict]:
+        task = get_task(req.task_id)
+        scores: list[float] = []
+        for question_obj in task.questions:
+            yield {"data": json.dumps({
+                "type": "query_start",
+                "query_id": question_obj.id,
+                "question": question_obj.question,
+            })}
+            # Run the question through the env
+            env = SQLAgentEnv()
+            obs = env.reset_with_question(req.task_id, question_obj.id)
+            attempt = 0
+            sql = ""
+            success = False
+            task_score = 0.0
+            max_attempts = env.MAX_ATTEMPTS
+            ep = env._episode  # type: ignore[union-attr]
+            gepa = get_gepa()
+            system_prompt = gepa.get_current_prompt()
+            from env.sql_env import _make_client, _MODEL
+            for attempt in range(1, max_attempts + 1):
+                ep.attempt_number = attempt
+                if attempt == 1 or ep.current_sql is None:
+                    user_msg = (
+                        f"Schema:\n{obs.schema_info}\n\n"
+                        f"Question: {question_obj.question}\n\n"
+                        "Write a SQL query to answer this question."
+                    )
+                    sys_prompt = system_prompt
+                else:
+                    from rl.repair_strategies import RepairContext, get_repair_system_suffix, build_repair_user_message
+                    if ep.current_features is not None:
+                        repair_enum, _ = env._bandit.select_action(ep.current_features)
+                    else:
+                        repair_enum = RepairAction.REWRITE_FULL
+                    suffix = get_repair_system_suffix(repair_enum)
+                    offending = extract_offending_token(ep.error_message or "")
+                    ctx = RepairContext(
+                        schema=obs.schema_info,
+                        question=question_obj.question,
+                        failing_sql=ep.current_sql or "",
+                        error_message=ep.error_message or "",
+                        offending_token=offending,
+                    )
+                    sys_prompt = system_prompt + suffix
+                    user_msg = build_repair_user_message(repair_enum, ctx)
+                client = _make_client()
+                try:
+                    resp = await client.chat.completions.create(
+                        model=_MODEL,
+                        messages=[
+                            {"role": "system", "content": sys_prompt},
+                            {"role": "user", "content": user_msg},
+                        ],
+                        temperature=0.1,
+                    )
+                    sql = _clean_sql(resp.choices[0].message.content or "")
+                except Exception as e:
+                    break
+                rows, error = execute_query(sql)
+                from env.tasks import grade_response
+                task_score = grade_response(
+                    req.task_id, question_obj.id, sql, rows, error, attempt
+                )
+                success = task_score >= 0.8
+                current_ec = None
+                if error:
+                    ec = classify_error(error)
+                    current_ec = ec
+                    error_changed = ep.previous_error_class is not None and ep.previous_error_class != ec
+                    if ep.previous_error_class == ec:
+                        ep.consecutive_same_error += 1
+                    else:
+                        ep.consecutive_same_error = 1
+                    rl_state = RLState(
+                        error_class=ec,
+                        attempt_number=attempt,
+                        previous_action=ep.last_action,
+                        error_changed=error_changed,
+                        consecutive_same_error=ep.consecutive_same_error,
+                    )
+                    ep.current_rl_state = rl_state
+                    ep.current_features = featurize(rl_state)
+                from rl.grader import GraderInput, compute_reward
+                grader_in = GraderInput(
+                    success=success,
+                    attempt_number=attempt,
+                    current_error_class=current_ec,
+                    previous_error_class=ep.previous_error_class,
+                )
+                grader_out = compute_reward(grader_in)
+                ep.current_sql = sql
+                ep.error_message = error
+                ep.error_class = ERROR_CLASS_NAMES[current_ec] if current_ec else None
+                ep.previous_error_class = current_ec
+                if success:
+                    break
+            scores.append(task_score)
+            yield {"data": json.dumps({
+                "type": "query_result",
+                "query_id": question_obj.id,
+                "success": success,
+                "score": task_score,
+                "sql": sql,
+                "attempts": attempt,
+            })}
+        overall_score = sum(scores) / len(scores) if scores else 0.0
+        yield {"data": json.dumps({
+            "type": "done",
+            "overall_score": overall_score,
+            "task_id": req.task_id,
+        })}
+    return EventSourceResponse(event_generator())
+# ─── /api/rl-state ────────────────────────────────────────────────
+@router.get("/rl-state")
+async def get_rl_state():
+    state = get_bandit_state()
+    action_names = [REPAIR_ACTION_NAMES[RepairAction(i)] for i in range(8)]
+    action_distribution = {
+        name: state["action_counts"][i]
+        for i, name in enumerate(action_names)
+    }
+    return {
+        "action_counts": state["action_counts"],
+        "alpha": state["alpha"],
+        "total_updates": state["total_updates"],
+        "action_distribution": action_distribution,
+    }
+# ─── /api/schema-graph ────────────────────────────────────────────
+@router.get("/schema-graph")
+async def schema_graph():
+    return get_schema_graph()
+# ─── /api/feedback ────────────────────────────────────────────────
+# Request body for POST /feedback.
+class FeedbackRequest(BaseModel):
+    # Question the user asked.
+    question: str
+    # SQL the user is giving feedback on.
+    sql: str
+    # True when the user accepted the query, False when they marked it wrong.
+    correct: bool
+@router.post("/feedback")
+async def submit_feedback(req: FeedbackRequest):
+    gepa = get_gepa()
+    gepa.record_result(QueryResult(
+        question=req.question,
+        final_sql=req.sql,
+        attempts=1,
+        success=req.correct,
+        errors=[] if req.correct else ["User marked as incorrect"],
+        timestamp=time.time(),
+    ))
+    result = None
+    if not req.correct and gepa.should_optimize():
+        try:
+            result = await gepa.run_optimization_cycle(
+                user_feedback_context=f"User marked query as incorrect.\nQuestion: {req.question}\nSQL: {req.sql}"
+            )
+        except Exception:
+            pass
+    return {
+        "received": True,
+        "gepa_triggered": result is not None,
+        "reflection": result.get("reflection") if result else None,
+    }

backend/api/openenv.py ADDED Viewed

	@@ -0,0 +1,138 @@

+"""
+OpenEnv spec routes.
+POST /env/reset   → Observation
+POST /env/step    → {observation: Observation, reward: RewardInfo}
+GET  /env/state   → current episode state dict
+GET  /env/tasks   → list of task metadata
+GET  /env/info    → env metadata
+"""
+from __future__ import annotations
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+from typing import Optional
+from env.sql_env import get_env, Observation, Action, RewardInfo
+from env.tasks import get_all_tasks
+router = APIRouter()
+# ─── Request Models ───────────────────────────────────────────────
+# Request body for POST /reset.
+class ResetRequest(BaseModel):
+    # Task to start an episode for.
+    task_id: str = "simple_queries"
+    # Optional specific question; when absent the env's plain reset() is used.
+    question_id: Optional[str] = None
+# Request body for POST /step; both fields are forwarded unchanged to Action.
+class StepRequest(BaseModel):
+    # Name of the action to take (defaults to the initial "generate").
+    repair_action: str = "generate"
+    # Optional SQL supplied by the caller.
+    custom_sql: Optional[str] = None
+# ─── Routes ───────────────────────────────────────────────────────
+@router.post("/reset", response_model=Observation)
+async def env_reset(req: ResetRequest):
+    """Reset the environment to start a new episode."""
+    env = get_env()
+    if req.question_id:
+        obs = env.reset_with_question(req.task_id, req.question_id)
+    else:
+        obs = env.reset(req.task_id)
+    return obs
+@router.post("/step")
+async def env_step(req: StepRequest):
+    """Execute one step in the current episode."""
+    env = get_env()
+    try:
+        action = Action(
+            repair_action=req.repair_action,
+            custom_sql=req.custom_sql,
+        )
+        obs, reward = await env.step(action)
+        return {
+            "observation": obs.model_dump(),
+            "reward": reward.model_dump(),
+        }
+    except RuntimeError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+@router.get("/state")
+async def env_state():
+    """Get the current episode state."""
+    env = get_env()
+    return env.state()
+@router.get("/tasks")
+async def list_tasks():
+    """List all available tasks with metadata."""
+    tasks = get_all_tasks()
+    return [
+        {
+            "id": t.id,
+            "name": t.name,
+            "difficulty": t.difficulty,
+            "description": t.description,
+            "question_count": len(t.questions),
+            "questions": [
+                {
+                    "id": q.id,
+                    "question": q.question,
+                    "hint_tables": q.hint_tables,
+                }
+                for q in t.questions
+            ],
+        }
+        for t in tasks
+    ]
+@router.get("/info")
+async def env_info():
+    """Return environment metadata (matches openenv.yaml spec)."""
+    return {
+        "name": "sql-agent-openenv",
+        "version": "1.0.0",
+        "description": "SQL generation and repair environment with RL-driven repair strategy selection.",
+        "action_space": {
+            "type": "discrete",
+            "actions": [
+                "generate",
+                "rewrite_full",
+                "fix_column",
+                "fix_table",
+                "add_groupby",
+                "rewrite_cte",
+                "fix_syntax",
+                "change_dialect",
+                "relax_filter",
+            ],
+        },
+        "observation_space": {
+            "type": "dict",
+            "fields": [
+                "question",
+                "schema_info",
+                "current_sql",
+                "error_message",
+                "error_class",
+                "attempt_number",
+                "max_attempts",
+                "task_id",
+                "task_difficulty",
+            ],
+        },
+        "reward_range": [-1.5, 1.5],
+        "max_steps": 5,
+        "tasks": ["simple_queries", "join_queries", "complex_queries"],
+        "rl_algorithm": "LinUCB (contextual bandit)",
+        "feature_dim": 20,
+        "num_actions": 8,
+    }

backend/data/.gitkeep ADDED Viewed

File without changes

backend/data/benchmark.db ADDED Viewed

Binary file (32.8 kB). View file

backend/env/__init__.py ADDED Viewed

File without changes

backend/env/database.py ADDED Viewed

	@@ -0,0 +1,430 @@

+"""
+SQLite database setup and schema for the benchmark marketplace.
+Tables:
+  sellers   (id, name, email, country, rating)
+  users     (id, name, email, created_at, country)
+  products  (id, name, category, price, stock_quantity, seller_id)
+  orders    (id, user_id, product_id, quantity, total_price, status, created_at)
+  reviews   (id, user_id, product_id, rating, comment, created_at)
+~50 rows per table of realistic seed data.
+"""
+from __future__ import annotations
+import os
+import sqlite3
+from pathlib import Path
+from typing import Any
+# Directory that holds the SQLite file. The DATA_DIR environment variable
+# overrides it; the default is the `data` directory two levels above this
+# file (i.e. a sibling of this module's package directory).
+_DATA_DIR = Path(os.environ.get("DATA_DIR", Path(__file__).parent.parent / "data"))
+# Full path of the benchmark SQLite database file.
+DB_PATH = _DATA_DIR / "benchmark.db"
+# ─── Schema ───────────────────────────────────────────────────────
+# DDL script for the five marketplace tables. Every statement uses
+# CREATE TABLE IF NOT EXISTS, so it can be run against an existing database.
+# References: products.seller_id → sellers.id; orders and reviews both point
+# at users.id and products.id.
+# NOTE(review): SQLite enforces REFERENCES only when PRAGMA foreign_keys is
+# ON — confirm whether the connection code enables it.
+_DDL = """
+CREATE TABLE IF NOT EXISTS sellers (
+    id          INTEGER PRIMARY KEY,
+    name        TEXT    NOT NULL,
+    email       TEXT    NOT NULL UNIQUE,
+    country     TEXT    NOT NULL,
+    rating      REAL    NOT NULL DEFAULT 4.0
+);
+CREATE TABLE IF NOT EXISTS users (
+    id          INTEGER PRIMARY KEY,
+    name        TEXT    NOT NULL,
+    email       TEXT    NOT NULL UNIQUE,
+    created_at  TEXT    NOT NULL,
+    country     TEXT    NOT NULL
+);
+CREATE TABLE IF NOT EXISTS products (
+    id             INTEGER PRIMARY KEY,
+    name           TEXT    NOT NULL,
+    category       TEXT    NOT NULL,
+    price          REAL    NOT NULL,
+    stock_quantity INTEGER NOT NULL DEFAULT 0,
+    seller_id      INTEGER NOT NULL REFERENCES sellers(id)
+);
+CREATE TABLE IF NOT EXISTS orders (
+    id          INTEGER PRIMARY KEY,
+    user_id     INTEGER NOT NULL REFERENCES users(id),
+    product_id  INTEGER NOT NULL REFERENCES products(id),
+    quantity    INTEGER NOT NULL DEFAULT 1,
+    total_price REAL    NOT NULL,
+    status      TEXT    NOT NULL DEFAULT 'pending',
+    created_at  TEXT    NOT NULL
+);
+CREATE TABLE IF NOT EXISTS reviews (
+    id          INTEGER PRIMARY KEY,
+    user_id     INTEGER NOT NULL REFERENCES users(id),
+    product_id  INTEGER NOT NULL REFERENCES products(id),
+    rating      INTEGER NOT NULL CHECK(rating BETWEEN 1 AND 5),
+    comment     TEXT,
+    created_at  TEXT    NOT NULL
+);
+"""
+# ─── Seed Data ────────────────────────────────────────────────────
+# Seed rows for `sellers`, one tuple per row in the table's column order:
+# (id, name, email, country, rating).
+_SELLERS = [
+    (1, "TechGadgets Inc", "contact@techgadgets.com", "USA", 4.8),
+    (2, "FashionHub", "info@fashionhub.co.uk", "UK", 4.5),
+    (3, "HomeDecor Pro", "sales@homedecopro.de", "Germany", 4.3),
+    (4, "SportZone", "hello@sportzone.fr", "France", 4.6),
+    (5, "BookWorld", "support@bookworld.ca", "Canada", 4.9),
+    (6, "ElectroMart", "contact@electromart.jp", "Japan", 4.7),
+    (7, "GreenGrocer", "team@greengrocer.au", "Australia", 4.4),
+    (8, "KidsToys Hub", "info@kidstoys.us", "USA", 4.2),
+    (9, "PetSupplies Co", "hello@petsupplies.nl", "Netherlands", 4.6),
+    (10, "OfficeSupply Plus", "contact@officesupply.sg", "Singapore", 4.1),
+]
+# Seed rows for `users`, one tuple per row in the table's column order:
+# (id, name, email, created_at, country). created_at is stored as a
+# YYYY-MM-DD string.
+_USERS = [
+    (1, "Alice Johnson", "alice@example.com", "2023-01-15", "USA"),
+    (2, "Bob Smith", "bob@example.com", "2023-02-10", "UK"),
+    (3, "Carol White", "carol@example.com", "2023-03-05", "Canada"),
+    (4, "David Brown", "david@example.com", "2023-03-20", "Germany"),
+    (5, "Emma Davis", "emma@example.com", "2023-04-12", "France"),
+    (6, "Frank Miller", "frank@example.com", "2023-05-01", "Australia"),
+    (7, "Grace Wilson", "grace@example.com", "2023-05-18", "Japan"),
+    (8, "Henry Taylor", "henry@example.com", "2023-06-03", "USA"),
+    (9, "Isabella Anderson", "isabella@example.com", "2023-06-25", "UK"),
+    (10, "Jack Martinez", "jack@example.com", "2023-07-09", "Spain"),
+    (11, "Karen Thomas", "karen@example.com", "2023-07-22", "Italy"),
+    (12, "Liam Jackson", "liam@example.com", "2023-08-04", "Brazil"),
+    (13, "Mia Harris", "mia@example.com", "2023-08-17", "Canada"),
+    (14, "Noah Martin", "noah@example.com", "2023-09-01", "USA"),
+    (15, "Olivia Garcia", "olivia@example.com", "2023-09-14", "Mexico"),
+    (16, "Paul Robinson", "paul@example.com", "2023-10-02", "Australia"),
+    (17, "Quinn Lewis", "quinn@example.com", "2023-10-20", "New Zealand"),
+    (18, "Rachel Walker", "rachel@example.com", "2023-11-05", "UK"),
+    (19, "Sam Hall", "sam@example.com", "2023-11-19", "USA"),
+    (20, "Tina Allen", "tina@example.com", "2023-12-01", "Germany"),
+    (21, "Umar Young", "umar@example.com", "2024-01-08", "Pakistan"),
+    (22, "Vera Hernandez", "vera@example.com", "2024-01-22", "Spain"),
+    (23, "Will King", "will@example.com", "2024-02-06", "USA"),
+    (24, "Xena Wright", "xena@example.com", "2024-02-20", "Canada"),
+    (25, "Yusuf Lopez", "yusuf@example.com", "2024-03-05", "Morocco"),
+    (26, "Zoe Hill", "zoe@example.com", "2024-03-19", "UK"),
+    (27, "Aaron Scott", "aaron@example.com", "2024-04-02", "USA"),
+    (28, "Bella Green", "bella@example.com", "2024-04-16", "Australia"),
+    (29, "Carlos Adams", "carlos@example.com", "2024-05-01", "Brazil"),
+    (30, "Diana Baker", "diana@example.com", "2024-05-15", "Canada"),
+    (31, "Ethan Gonzalez", "ethan@example.com", "2024-05-29", "USA"),
+    (32, "Fatima Nelson", "fatima@example.com", "2024-06-12", "Nigeria"),
+    (33, "George Carter", "george@example.com", "2024-06-26", "UK"),
+    (34, "Hannah Mitchell", "hannah@example.com", "2024-07-10", "Germany"),
+    (35, "Ivan Perez", "ivan@example.com", "2024-07-24", "Russia"),
+    (36, "Julia Roberts", "juliar@example.com", "2024-08-07", "USA"),
+    (37, "Kevin Turner", "kevin@example.com", "2024-08-21", "Canada"),
+    (38, "Luna Phillips", "luna@example.com", "2024-09-04", "France"),
+    (39, "Mike Campbell", "mike@example.com", "2024-09-18", "USA"),
+    (40, "Nancy Parker", "nancy@example.com", "2024-10-02", "Japan"),
+    (41, "Oscar Evans", "oscar@example.com", "2024-10-16", "UK"),
+    (42, "Penny Edwards", "penny@example.com", "2024-10-30", "Australia"),
+    (43, "Roy Collins", "roy@example.com", "2024-11-13", "USA"),
+    (44, "Sara Stewart", "sara@example.com", "2024-11-27", "Canada"),
+    (45, "Tom Morris", "tom@example.com", "2024-12-11", "UK"),
+    (46, "Uma Rogers", "uma@example.com", "2024-12-25", "India"),
+    (47, "Victor Reed", "victor@example.com", "2025-01-08", "USA"),
+    (48, "Wendy Cook", "wendy@example.com", "2025-01-22", "Germany"),
+    (49, "Xavier Morgan", "xavier@example.com", "2025-02-05", "France"),
+    (50, "Yasmin Bell", "yasmin@example.com", "2025-02-19", "UK"),
+]
# Seed rows for the `products` table. ensure_seeded() inserts each tuple
# positionally with six placeholders.
# NOTE(review): fields look like (id, name, category, unit price, stock
# quantity, seller id) — confirm the column order against _DDL.
_PRODUCTS = [
    (1, "Wireless Headphones Pro", "Electronics", 149.99, 120, 1),
    (2, "Laptop Stand Adjustable", "Electronics", 49.99, 200, 1),
    (3, "USB-C Hub 7-in-1", "Electronics", 39.99, 350, 6),
    (4, "Mechanical Keyboard RGB", "Electronics", 89.99, 85, 6),
    (5, "Webcam 4K Ultra", "Electronics", 129.99, 60, 1),
    (6, "Summer Floral Dress", "Fashion", 59.99, 180, 2),
    (7, "Men Slim Fit Chinos", "Fashion", 44.99, 220, 2),
    (8, "Leather Wallet Bifold", "Fashion", 34.99, 300, 2),
    (9, "Running Shoes Ultralight", "Fashion", 109.99, 95, 4),
    (10, "Yoga Pants High Waist", "Fashion", 54.99, 150, 4),
    (11, "Ceramic Vase Set", "Home & Garden", 79.99, 70, 3),
    (12, "Bamboo Cutting Board", "Home & Garden", 29.99, 400, 3),
    (13, "Scented Candle Collection", "Home & Garden", 24.99, 500, 3),
    (14, "Smart LED Bulb Pack", "Home & Garden", 59.99, 250, 1),
    (15, "Coffee Table Book Stand", "Home & Garden", 49.99, 130, 3),
    (16, "Protein Powder Vanilla", "Sports & Fitness", 54.99, 210, 4),
    (17, "Resistance Band Set", "Sports & Fitness", 24.99, 600, 4),
    (18, "Yoga Mat Non-Slip", "Sports & Fitness", 39.99, 300, 4),
    (19, "Tennis Racket Pro", "Sports & Fitness", 89.99, 45, 4),
    (20, "Water Bottle Insulated", "Sports & Fitness", 29.99, 450, 7),
    (21, "The Python Handbook", "Books", 29.99, 200, 5),
    (22, "Machine Learning Basics", "Books", 34.99, 175, 5),
    (23, "Data Structures Guide", "Books", 27.99, 220, 5),
    (24, "Mystery Novel Collection", "Books", 49.99, 100, 5),
    (25, "Children Story Box Set", "Books", 44.99, 130, 8),
    (26, "Dog Bed Orthopedic", "Pet Supplies", 79.99, 90, 9),
    (27, "Cat Scratching Post", "Pet Supplies", 34.99, 170, 9),
    (28, "Fish Tank Starter Kit", "Pet Supplies", 59.99, 55, 9),
    (29, "Bird Cage Deluxe", "Pet Supplies", 89.99, 35, 9),
    (30, "Pet Grooming Kit", "Pet Supplies", 39.99, 140, 9),
    (31, "LEGO City Set 600pcs", "Toys", 69.99, 80, 8),
    (32, "Remote Control Car", "Toys", 49.99, 120, 8),
    (33, "Board Game Strategy", "Toys", 34.99, 200, 8),
    (34, "Puzzle 1000 Pieces", "Toys", 24.99, 350, 8),
    (35, "Art & Craft Kit Kids", "Toys", 29.99, 280, 8),
    (36, "Office Desk Organizer", "Office", 39.99, 300, 10),
    (37, "Wireless Mouse Ergonomic", "Electronics", 59.99, 200, 6),
    (38, "Notebook Set Premium", "Office", 19.99, 600, 10),
    (39, "Sticky Notes Colorful", "Office", 9.99, 800, 10),
    (40, "Printer Paper Ream", "Office", 14.99, 500, 10),
    (41, "Smart Watch Fitness", "Electronics", 199.99, 75, 1),
    (42, "Blender High Power", "Home & Garden", 89.99, 110, 3),
    (43, "Air Purifier HEPA", "Home & Garden", 149.99, 65, 1),
    (44, "Backpack Waterproof", "Fashion", 79.99, 160, 2),
    (45, "Sunglasses Polarized", "Fashion", 69.99, 200, 2),
    (46, "Dumbbells Set 20kg", "Sports & Fitness", 79.99, 85, 4),
    (47, "Jump Rope Speed", "Sports & Fitness", 19.99, 400, 4),
    (48, "Graphic Novel Bundle", "Books", 59.99, 90, 5),
    (49, "Phone Stand Adjustable", "Electronics", 24.99, 350, 6),
    (50, "Desk Lamp LED", "Office", 44.99, 230, 10),
]
# Seed rows for the `orders` table. ensure_seeded() inserts each tuple
# positionally with seven placeholders.
# NOTE(review): fields look like (id, user id, product id, quantity, order
# total, status, order date as an ISO YYYY-MM-DD string) — confirm the column
# order against _DDL. Status values used here are "delivered", "shipped",
# "pending" and "cancelled".
_ORDERS = [
    (1, 1, 1, 1, 149.99, "delivered", "2024-01-10"),
    (2, 2, 6, 2, 119.98, "delivered", "2024-01-15"),
    (3, 3, 21, 1, 29.99, "delivered", "2024-01-20"),
    (4, 4, 11, 1, 79.99, "delivered", "2024-01-25"),
    (5, 5, 16, 2, 109.98, "delivered", "2024-02-01"),
    (6, 6, 31, 1, 69.99, "delivered", "2024-02-05"),
    (7, 7, 3, 2, 79.98, "shipped", "2024-02-10"),
    (8, 8, 41, 1, 199.99, "delivered", "2024-02-14"),
    (9, 9, 26, 1, 79.99, "delivered", "2024-02-18"),
    (10, 10, 17, 3, 74.97, "delivered", "2024-02-22"),
    (11, 11, 22, 1, 34.99, "delivered", "2024-03-01"),
    (12, 12, 7, 1, 44.99, "delivered", "2024-03-05"),
    (13, 13, 18, 2, 79.98, "delivered", "2024-03-10"),
    (14, 14, 37, 1, 59.99, "shipped", "2024-03-14"),
    (15, 15, 44, 1, 79.99, "delivered", "2024-03-18"),
    (16, 16, 2, 1, 49.99, "delivered", "2024-03-22"),
    (17, 17, 50, 1, 44.99, "pending", "2024-03-26"),
    (18, 18, 5, 1, 129.99, "delivered", "2024-04-01"),
    (19, 19, 12, 2, 59.98, "delivered", "2024-04-05"),
    (20, 20, 33, 1, 34.99, "delivered", "2024-04-09"),
    (21, 21, 9, 1, 109.99, "delivered", "2024-04-13"),
    (22, 22, 14, 2, 119.98, "delivered", "2024-04-17"),
    (23, 23, 43, 1, 149.99, "shipped", "2024-04-21"),
    (24, 24, 25, 1, 44.99, "delivered", "2024-04-25"),
    (25, 25, 8, 2, 69.98, "delivered", "2024-04-29"),
    (26, 26, 4, 1, 89.99, "delivered", "2024-05-03"),
    (27, 27, 29, 1, 89.99, "delivered", "2024-05-07"),
    (28, 28, 20, 3, 89.97, "delivered", "2024-05-11"),
    (29, 29, 35, 2, 59.98, "delivered", "2024-05-15"),
    (30, 30, 46, 1, 79.99, "pending", "2024-05-19"),
    (31, 31, 13, 5, 124.95, "delivered", "2024-05-23"),
    (32, 32, 36, 2, 79.98, "delivered", "2024-05-27"),
    (33, 33, 48, 1, 59.99, "delivered", "2024-05-31"),
    (34, 34, 1, 1, 149.99, "delivered", "2024-06-04"),
    (35, 35, 24, 1, 49.99, "delivered", "2024-06-08"),
    (36, 36, 10, 2, 109.98, "shipped", "2024-06-12"),
    (37, 37, 42, 1, 89.99, "delivered", "2024-06-16"),
    (38, 38, 27, 1, 34.99, "delivered", "2024-06-20"),
    (39, 39, 6, 1, 59.99, "delivered", "2024-06-24"),
    (40, 40, 41, 1, 199.99, "delivered", "2024-06-28"),
    (41, 41, 19, 1, 89.99, "cancelled", "2024-07-02"),
    (42, 42, 34, 2, 49.98, "delivered", "2024-07-06"),
    (43, 43, 23, 1, 27.99, "delivered", "2024-07-10"),
    (44, 44, 47, 3, 59.97, "delivered", "2024-07-14"),
    (45, 45, 15, 1, 49.99, "delivered", "2024-07-18"),
    (46, 46, 32, 1, 49.99, "delivered", "2024-07-22"),
    (47, 47, 3, 1, 39.99, "pending", "2024-07-26"),
    (48, 48, 28, 1, 59.99, "delivered", "2024-07-30"),
    (49, 49, 39, 10, 99.90, "delivered", "2024-08-03"),
    (50, 50, 21, 2, 59.98, "delivered", "2024-08-07"),
]
# Seed rows for the `reviews` table. ensure_seeded() inserts each tuple
# positionally with six placeholders.
# NOTE(review): fields look like (id, user id, product id, rating, comment
# text, review date as an ISO YYYY-MM-DD string) — confirm the column order
# against _DDL.
_REVIEWS = [
    (1, 1, 1, 5, "Excellent headphones, crystal clear sound!", "2024-01-15"),
    (2, 2, 6, 4, "Beautiful dress, fits perfectly.", "2024-01-20"),
    (3, 3, 21, 5, "Best Python book for beginners.", "2024-01-25"),
    (4, 4, 11, 4, "Very elegant vase set.", "2024-01-30"),
    (5, 5, 16, 3, "Decent protein powder, average taste.", "2024-02-05"),
    (6, 6, 31, 5, "My kid loves this LEGO set!", "2024-02-10"),
    (7, 7, 3, 5, "Incredibly useful USB hub.", "2024-02-15"),
    (8, 8, 41, 5, "Smart watch exceeded expectations.", "2024-02-20"),
    (9, 9, 26, 4, "Dog loves the orthopedic bed.", "2024-02-25"),
    (10, 10, 17, 5, "Great resistance bands, very durable.", "2024-03-01"),
    (11, 11, 22, 4, "Solid ML intro book.", "2024-03-06"),
    (12, 12, 7, 3, "Chinos are OK, sizing runs small.", "2024-03-11"),
    (13, 13, 18, 5, "Perfect yoga mat, non-slip is great.", "2024-03-16"),
    (14, 14, 37, 4, "Smooth wireless mouse.", "2024-03-21"),
    (15, 15, 44, 5, "Waterproof backpack is amazing.", "2024-03-26"),
    (16, 16, 2, 4, "Laptop stand is sturdy and adjustable.", "2024-03-31"),
    (17, 17, 49, 3, "Decent phone stand but wobbly.", "2024-04-05"),
    (18, 18, 5, 5, "Best webcam I've ever used.", "2024-04-10"),
    (19, 19, 12, 5, "Bamboo cutting board is beautiful.", "2024-04-15"),
    (20, 20, 33, 4, "Fun strategy board game.", "2024-04-20"),
    (21, 21, 9, 5, "Running shoes are so comfortable!", "2024-04-25"),
    (22, 22, 14, 4, "Smart bulbs work well with app.", "2024-04-30"),
    (23, 23, 43, 4, "Air purifier is quiet and effective.", "2024-05-05"),
    (24, 24, 25, 5, "Beautiful story box set for kids.", "2024-05-10"),
    (25, 25, 8, 4, "Leather wallet is high quality.", "2024-05-15"),
    (26, 26, 4, 5, "Mechanical keyboard is a joy to type on.", "2024-05-20"),
    (27, 27, 29, 4, "Bird cage is spacious and well-made.", "2024-05-25"),
    (28, 28, 20, 5, "Water bottle keeps drinks cold all day.", "2024-05-30"),
    (29, 29, 35, 4, "Great art kit for kids.", "2024-06-04"),
    (30, 30, 46, 4, "Solid dumbbells, good grip.", "2024-06-09"),
    (31, 1, 13, 5, "Scented candles smell amazing.", "2024-06-14"),
    (32, 2, 36, 4, "Desk organizer keeps my workspace tidy.", "2024-06-19"),
    (33, 3, 48, 5, "Graphic novel bundle is worth every penny.", "2024-06-24"),
    (34, 4, 1, 4, "Good headphones, comfy for long sessions.", "2024-06-29"),
    (35, 5, 24, 5, "Love these mystery novels!", "2024-07-04"),
    (36, 6, 10, 4, "High waist yoga pants are flattering.", "2024-07-09"),
    (37, 7, 42, 4, "Powerful blender, handles frozen fruit.", "2024-07-14"),
    (38, 8, 27, 5, "Cat scratching post is well built.", "2024-07-19"),
    (39, 9, 6, 4, "Floral dress is as pictured.", "2024-07-24"),
    (40, 10, 41, 5, "Smart watch has excellent battery life.", "2024-07-29"),
    (41, 11, 19, 2, "Tennis racket feels cheap for the price.", "2024-08-03"),
    (42, 12, 34, 5, "Puzzle is a perfect family activity.", "2024-08-08"),
    (43, 13, 23, 5, "Data structures book is very clear.", "2024-08-13"),
    (44, 14, 47, 4, "Jump rope is fast and durable.", "2024-08-18"),
    (45, 15, 15, 3, "Book stand is okay, a bit light.", "2024-08-23"),
    (46, 16, 32, 5, "Remote control car is very fast!", "2024-08-28"),
    (47, 17, 3, 4, "USB hub works great on MacBook.", "2024-09-02"),
    (48, 18, 28, 4, "Fish tank kit is easy to set up.", "2024-09-07"),
    (49, 19, 38, 5, "Premium notebook has great paper.", "2024-09-12"),
    (50, 20, 21, 5, "Python handbook is my go-to reference.", "2024-09-17"),
]
+# ─── Public API ───────────────────────────────────────────────────
def get_db_path() -> Path:
    """Return the module-level ``DB_PATH`` that the other helpers connect to."""
    db_location = DB_PATH
    return db_location
def ensure_seeded() -> bool:
    """
    Create the database and populate seed data if not already done.

    The schema in ``_DDL`` is applied on every call. The database counts as
    seeded only when *every* table holds at least as many rows as its seed
    list; otherwise all five tables are wiped and re-populated inside a
    single transaction. Comparing against the seed lists (rather than a
    hard-coded row count on ``users`` alone) also repairs a database in which
    one of the other tables was emptied or dropped and re-created.

    Returns True if seed was needed (first run or repair), False if already
    seeded.
    """
    # Parents first: inserts run in this order, deletes in the reverse order.
    seed_plan = (
        ("sellers", _SELLERS),
        ("users", _USERS),
        ("products", _PRODUCTS),
        ("orders", _ORDERS),
        ("reviews", _REVIEWS),
    )
    _DATA_DIR.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(str(DB_PATH))
    try:
        conn.executescript(_DDL)
        conn.commit()

        already_seeded = all(
            conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0] >= len(rows)
            for table, rows in seed_plan
        )
        if already_seeded:
            return False

        for table, _rows in reversed(seed_plan):
            conn.execute(f"DELETE FROM {table}")
        for table, rows in seed_plan:
            if not rows:
                continue
            placeholders = ",".join("?" * len(rows[0]))
            conn.executemany(
                f"INSERT OR REPLACE INTO {table} VALUES ({placeholders})", rows
            )
        # One commit for the whole wipe-and-reload: if anything above raises,
        # the connection is closed uncommitted and the old contents survive.
        conn.commit()
        return True
    finally:
        conn.close()
def get_schema_info() -> str:
    """
    Build a short plain-text description of the schema for use in prompts.

    For each of the five tables the summary lists the current row count and
    every column as ``name TYPE``, with ``(PK)`` appended to primary-key
    columns. Table sections are separated by a blank line.
    """
    table_names = ("sellers", "users", "products", "orders", "reviews")
    conn = sqlite3.connect(str(DB_PATH))
    try:
        sections = []
        for name in table_names:
            column_parts = []
            # PRAGMA table_info rows: index 1 = name, 2 = declared type, 5 = pk flag.
            for column in conn.execute(f"PRAGMA table_info({name})").fetchall():
                pk_marker = "(PK)" if column[5] else ""
                column_parts.append(f"{column[1]} {column[2]}{pk_marker}")
            columns_text = ", ".join(column_parts)
            (row_total,) = conn.execute(f"SELECT COUNT(*) FROM {name}").fetchone()
            sections.append(
                f"Table: {name} ({row_total} rows)\n  Columns: {columns_text}"
            )
        return "\n\n".join(sections)
    finally:
        conn.close()
def execute_query(sql: str) -> tuple[list[dict], str | None]:
    """
    Execute a SQL query and return (rows, error_message).

    The statement runs on a fresh connection that is switched to read-only
    mode first (``PRAGMA query_only = ON``). The SQL typically comes from an
    LLM or an API caller, and without this a generated ``DROP TABLE`` or
    similar would permanently damage the shared database. Write attempts now
    come back as an ordinary error string.

    Args:
        sql: A single SQL statement.

    Returns:
        ``(rows, None)`` on success, where rows is a list of dicts keyed by
        column name; ``([], message)`` when SQLite rejects the statement.
    """
    conn = sqlite3.connect(str(DB_PATH))
    conn.row_factory = sqlite3.Row
    try:
        conn.execute("PRAGMA query_only = ON")
        cursor = conn.execute(sql)
        return [dict(row) for row in cursor.fetchall()], None
    except (sqlite3.Error, sqlite3.Warning) as exc:
        # sqlite3.Warning is not a subclass of sqlite3.Error; older Python
        # versions raise it when `sql` contains more than one statement.
        return [], str(exc)
    finally:
        conn.close()
def get_table_stats() -> list[dict]:
    """Report the current row count of each table as ``{"name", "rows"}`` dicts."""
    conn = sqlite3.connect(str(DB_PATH))
    try:
        stats = []
        for table_name in ("sellers", "users", "products", "orders", "reviews"):
            (row_total,) = conn.execute(
                f"SELECT COUNT(*) FROM {table_name}"
            ).fetchone()
            stats.append({"name": table_name, "rows": row_total})
        return stats
    finally:
        conn.close()
def get_schema_graph() -> dict:
    """
    Describe the schema as a graph: tables with their columns, plus FK edges.

    Returns a dict with two keys:
      - ``"tables"``: one ``{"name", "columns"}`` entry per table, where each
        column is ``{"name", "type", "pk"}``.
      - ``"foreign_keys"``: one ``{"from_table", "from_col", "to_table",
        "to_col"}`` entry per foreign-key column, in table order.
    """
    table_names = ("sellers", "users", "products", "orders", "reviews")
    conn = sqlite3.connect(str(DB_PATH))
    try:
        table_nodes = []
        fk_edges = []
        for name in table_names:
            column_rows = conn.execute(f"PRAGMA table_info({name})").fetchall()
            table_nodes.append(
                {
                    "name": name,
                    "columns": [
                        {"name": row[1], "type": row[2], "pk": bool(row[5])}
                        for row in column_rows
                    ],
                }
            )
            # PRAGMA foreign_key_list rows: index 2 = referenced table,
            # 3 = local column, 4 = referenced column.
            for fk_row in conn.execute(f"PRAGMA foreign_key_list({name})").fetchall():
                fk_edges.append(
                    {
                        "from_table": name,
                        "from_col": fk_row[3],
                        "to_table": fk_row[2],
                        "to_col": fk_row[4],
                    }
                )
        return {"tables": table_nodes, "foreign_keys": fk_edges}
    finally:
        conn.close()

backend/env/sql_env.py ADDED Viewed

	@@ -0,0 +1,594 @@

+"""
+SQLAgentEnv — OpenEnv-compliant environment for SQL generation.
+Observation → Action → (Observation, Reward) loop.
+The step() function:
+  1. Selects a repair prompt based on action.repair_action
+  2. Calls the LLM (OpenAI-compatible) to generate/repair SQL
+  3. Executes SQL on the benchmark DB
+  4. Classifies any error
+  5. Computes reward via grader
+  6. Updates LinUCB bandit
+  7. Returns (new_observation, reward)
+Environment variables:
+  API_BASE_URL  — OpenAI-compatible base URL (default: https://api.openai.com/v1)
+  MODEL_NAME    — model to use (default: gpt-4o-mini)
+  HF_TOKEN      — bearer token / API key
+"""
+from __future__ import annotations
+import asyncio
+import os
+import re
+from typing import Optional, AsyncIterator
+from openai import AsyncOpenAI
+from pydantic import BaseModel
+from env.database import ensure_seeded, get_schema_info, execute_query
+from env.tasks import get_task, get_all_tasks, TASKS
+from rl.types import RepairAction, REPAIR_ACTION_NAMES, REPAIR_ACTION_BY_NAME
+from rl.error_classifier import classify_error, extract_offending_token
+from rl.grader import GraderInput, compute_reward, compute_episode_reward
+from rl.linucb import LinUCB
+from rl.repair_strategies import RepairContext, get_repair_system_suffix, build_repair_user_message
+from rl.experience import record_episode
+from rl.types import RLState, EpisodeStep, featurize, ERROR_CLASS_NAMES
+# ─── OpenEnv Models ──────────────────────────────────────────────
# Snapshot of the current episode handed back to the agent by reset() and
# step(); assembled in SQLAgentEnv._build_observation.
class Observation(BaseModel):
    # Natural-language question the episode is trying to answer.
    question: str
    # Textual schema summary produced by get_schema_info().
    schema_info: str
    # Most recently executed SQL; None before the first step.
    current_sql: Optional[str] = None
    # Database error from the most recent execution; None if it ran cleanly.
    error_message: Optional[str] = None
    # Name of the classified error (from ERROR_CLASS_NAMES); None without an error.
    error_class: Optional[str] = None
    # Number of steps taken so far in this episode.
    attempt_number: int = 0
    # Step budget; the environment fills this from SQLAgentEnv.MAX_ATTEMPTS.
    max_attempts: int = 5
    # Identifier of the task the question belongs to.
    task_id: str
    # Difficulty label copied from the task definition.
    task_difficulty: str
# What the agent asks the environment to do on one step.
class Action(BaseModel):
    # "generate" requests a fresh query; any other value is looked up in
    # REPAIR_ACTION_BY_NAME, and unknown names fall back to REWRITE_FULL.
    repair_action: str  # one of 8 repair action names or "generate"
    # When set (non-empty), the LLM call is skipped and this SQL is executed.
    custom_sql: Optional[str] = None  # optional direct SQL override
# Outcome of a single step, returned by step() alongside the new Observation.
class RewardInfo(BaseModel):
    # Per-step reward computed by the grader (compute_reward).
    value: float
    # True when the task grader scored the attempt at 0.8 or higher.
    success: bool
    # True when the episode ended: success, or the attempt budget is used up.
    done: bool
    # Diagnostics filled in by step(): "task_score", "attempt", "breakdown",
    # "rows" (at most the first five), "row_count" and "sql".
    info: dict
+# ─── LLM Client ──────────────────────────────────────────────────
def _make_client() -> AsyncOpenAI:
    """
    Build an AsyncOpenAI client from environment variables.

    ``HF_TOKEN`` supplies the API key (empty string if unset) and
    ``API_BASE_URL`` the endpoint (defaults to the public OpenAI URL).
    """
    token = os.environ.get("HF_TOKEN", "")
    endpoint = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
    return AsyncOpenAI(api_key=token, base_url=endpoint)
# Model name sent with every chat-completion request. Read from MODEL_NAME
# once, at import time, so later changes to the environment are not seen.
_MODEL = os.environ.get("MODEL_NAME", "gpt-4o-mini")
# System prompt for generating a query from scratch (used by _generate_sql
# when the action is "generate" or there is no previous SQL to repair).
BASE_SYSTEM_PROMPT = """You are a SQL expert. Given a natural language question and a SQLite database schema, write a correct SQL query.
Rules:
- Output ONLY the SQL query, nothing else
- No markdown, no code fences, no explanation
- Use SQLite syntax
- Do not include semicolons at the end"""
+def _clean_sql(raw: str) -> str:
+    """Strip markdown code fences and extra whitespace."""
+    raw = raw.strip()
+    raw = re.sub(r"^```(?:sql)?\s*", "", raw, flags=re.IGNORECASE)
+    raw = re.sub(r"\s*```$", "", raw)
+    return raw.strip().rstrip(";")
async def _call_llm(
    system_prompt: str,
    user_message: str,
    stream: bool = False,
) -> AsyncIterator[str] | str:
    """
    Send one system + user exchange to the chat-completions endpoint.

    Args:
        system_prompt: Content of the system message.
        user_message: Content of the user message.
        stream: When False (default) the full completion text is returned.
            When True an async iterator is returned that yields the
            non-empty text deltas as they arrive; the HTTP request is only
            issued once iteration starts.

    Returns:
        The completion text (empty string if the model returned no content),
        or an async iterator of text chunks when ``stream`` is True.
    """
    client = _make_client()
    request = {
        "model": _MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        "temperature": 0.1,
    }

    if not stream:
        resp = await client.chat.completions.create(**request)
        return resp.choices[0].message.content or ""

    async def _gen():
        resp = await client.chat.completions.create(stream=True, **request)
        async for chunk in resp:
            # Some OpenAI-compatible servers emit chunks whose `choices`
            # list is empty (e.g. usage-only or filter-result chunks);
            # indexing them would raise IndexError mid-stream.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta.content
            if delta:
                yield delta

    return _gen()
+# ─── Episode State ────────────────────────────────────────────────
+class _Episode:
+    def __init__(self, task_id: str, question_id: str, question: str) -> None:
+        self.task_id = task_id
+        self.question_id = question_id
+        self.question = question
+        self.attempt_number = 0
+        self.current_sql: Optional[str] = None
+        self.error_message: Optional[str] = None
+        self.error_class: Optional[str] = None
+        self.steps: list[EpisodeStep] = []
+        self.step_rewards: list[float] = []
+        self.previous_error_class = None
+        self.consecutive_same_error = 0
+        self.last_action: Optional[RepairAction] = None
+        self.current_rl_state: Optional[RLState] = None
+        self.current_features: Optional[list[float]] = None
+        self.done = False
+        self.success = False
+# ─── Main Environment Class ───────────────────────────────────────
+class SQLAgentEnv:
+    """
+    OpenEnv-compliant environment for SQL generation and repair.
+    One active episode at a time.
+    """
+    MAX_ATTEMPTS = 5
+    def __init__(self) -> None:
+        ensure_seeded()
+        self._bandit = LinUCB()
+        self._episode: Optional[_Episode] = None
+        self._schema_info = get_schema_info()
+    def reset(self, task_id: str = "simple_queries") -> Observation:
+        """Start a new episode, picking the first question of the task."""
+        if self._episode and self._episode.steps and not self._episode.done:
+            self._finalize_episode(success=False)
+        task = get_task(task_id)
+        question_obj = task.questions[0]
+        self._episode = _Episode(
+            task_id=task_id,
+            question_id=question_obj.id,
+            question=question_obj.question,
+        )
+        return self._build_observation()
+    def reset_with_question(
+        self, task_id: str, question_id: str
+    ) -> Observation:
+        """Start a new episode for a specific question."""
+        if self._episode and self._episode.steps and not self._episode.done:
+            self._finalize_episode(success=False)
+        task = get_task(task_id)
+        question_obj = next(
+            (q for q in task.questions if q.id == question_id), task.questions[0]
+        )
+        self._episode = _Episode(
+            task_id=task_id,
+            question_id=question_obj.id,
+            question=question_obj.question,
+        )
+        return self._build_observation()
+    async def step(self, action: Action) -> tuple[Observation, RewardInfo]:
+        """
+        Execute one step:
+          1. Generate/repair SQL via LLM
+          2. Execute SQL
+          3. Grade and reward
+          4. Update bandit
+        """
+        if self._episode is None:
+            raise RuntimeError("Call reset() before step()")
+        if self._episode.done:
+            raise RuntimeError("Episode is done. Call reset() to start a new one.")
+        ep = self._episode
+        ep.attempt_number += 1
+        # ── 1. Build prompt ──────────────────────────────────────
+        if action.custom_sql:
+            generated_sql = action.custom_sql
+        else:
+            generated_sql = await self._generate_sql(action, ep)
+        generated_sql = _clean_sql(generated_sql)
+        # ── 2. Execute SQL ───────────────────────────────────────
+        rows, error = execute_query(generated_sql)
+        success = error is None and len(rows) > 0
+        # ── 3. Grade ─────────────────────────────────────────────
+        task = get_task(ep.task_id)
+        question_obj = next(q for q in task.questions if q.id == ep.question_id)
+        from env.tasks import grade_response
+        task_score = grade_response(
+            ep.task_id, ep.question_id, generated_sql, rows, error, ep.attempt_number
+        )
+        success = task_score >= 0.8
+        # ── 4. RL state + reward ─────────────────────────────────
+        current_error_class = None
+        error_class_name = None
+        if error:
+            ec = classify_error(error)
+            current_error_class = ec
+            error_class_name = ERROR_CLASS_NAMES[ec]
+            error_changed = (
+                ep.previous_error_class is not None
+                and ep.previous_error_class != current_error_class
+            )
+            if ep.previous_error_class == current_error_class:
+                ep.consecutive_same_error += 1
+            else:
+                ep.consecutive_same_error = 1
+            rl_state = RLState(
+                error_class=current_error_class,
+                attempt_number=ep.attempt_number,
+                previous_action=ep.last_action,
+                error_changed=error_changed,
+                consecutive_same_error=ep.consecutive_same_error,
+            )
+            ep.current_rl_state = rl_state
+            ep.current_features = featurize(rl_state)
+        grader_in = GraderInput(
+            success=success,
+            attempt_number=ep.attempt_number,
+            current_error_class=current_error_class,
+            previous_error_class=ep.previous_error_class,
+        )
+        grader_out = compute_reward(grader_in)
+        if ep.current_rl_state and ep.current_features:
+            # Determine action index
+            if action.repair_action == "generate":
+                repair_action_enum = RepairAction.REWRITE_FULL
+            else:
+                repair_action_enum = REPAIR_ACTION_BY_NAME.get(
+                    action.repair_action, RepairAction.REWRITE_FULL
+                )
+            step_obj = EpisodeStep(
+                state=ep.current_rl_state,
+                featurized=ep.current_features,
+                action=repair_action_enum,
+                reward=grader_out.reward,
+                error_message=error or "",
+                sql=generated_sql,
+                success=success,
+            )
+            ep.steps.append(step_obj)
+        ep.step_rewards.append(grader_out.reward)
+        ep.current_sql = generated_sql
+        ep.error_message = error
+        ep.error_class = error_class_name
+        ep.previous_error_class = current_error_class
+        # ── 5. Done check ────────────────────────────────────────
+        done = success or ep.attempt_number >= self.MAX_ATTEMPTS
+        if done:
+            self._finalize_episode(success=success)
+            ep.done = True
+            ep.success = success
+        obs = self._build_observation()
+        reward_info = RewardInfo(
+            value=grader_out.reward,
+            success=success,
+            done=done,
+            info={
+                "task_score": task_score,
+                "attempt": ep.attempt_number,
+                "breakdown": {
+                    "base": grader_out.breakdown.base,
+                    "attempt_penalty": grader_out.breakdown.attempt_penalty,
+                    "severity_bonus": grader_out.breakdown.severity_bonus,
+                    "change_bonus": grader_out.breakdown.change_bonus,
+                },
+                "rows": rows[:5] if rows else [],
+                "row_count": len(rows),
+                "sql": generated_sql,
+            },
+        )
+        return obs, reward_info
+    async def step_streaming(
+        self, action: Action
+    ) -> AsyncIterator[dict]:
+        """
+        Step with SSE-compatible event streaming.
+        Yields dicts representing stream events.
+        """
+        if self._episode is None:
+            raise RuntimeError("Call reset() before step_streaming()")
+        ep = self._episode
+        ep.attempt_number += 1
+        yield {"type": "attempt_start", "attempt": ep.attempt_number}
+        # Generate SQL
+        if action.custom_sql:
+            generated_sql = action.custom_sql
+            yield {"type": "sql_complete", "sql": generated_sql}
+        else:
+            chunks = []
+            async for chunk in await self._generate_sql_streaming(action, ep):
+                chunks.append(chunk)
+                yield {"type": "sql_chunk", "chunk": chunk}
+            generated_sql = _clean_sql("".join(chunks))
+            yield {"type": "sql_complete", "sql": generated_sql}
+        yield {"type": "executing"}
+        rows, error = execute_query(generated_sql)
+        from env.tasks import grade_response
+        task_score = grade_response(
+            ep.task_id, ep.question_id, generated_sql, rows, error, ep.attempt_number
+        )
+        success = task_score >= 0.8
+        # RL processing
+        current_error_class = None
+        error_class_name = None
+        repair_action_enum = RepairAction.REWRITE_FULL
+        if action.repair_action != "generate":
+            repair_action_enum = REPAIR_ACTION_BY_NAME.get(
+                action.repair_action, RepairAction.REWRITE_FULL
+            )
+        if error:
+            ec = classify_error(error)
+            current_error_class = ec
+            error_class_name = ERROR_CLASS_NAMES[ec]
+            error_changed = (
+                ep.previous_error_class is not None
+                and ep.previous_error_class != current_error_class
+            )
+            if ep.previous_error_class == current_error_class:
+                ep.consecutive_same_error += 1
+            else:
+                ep.consecutive_same_error = 1
+            rl_state = RLState(
+                error_class=current_error_class,
+                attempt_number=ep.attempt_number,
+                previous_action=ep.last_action,
+                error_changed=error_changed,
+                consecutive_same_error=ep.consecutive_same_error,
+            )
+            ep.current_rl_state = rl_state
+            ep.current_features = featurize(rl_state)
+            _, scores = self._bandit.select_action(ep.current_features)
+            ucb_scores = {
+                REPAIR_ACTION_NAMES[RepairAction(i)]: round(scores[i], 4)
+                for i in range(len(scores))
+            }
+            yield {
+                "type": "rl_action",
+                "action": REPAIR_ACTION_NAMES[repair_action_enum],
+                "ucb_scores": ucb_scores,
+            }
+            yield {"type": "error", "error": error, "error_class": error_class_name}
+        grader_in = GraderInput(
+            success=success,
+            attempt_number=ep.attempt_number,
+            current_error_class=current_error_class,
+            previous_error_class=ep.previous_error_class,
+        )
+        grader_out = compute_reward(grader_in)
+        if ep.current_rl_state and ep.current_features:
+            step_obj = EpisodeStep(
+                state=ep.current_rl_state,
+                featurized=ep.current_features,
+                action=repair_action_enum,
+                reward=grader_out.reward,
+                error_message=error or "",
+                sql=generated_sql,
+                success=success,
+            )
+            ep.steps.append(step_obj)
+            self._bandit.update(ep.current_features, repair_action_enum, grader_out.reward)
+        ep.step_rewards.append(grader_out.reward)
+        ep.current_sql = generated_sql
+        ep.error_message = error
+        ep.error_class = error_class_name
+        ep.previous_error_class = current_error_class
+        yield {
+            "type": "rl_reward",
+            "reward": grader_out.reward,
+            "breakdown": {
+                "base": grader_out.breakdown.base,
+                "attempt_penalty": grader_out.breakdown.attempt_penalty,
+                "severity_bonus": grader_out.breakdown.severity_bonus,
+                "change_bonus": grader_out.breakdown.change_bonus,
+            },
+        }
+        done = success or ep.attempt_number >= self.MAX_ATTEMPTS
+        if success:
+            yield {
+                "type": "success",
+                "rows": rows,
+                "row_count": len(rows),
+                "sql": generated_sql,
+            }
+        if done:
+            total_reward = compute_episode_reward(ep.step_rewards, success)
+            self._finalize_episode(success=success)
+            ep.done = True
+            ep.success = success
+            yield {
+                "type": "rl_episode_end",
+                "total_reward": total_reward,
+                "success": success,
+            }
+    def state(self) -> dict:
+        if self._episode is None:
+            return {"active": False}
+        ep = self._episode
+        return {
+            "active": True,
+            "task_id": ep.task_id,
+            "question_id": ep.question_id,
+            "question": ep.question,
+            "attempt_number": ep.attempt_number,
+            "max_attempts": self.MAX_ATTEMPTS,
+            "current_sql": ep.current_sql,
+            "error_message": ep.error_message,
+            "error_class": ep.error_class,
+            "done": ep.done,
+            "success": ep.success,
+            "step_rewards": ep.step_rewards,
+            "total_reward": compute_episode_reward(ep.step_rewards, ep.success),
+        }
+    # ─── Private Helpers ──────────────────────────────────────────
+    def _build_observation(self) -> Observation:
+        if self._episode is None:
+            raise RuntimeError("No active episode")
+        ep = self._episode
+        task = get_task(ep.task_id)
+        return Observation(
+            question=ep.question,
+            schema_info=self._schema_info,
+            current_sql=ep.current_sql,
+            error_message=ep.error_message,
+            error_class=ep.error_class,
+            attempt_number=ep.attempt_number,
+            max_attempts=self.MAX_ATTEMPTS,
+            task_id=ep.task_id,
+            task_difficulty=task.difficulty,
+        )
+    async def _generate_sql(self, action: Action, ep: _Episode) -> str:
+        if action.repair_action == "generate" or ep.current_sql is None:
+            system = BASE_SYSTEM_PROMPT
+            user = (
+                f"Schema:\n{self._schema_info}\n\n"
+                f"Question: {ep.question}\n\n"
+                "Write a SQL query to answer this question."
+            )
+        else:
+            repair_action_enum = REPAIR_ACTION_BY_NAME.get(
+                action.repair_action, RepairAction.REWRITE_FULL
+            )
+            suffix = get_repair_system_suffix(repair_action_enum)
+            offending_token = extract_offending_token(ep.error_message or "")
+            ctx = RepairContext(
+                schema=self._schema_info,
+                question=ep.question,
+                failing_sql=ep.current_sql or "",
+                error_message=ep.error_message or "",
+                offending_token=offending_token,
+            )
+            system = BASE_SYSTEM_PROMPT + suffix
+            user = build_repair_user_message(repair_action_enum, ctx)
+        result = await _call_llm(system, user, stream=False)
+        return result  # type: ignore[return-value]
+    async def _generate_sql_streaming(
+        self, action: Action, ep: _Episode
+    ) -> AsyncIterator[str]:
+        if action.repair_action == "generate" or ep.current_sql is None:
+            system = BASE_SYSTEM_PROMPT
+            user = (
+                f"Schema:\n{self._schema_info}\n\n"
+                f"Question: {ep.question}\n\n"
+                "Write a SQL query to answer this question."
+            )
+        else:
+            repair_action_enum = REPAIR_ACTION_BY_NAME.get(
+                action.repair_action, RepairAction.REWRITE_FULL
+            )
+            suffix = get_repair_system_suffix(repair_action_enum)
+            offending_token = extract_offending_token(ep.error_message or "")
+            ctx = RepairContext(
+                schema=self._schema_info,
+                question=ep.question,
+                failing_sql=ep.current_sql or "",
+                error_message=ep.error_message or "",
+                offending_token=offending_token,
+            )
+            system = BASE_SYSTEM_PROMPT + suffix
+            user = build_repair_user_message(repair_action_enum, ctx)
+        return await _call_llm(system, user, stream=True)  # type: ignore[return-value]
+    def _finalize_episode(self, success: bool) -> None:
+        ep = self._episode
+        if ep is None or not ep.steps:
+            return
+        try:
+            episode_obj, relabeled = record_episode(ep.question, ep.steps, success)
+            for exp in relabeled:
+                self._bandit.update(exp.state, exp.action, exp.reward)
+            self._bandit.decay_alpha()
+        except Exception:
+            pass
+# ─── Singleton instance ───────────────────────────────────────────
+_env_instance: Optional[SQLAgentEnv] = None
+def get_env() -> SQLAgentEnv:
+    global _env_instance
+    if _env_instance is None:
+        _env_instance = SQLAgentEnv()
+    return _env_instance

backend/env/tasks.py ADDED Viewed

	@@ -0,0 +1,345 @@

+"""
+Task definitions for the SQL agent benchmark.
+Three difficulty tiers, each with 5 questions and a grader function.
+Grader contract: grader(question, sql, rows, error, attempts) -> float in [0.0, 1.0]
+  - question: the TaskQuestion being graded
+  - sql: the SQL text that was executed
+  - rows: list[dict] from the executed SQL (may be empty)
+  - error: str | None
+  - attempts: int (1-indexed count of attempts taken)
+"""
+from __future__ import annotations
+import re
+from dataclasses import dataclass, field
+from typing import Callable, Optional
+from env.database import execute_query
+# ─── Task Definitions ─────────────────────────────────────────────
+@dataclass
+class TaskQuestion:
+    """One benchmark question plus the result-shape expectations graders check."""
+    id: str  # question identifier, e.g. "sq-01"; matched by grade_response
+    question: str  # natural-language prompt shown to the agent
+    expected_columns: list[str]     # at least these columns should appear
+    min_rows: int                   # minimum expected rows
+    max_rows: Optional[int] = None  # None = no upper bound
+    hint_tables: list[str] = field(default_factory=list)  # tables that must be touched (not read by the graders in this module)
+@dataclass
+class Task:
+    """A difficulty tier: its questions and the grader that scores answers to them."""
+    id: str  # registry key used in TASKS / get_task
+    name: str  # human-readable title
+    difficulty: str  # "easy" | "medium" | "hard"
+    description: str  # one-line summary of the query style covered
+    questions: list[TaskQuestion]
+    grader: Callable  # grader(question, sql, rows, error, attempts) -> float
+# ─── Grader Helpers ───────────────────────────────────────────────
+def _has_required_columns(rows: list[dict], required: list[str]) -> bool:
+    if not rows:
+        return False
+    row_keys = {k.lower() for k in rows[0].keys()}
+    return all(col.lower() in row_keys for col in required)
+def _row_count_score(rows: list[dict], min_rows: int, max_rows: Optional[int]) -> float:
+    n = len(rows)
+    if n == 0:
+        return 0.0
+    if n >= min_rows:
+        if max_rows is None or n <= max_rows:
+            return 1.0
+        # Over the expected maximum — might be a missing WHERE clause
+        return 0.5
+    # Partial result
+    return 0.5 * (n / min_rows)
+# ─── Task 1: Simple Queries (Easy) ────────────────────────────────
+# Easy tier: five single-table questions. The row bounds feed _row_count_score,
+# so a result outside [min_rows, max_rows] earns at most partial credit.
+# NOTE(review): bounds are presumably tuned to the seeded benchmark data — verify
+# against env.database if the seed changes.
+_SIMPLE_QUESTIONS = [
+    TaskQuestion(
+        id="sq-01",
+        question="List all users from the USA.",
+        expected_columns=["name", "email", "country"],
+        min_rows=10,
+        max_rows=25,
+        hint_tables=["users"],
+    ),
+    TaskQuestion(
+        id="sq-02",
+        question="Show all products in the 'Electronics' category with their prices.",
+        expected_columns=["name", "price"],
+        min_rows=8,
+        max_rows=20,
+        hint_tables=["products"],
+    ),
+    TaskQuestion(
+        id="sq-03",
+        question="Find all orders with status 'delivered'.",
+        expected_columns=["id", "status"],
+        min_rows=30,
+        max_rows=50,
+        hint_tables=["orders"],
+    ),
+    TaskQuestion(
+        id="sq-04",
+        question="List all sellers and their countries.",
+        expected_columns=["name", "country"],
+        min_rows=10,
+        max_rows=10,
+        hint_tables=["sellers"],
+    ),
+    TaskQuestion(
+        id="sq-05",
+        question="Show all reviews with a rating of 5 stars.",
+        expected_columns=["rating"],
+        min_rows=15,
+        max_rows=35,
+        hint_tables=["reviews"],
+    ),
+]
+def _grade_simple(
+    question: TaskQuestion,
+    sql: str,
+    rows: list[dict],
+    error: Optional[str],
+    attempts: int,
+) -> float:
+    if error:
+        return 0.0
+    col_ok = _has_required_columns(rows, question.expected_columns)
+    row_score = _row_count_score(rows, question.min_rows, question.max_rows)
+    if col_ok and row_score == 1.0:
+        return 1.0
+    if col_ok or row_score >= 0.5:
+        return 0.5
+    return 0.0
+# Easy tier: binds the single-table questions to _grade_simple.
+_TASK_SIMPLE = Task(
+    id="simple_queries",
+    name="Simple Queries",
+    difficulty="easy",
+    description="Single-table SELECT queries with basic filters.",
+    questions=_SIMPLE_QUESTIONS,
+    grader=_grade_simple,
+)
+# ─── Task 2: Join Queries (Medium) ────────────────────────────────
+# Medium tier: five multi-table questions. _grade_join checks only the FIRST
+# entry of expected_columns, so additional entries here are informational.
+_JOIN_QUESTIONS = [
+    TaskQuestion(
+        id="jq-01",
+        question="Show the total number of orders per user, including the user's name.",
+        expected_columns=["name"],
+        min_rows=10,
+        hint_tables=["users", "orders"],
+    ),
+    TaskQuestion(
+        id="jq-02",
+        question="List products along with the name of their seller.",
+        expected_columns=["name", "name"],  # product name + seller name both called 'name'
+        min_rows=20,
+        hint_tables=["products", "sellers"],
+    ),
+    TaskQuestion(
+        id="jq-03",
+        question="Find the average rating for each product category.",
+        expected_columns=["category"],
+        min_rows=5,
+        max_rows=10,
+        hint_tables=["products", "reviews"],
+    ),
+    TaskQuestion(
+        id="jq-04",
+        question="Show the total revenue (sum of total_price) per seller.",
+        expected_columns=["name"],
+        min_rows=5,
+        hint_tables=["sellers", "products", "orders"],
+    ),
+    TaskQuestion(
+        id="jq-05",
+        question="List the top 5 most reviewed products with their review counts.",
+        expected_columns=["name"],
+        min_rows=5,
+        max_rows=5,
+        hint_tables=["products", "reviews"],
+    ),
+]
+def _grade_join(
+    question: TaskQuestion,
+    sql: str,
+    rows: list[dict],
+    error: Optional[str],
+    attempts: int,
+) -> float:
+    if error:
+        return 0.0
+    col_ok = _has_required_columns(rows, [question.expected_columns[0]])
+    row_score = _row_count_score(rows, question.min_rows, question.max_rows)
+    base = 0.0
+    if col_ok and row_score == 1.0:
+        base = 1.0
+    elif col_ok or row_score >= 0.5:
+        base = 0.5
+    # Penalize extra attempts
+    attempt_penalty = max(0.0, 0.1 * (attempts - 1))
+    return max(0.0, base - attempt_penalty)
+# Medium tier: binds the join questions to _grade_join.
+_TASK_JOIN = Task(
+    id="join_queries",
+    name="Join Queries",
+    difficulty="medium",
+    description="Multi-table JOINs with GROUP BY and aggregation.",
+    questions=_JOIN_QUESTIONS,
+    grader=_grade_join,
+)
+# ─── Task 3: Complex Queries (Hard) ───────────────────────────────
+# Hard tier: five questions asking for CTEs, window functions or nested
+# aggregation. The graders in this module inspect only result columns and row
+# counts; they do not check that the requested SQL construct was actually used.
+_COMPLEX_QUESTIONS = [
+    TaskQuestion(
+        id="cq-01",
+        question=(
+            "Find users who have placed more than 1 order, showing their name "
+            "and total number of orders, ordered by order count descending."
+        ),
+        expected_columns=["name"],
+        min_rows=1,
+        hint_tables=["users", "orders"],
+    ),
+    TaskQuestion(
+        id="cq-02",
+        question=(
+            "For each product category, show the category name, number of products, "
+            "average price, and total stock. Use a CTE."
+        ),
+        expected_columns=["category"],
+        min_rows=5,
+        max_rows=10,
+        hint_tables=["products"],
+    ),
+    TaskQuestion(
+        id="cq-03",
+        question=(
+            "Show each seller's name, their total sales revenue, and rank them "
+            "by revenue using a window function (RANK() or ROW_NUMBER())."
+        ),
+        expected_columns=["name"],
+        min_rows=5,
+        hint_tables=["sellers", "products", "orders"],
+    ),
+    TaskQuestion(
+        id="cq-04",
+        question=(
+            "Find the top-rated product in each category (highest average review rating). "
+            "Show category, product name, and average rating."
+        ),
+        expected_columns=["category", "name"],
+        min_rows=5,
+        max_rows=10,
+        hint_tables=["products", "reviews"],
+    ),
+    TaskQuestion(
+        id="cq-05",
+        question=(
+            "Calculate the month-over-month order count for 2024, showing year, "
+            "month, order_count, and a running total."
+        ),
+        expected_columns=["month"],
+        min_rows=6,
+        max_rows=12,
+        hint_tables=["orders"],
+    ),
+]
+def _grade_complex(
+    question: TaskQuestion,
+    sql: str,
+    rows: list[dict],
+    error: Optional[str],
+    attempts: int,
+) -> float:
+    if error:
+        return 0.0
+    col_ok = _has_required_columns(rows, question.expected_columns)
+    row_score = _row_count_score(rows, question.min_rows, question.max_rows)
+    if not col_ok or row_score == 0.0:
+        return 0.0
+    # Hard task base max is 0.8 unless first-attempt bonus
+    if row_score == 1.0 and col_ok:
+        base = 0.8 + (0.2 if attempts == 1 else 0.0)
+    else:
+        base = 0.4  # partial
+    # Strict attempt penalty for hard queries
+    attempt_penalty = 0.1 * (attempts - 1)
+    return max(0.0, base - attempt_penalty)
+# Hard tier: binds the complex questions to _grade_complex.
+_TASK_COMPLEX = Task(
+    id="complex_queries",
+    name="Complex Queries",
+    difficulty="hard",
+    description="CTEs, window functions, and nested aggregations.",
+    questions=_COMPLEX_QUESTIONS,
+    grader=_grade_complex,
+)
+# ─── Registry ─────────────────────────────────────────────────────
+# Lookup table from task id to Task; each key equals the corresponding Task.id.
+# Insertion order (easy, medium, hard) is the order get_all_tasks returns.
+TASKS: dict[str, Task] = {
+    "simple_queries": _TASK_SIMPLE,
+    "join_queries":   _TASK_JOIN,
+    "complex_queries": _TASK_COMPLEX,
+}
+def get_task(task_id: str) -> Task:
+    if task_id not in TASKS:
+        raise ValueError(f"Unknown task_id: {task_id!r}. Valid: {list(TASKS)}")
+    return TASKS[task_id]
+def get_all_tasks() -> list[Task]:
+    return list(TASKS.values())
+def grade_response(
+    task_id: str,
+    question_id: str,
+    sql: str,
+    rows: list[dict],
+    error: Optional[str],
+    attempts: int,
+) -> float:
+    task = get_task(task_id)
+    question = next((q for q in task.questions if q.id == question_id), None)
+    if question is None:
+        raise ValueError(f"Unknown question_id {question_id!r} in task {task_id!r}")
+    return task.grader(question, sql, rows, error, attempts)

backend/gepa/__init__.py ADDED Viewed

File without changes

backend/gepa/optimizer.py ADDED Viewed

	@@ -0,0 +1,347 @@

+"""
+GEPA (Goal-directed Evolutionary Prompt Adaptation) optimizer.
+Ported from gepa.ts. Key steps:
+  1. Reflection: LLM analyzes failure history, outputs diagnosis
+  2. Mutation: LLM rewrites system prompt based on diagnosis
+  3. Scoring: Run 3 golden queries with new prompt, compute score
+  4. Pareto front: Keep the top 3 prompts, ranked by score
+State is persisted to data/gepa_prompt.json.
+"""
+from __future__ import annotations
+import json
+import os
+import time
+from pathlib import Path
+from typing import Optional
+from openai import AsyncOpenAI
+from pydantic import BaseModel
+# Directory for persisted state; DATA_DIR overrides the default "data" folder two levels above this file.
+_DATA_DIR = Path(os.environ.get("DATA_DIR", Path(__file__).parent.parent / "data"))
+# JSON file written by GEPAOptimizer._save and read back by GEPAOptimizer._load.
+GEPA_PATH = _DATA_DIR / "gepa_prompt.json"
+# Chat model used for the reflection, mutation and scoring calls.
+_MODEL = os.environ.get("MODEL_NAME", "gpt-4o-mini")
+# Generation-0 system prompt; also the fallback when no candidate exists.
+SEED_SYSTEM_PROMPT = """You are a SQL expert. Given a natural language question and a SQLite database schema, write a correct SQL query.
+Rules:
+- Output ONLY the SQL query, nothing else
+- No markdown, no code fences, no explanation
+- Use SQLite syntax"""
+# ─── Models ──────────────────────────────────────────────────────
+class QueryResult(BaseModel):
+    """Outcome of one answered question, as recorded in the optimizer history."""
+    question: str  # natural-language question that was asked
+    final_sql: str  # last SQL produced for the question
+    attempts: int  # attempts used; > 1 marks the entry as a failure case for reflection
+    success: bool  # whether the question was ultimately answered
+    errors: list[str]  # error messages collected along the way
+    timestamp: float  # NOTE(review): presumably epoch seconds — confirm with the caller
+class Candidate(BaseModel):
+    """A system prompt together with the statistics used to rank it."""
+    system_prompt: str  # full prompt text
+    score: float  # ranking key; the highest-scoring candidate is the current prompt
+    avg_attempts: float  # attempts estimate recorded when the candidate was created
+    success_rate: float  # set to the benchmark score for evolved candidates
+    generation: int  # 0 for the seed, incremented for each evolved prompt
+    feedback: list[str]  # reflection text(s) that motivated this prompt
+# ─── LLM Helper ──────────────────────────────────────────────────
+def _make_client() -> AsyncOpenAI:
+    return AsyncOpenAI(
+        api_key=os.environ.get("HF_TOKEN", ""),
+        base_url=os.environ.get("API_BASE_URL", "https://api.openai.com/v1"),
+    )
+async def _complete(system: str, user: str) -> str:
+    client = _make_client()
+    resp = await client.chat.completions.create(
+        model=_MODEL,
+        messages=[
+            {"role": "system", "content": system},
+            {"role": "user", "content": user},
+        ],
+        temperature=0.7,
+    )
+    return resp.choices[0].message.content or ""
+# ─── Golden Queries for Scoring ──────────────────────────────────
+# Benchmark questions for scoring a candidate prompt. expected_min_rows is the
+# row count needed for full credit. GEPAOptimizer._score_prompt currently runs
+# only the first three entries.
+_GOLDEN_QUERIES = [
+    {
+        "id": "gq-01",
+        "question": "List all users from the USA.",
+        "expected_min_rows": 10,
+    },
+    {
+        "id": "gq-02",
+        "question": "Show all products in the 'Electronics' category.",
+        "expected_min_rows": 8,
+    },
+    {
+        "id": "gq-03",
+        "question": "Find the total number of orders per user.",
+        "expected_min_rows": 10,
+    },
+    {
+        "id": "gq-04",
+        "question": "Show the average rating for each product category.",
+        "expected_min_rows": 5,
+    },
+    {
+        "id": "gq-05",
+        "question": "List products along with their seller name.",
+        "expected_min_rows": 20,
+    },
+]
+# ─── Optimizer Class ──────────────────────────────────────────────
+class GEPAOptimizer:
+    def __init__(self) -> None:
+        self._history: list[QueryResult] = []
+        self._pareto_front: list[Candidate] = [
+            Candidate(
+                system_prompt=SEED_SYSTEM_PROMPT,
+                score=0.5,
+                avg_attempts=3.0,
+                success_rate=0.5,
+                generation=0,
+                feedback=[],
+            )
+        ]
+        self._load()
+    # ─── Public Interface ─────────────────────────────────────────
+    def record_result(self, result: QueryResult) -> None:
+        self._history.append(result)
+        self._save()
+    def get_current_prompt(self) -> str:
+        if not self._pareto_front:
+            return SEED_SYSTEM_PROMPT
+        return max(self._pareto_front, key=lambda c: c.score).system_prompt
+    def get_history(self) -> list[QueryResult]:
+        return list(self._history)
+    def get_pareto_front(self) -> list[Candidate]:
+        return list(self._pareto_front)
+    def set_current_prompt(self, prompt: str) -> None:
+        if self._pareto_front:
+            best = max(self._pareto_front, key=lambda c: c.score)
+            best.system_prompt = prompt
+        else:
+            self._pareto_front.append(
+                Candidate(
+                    system_prompt=prompt,
+                    score=0.5,
+                    avg_attempts=3.0,
+                    success_rate=0.5,
+                    generation=0,
+                    feedback=[],
+                )
+            )
+        self._save()
+    def should_optimize(self) -> bool:
+        return len(self._history) > 0 and len(self._history) % 4 == 0
+    def reset(self) -> None:
+        self._history.clear()
+        self._pareto_front.clear()
+        self._pareto_front.append(
+            Candidate(
+                system_prompt=SEED_SYSTEM_PROMPT,
+                score=0.5,
+                avg_attempts=3.0,
+                success_rate=0.5,
+                generation=0,
+                feedback=[],
+            )
+        )
+        self._save()
+    async def run_optimization_cycle(
+        self,
+        user_feedback_context: Optional[str] = None,
+        dialect: str = "SQLite",
+    ) -> Optional[dict]:
+        """
+        Run one GEPA cycle: reflect → mutate → score → update Pareto front.
+        Returns {new_prompt, reflection} or None if not enough data.
+        """
+        if len(self._history) < 2:
+            return None
+        recent_failures = [
+            h for h in self._history if h.attempts > 1 or not h.success
+        ][-8:]
+        if len(recent_failures) < 2:
+            return None
+        current_best = self.get_current_prompt()
+        # ── Step 1: Reflect ──────────────────────────────────────
+        failure_summary = "\n\n---\n\n".join(
+            f'Query {i+1}: "{f.question}"\n'
+            f"Attempts: {f.attempts}\n"
+            f"Errors:\n" + "\n".join(f"  - {e}" for e in f.errors) + "\n"
+            f"Final SQL: {f.final_sql}"
+            for i, f in enumerate(recent_failures)
+        )
+        user_ctx_block = (
+            f"\n\nUser conversation:\n{user_feedback_context}"
+            if user_feedback_context
+            else ""
+        )
+        reflection = await _complete(
+            f"You are an expert SQL prompt engineer analyzing why an LLM SQL agent is failing.\n"
+            f"The target database is {dialect} — all rules must use {dialect} syntax.\n"
+            "Your job: identify specific, recurring patterns in these failures and state EXACTLY "
+            "what rules or knowledge the system prompt is missing.\n"
+            "Be very specific — name the exact functions, syntax patterns, or schema reasoning gaps.\n"
+            "Output a concise diagnosis (3-5 bullet points max).",
+            f"Current system prompt:\n{current_best}\n\n"
+            f"Recent failures:\n{failure_summary}{user_ctx_block}",
+        )
+        # ── Step 2: Mutate ───────────────────────────────────────
+        current_generation = max(c.generation for c in self._pareto_front) if self._pareto_front else 0
+        new_prompt = await _complete(
+            f"You are an expert prompt engineer. Improve a system prompt for a {dialect} SQL generation agent.\n"
+            "Rules for the new prompt:\n"
+            "- Keep it concise and actionable\n"
+            f"- The target database is {dialect} — use ONLY {dialect} syntax and functions\n"
+            "- Add specific rules that address the diagnosed failure patterns\n"
+            "- Do NOT add generic fluff — every rule must be earned by a real failure\n"
+            "- Output ONLY the improved system prompt text, nothing else",
+            f"Current system prompt:\n{current_best}\n\n"
+            f"Diagnosed failure patterns:\n{reflection}\n\n"
+            "Write the improved system prompt:",
+        )
+        # ── Step 3: Score ────────────────────────────────────────
+        benchmark_score = await self._score_prompt(new_prompt)
+        current_avg_attempts = (
+            sum(h.attempts for h in self._history) / len(self._history)
+            if self._history
+            else 3.0
+        )
+        new_candidate = Candidate(
+            system_prompt=new_prompt,
+            score=benchmark_score,
+            avg_attempts=max(current_avg_attempts - 0.5, 1.0),
+            success_rate=benchmark_score,
+            generation=current_generation + 1,
+            feedback=[reflection],
+        )
+        # ── Step 4: Update Pareto front ──────────────────────────
+        self._pareto_front.append(new_candidate)
+        self._pareto_front.sort(key=lambda c: c.score, reverse=True)
+        if len(self._pareto_front) > 3:
+            self._pareto_front = self._pareto_front[:3]
+        self._save()
+        return {"new_prompt": new_prompt, "reflection": reflection}
+    async def _score_prompt(self, prompt: str) -> float:
+        """
+        Score a prompt by running 3 golden queries and measuring success rate.
+        """
+        from env.database import execute_query, get_schema_info
+        import re
+        schema = get_schema_info()
+        client = _make_client()
+        scores = []
+        for gq in _GOLDEN_QUERIES[:3]:
+            try:
+                resp = await client.chat.completions.create(
+                    model=_MODEL,
+                    messages=[
+                        {"role": "system", "content": prompt},
+                        {
+                            "role": "user",
+                            "content": (
+                                f"Schema:\n{schema}\n\n"
+                                f"Question: {gq['question']}\n\n"
+                                "Write a SQL query."
+                            ),
+                        },
+                    ],
+                    temperature=0.1,
+                )
+                sql = resp.choices[0].message.content or ""
+                sql = re.sub(r"^```(?:sql)?\s*", "", sql.strip(), flags=re.IGNORECASE)
+                sql = re.sub(r"\s*```$", "", sql).strip().rstrip(";")
+                rows, error = execute_query(sql)
+                if error is None and len(rows) >= gq["expected_min_rows"]:
+                    scores.append(1.0)
+                elif error is None and rows:
+                    scores.append(0.5)
+                else:
+                    scores.append(0.0)
+            except Exception:
+                scores.append(0.0)
+        return sum(scores) / len(scores) if scores else 0.3
+    # ─── Persistence ─────────────────────────────────────────────
+    def _save(self) -> None:
+        try:
+            GEPA_PATH.parent.mkdir(parents=True, exist_ok=True)
+            data = {
+                "history": [r.model_dump() for r in self._history[-100:]],
+                "pareto_front": [c.model_dump() for c in self._pareto_front],
+            }
+            GEPA_PATH.write_text(json.dumps(data, default=str))
+        except Exception:
+            pass
+    def _load(self) -> None:
+        try:
+            if not GEPA_PATH.exists():
+                return
+            data = json.loads(GEPA_PATH.read_text())
+            self._history = [QueryResult(**r) for r in data.get("history", [])]
+            loaded_front = [Candidate(**c) for c in data.get("pareto_front", [])]
+            if loaded_front:
+                self._pareto_front = loaded_front
+        except Exception:
+            pass
+# ─── Singleton ────────────────────────────────────────────────────
+_gepa_instance: Optional[GEPAOptimizer] = None
+def get_gepa() -> GEPAOptimizer:
+    global _gepa_instance
+    if _gepa_instance is None:
+        _gepa_instance = GEPAOptimizer()
+    return _gepa_instance

backend/main.py ADDED Viewed

	@@ -0,0 +1,104 @@

+"""
+SQL Agent OpenEnv — FastAPI entry point.
+Start with:
+  uvicorn main:app --reload --port 8000
+Environment variables:
+  API_BASE_URL   — OpenAI-compatible base URL
+  MODEL_NAME     — model name
+  HF_TOKEN       — API key / bearer token
+  DATA_DIR       — override data directory (default: ./data)
+"""
+from __future__ import annotations
+import os
+from pathlib import Path
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+from api.demo import router as demo_router
+from api.openenv import router as openenv_router, ResetRequest, StepRequest, env_reset, env_step, env_state
+from env.database import ensure_seeded
+# FastAPI application instance shared by every router and handler in this module.
+app = FastAPI(
+    title="SQL Agent OpenEnv",
+    description=(
+        "A SQL generation environment powered by a LinUCB contextual bandit "
+        "and GEPA prompt evolution, built for the Meta + Hugging Face OpenEnv hackathon."
+    ),
+    version="1.0.0",
+)
+# ─── CORS ────────────────────────────────────────────────────────
+# NOTE(review): any origin, method and header is accepted, with credentials
+# enabled. That is very permissive — confirm it is intended beyond the demo.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# ─── Routers ─────────────────────────────────────────────────────
+# Demo endpoints live under /api, the OpenEnv endpoints under /env.
+app.include_router(demo_router, prefix="/api", tags=["demo"])
+app.include_router(openenv_router, prefix="/env", tags=["openenv"])
+# ─── Top-level OpenEnv aliases (required by openenv validate + pre-validation) ─
+# The validator pings POST <url>/reset — these mirror /env/* without the prefix.
+@app.post("/reset", tags=["openenv"])
+async def root_reset(req: ResetRequest = None):
+    return await env_reset(req or ResetRequest())
+@app.post("/step", tags=["openenv"])
+async def root_step(req: StepRequest = None):
+    return await env_step(req or StepRequest())
+@app.get("/state", tags=["openenv"])
+async def root_state():
+    return await env_state()
+# ─── Health check ────────────────────────────────────────────────
+@app.get("/health", tags=["system"])
+async def health():
+    return {"status": "ok", "service": "sql-agent-openenv"}
+# ─── Startup ─────────────────────────────────────────────────────
+@app.on_event("startup")
+async def startup_event():
+    """Seed the database on first startup."""
+    try:
+        ensure_seeded()
+    except Exception as e:
+        print(f"Warning: database seed failed: {e}")
+# ─── Static files (frontend) — mount last ─────────────────────────
+# Built frontend bundle, expected at <repo>/frontend/dist relative to this file.
+_frontend_dist = Path(__file__).parent.parent / "frontend" / "dist"
+if _frontend_dist.exists():
+    # Serve the built frontend at "/". Mounted after all API routes are registered.
+    app.mount(
+        "/",
+        StaticFiles(directory=str(_frontend_dist), html=True),
+        name="frontend",
+    )
+else:
+    # No frontend build available: expose a small JSON index at "/" instead.
+    @app.get("/", tags=["system"])
+    async def root():
+        """Return pointers to the docs, health and environment-info endpoints."""
+        return {
+            "message": "SQL Agent OpenEnv API",
+            "docs": "/docs",
+            "health": "/health",
+            "env_info": "/env/info",
+        }

backend/requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+fastapi>=0.115.0
+uvicorn[standard]>=0.30.0
+openai>=1.40.0
+pydantic>=2.8.0
+numpy>=1.26.0
+aiofiles>=24.0.0
+python-multipart>=0.0.9
+sse-starlette>=2.1.0
+aiosqlite>=0.20.0

backend/rl/__init__.py ADDED Viewed

File without changes

backend/rl/environment.py ADDED Viewed

	@@ -0,0 +1,266 @@

+"""
+SQLDebugEnvironment — Gym-like RL environment for the SQL debug loop.
+Lifecycle:
+  1. env.reset(question)           — start new episode
+  2. env.observe_error(error, sql) — classify error, build state
+  3. env.select_action()           — bandit picks repair strategy
+  4. env.get_repair_prompt(...)    — get specialized prompt for chosen action
+  5. env.record_step(success)      — record outcome, compute reward
+  6. Repeat 2-5 until success or max attempts
+  7. env.end_episode(success)      — finalize, HER relabeling, bandit update
+This module is a stateful singleton — one active episode at a time.
+"""
+from __future__ import annotations
+import time
+from typing import Optional
+from rl.types import (
+    RLState,
+    RepairAction,
+    ErrorClass,
+    EpisodeStep,
+    RLMetrics,
+    featurize,
+    REPAIR_ACTION_NAMES,
+    ERROR_CLASS_NAMES,
+)
+from rl.error_classifier import classify_error, extract_offending_token
+from rl.grader import GraderInput, compute_reward
+from rl.linucb import LinUCB
+from rl.experience import record_episode, get_metrics, reset_experience
+from rl.repair_strategies import (
+    RepairContext,
+    get_repair_system_suffix,
+    build_repair_user_message,
+)
+# ─── Singleton State ─────────────────────────────────────────────
+_bandit: Optional[LinUCB] = None
+class _EpisodeContext:
+    def __init__(self, question: str) -> None:
+        self.question = question
+        self.steps: list[EpisodeStep] = []
+        self.previous_error_class: Optional[ErrorClass] = None
+        self.consecutive_same_error: int = 0
+        self.last_action: Optional[RepairAction] = None
+        self.current_state: Optional[RLState] = None
+        self.current_features: Optional[list[float]] = None
+_current_episode: Optional[_EpisodeContext] = None
+def _get_bandit() -> LinUCB:
+    global _bandit
+    if _bandit is None:
+        _bandit = LinUCB()
+    return _bandit
+# ─── Environment Interface ────────────────────────────────────────
+def reset(question: str) -> None:
+    """Start a new episode. If a previous episode was active, end it as failure."""
+    global _current_episode
+    if _current_episode and _current_episode.steps:
+        end_episode(False)
+    _current_episode = _EpisodeContext(question)
+def observe_error(
+    error_message: str,
+    failing_sql: str,
+    attempt_number: int,
+) -> dict:
+    """
+    Classify the SQL execution error and build the RL state.
+    Returns a dict with keys: error_class, error_class_name, state.
+    """
+    if _current_episode is None:
+        raise RuntimeError("Call reset() before observe_error()")
+    error_class = classify_error(error_message)
+    error_changed = (
+        _current_episode.previous_error_class is not None
+        and _current_episode.previous_error_class != error_class
+    )
+    if _current_episode.previous_error_class == error_class:
+        _current_episode.consecutive_same_error += 1
+    else:
+        _current_episode.consecutive_same_error = 1
+    state = RLState(
+        error_class=error_class,
+        attempt_number=attempt_number,
+        previous_action=_current_episode.last_action,
+        error_changed=error_changed,
+        consecutive_same_error=_current_episode.consecutive_same_error,
+    )
+    _current_episode.current_state = state
+    _current_episode.current_features = featurize(state)
+    return {
+        "error_class": error_class,
+        "error_class_name": ERROR_CLASS_NAMES[error_class],
+        "state": state,
+    }
+def select_action() -> dict:
+    """
+    Ask the bandit to select a repair action based on current state.
+    Returns dict with keys: action, action_name, scores.
+    """
+    if _current_episode is None or _current_episode.current_features is None:
+        raise RuntimeError("Call observe_error() before select_action()")
+    b = _get_bandit()
+    action, scores = b.select_action(_current_episode.current_features)
+    _current_episode.last_action = action
+    return {
+        "action": action,
+        "action_name": REPAIR_ACTION_NAMES[action],
+        "scores": scores,
+    }
+def get_repair_prompt(
+    action: RepairAction,
+    schema: str,
+    question: str,
+    failing_sql: str,
+    error_message: str,
+) -> dict:
+    """
+    Build the system suffix and user message for the chosen repair action.
+    Returns dict with keys: system_suffix, user_message.
+    """
+    offending_token = extract_offending_token(error_message)
+    ctx = RepairContext(
+        schema=schema,
+        question=question,
+        failing_sql=failing_sql,
+        error_message=error_message,
+        offending_token=offending_token,
+    )
+    return {
+        "system_suffix": get_repair_system_suffix(action),
+        "user_message": build_repair_user_message(action, ctx),
+    }
+def record_step(
+    action: RepairAction,
+    success: bool,
+    error_message: str,
+    sql: str,
+) -> dict:
+    """
+    Record the outcome of a repair step and compute shaped reward.
+    Returns dict with keys: reward, breakdown.
+    """
+    if _current_episode is None or _current_episode.current_state is None:
+        raise RuntimeError("Call observe_error() before record_step()")
+    state = _current_episode.current_state
+    grader_input = GraderInput(
+        success=success,
+        attempt_number=state.attempt_number,
+        current_error_class=None if success else classify_error(error_message),
+        previous_error_class=_current_episode.previous_error_class,
+    )
+    result = compute_reward(grader_input)
+    step = EpisodeStep(
+        state=state,
+        featurized=_current_episode.current_features or featurize(state),
+        action=action,
+        reward=result.reward,
+        error_message=error_message,
+        sql=sql,
+        success=success,
+    )
+    _current_episode.steps.append(step)
+    _current_episode.previous_error_class = state.error_class
+    return {
+        "reward": result.reward,
+        "breakdown": {
+            "base": result.breakdown.base,
+            "attempt_penalty": result.breakdown.attempt_penalty,
+            "severity_bonus": result.breakdown.severity_bonus,
+            "change_bonus": result.breakdown.change_bonus,
+        },
+    }
+def end_episode(success: bool) -> Optional[dict]:
+    """
+    End the current episode. Runs HER relabeling and updates the bandit.
+    Returns dict with keys: total_reward, episode_length.
+    """
+    global _current_episode
+    if _current_episode is None or not _current_episode.steps:
+        _current_episode = None
+        return None
+    b = _get_bandit()
+    episode, relabeled = record_episode(
+        _current_episode.question,
+        _current_episode.steps,
+        success,
+    )
+    for exp in relabeled:
+        b.update(exp.state, exp.action, exp.reward)
+    b.decay_alpha()
+    result = {
+        "total_reward": episode.total_reward,
+        "episode_length": len(episode.steps),
+    }
+    _current_episode = None
+    return result
+# ─── Query Interface ──────────────────────────────────────────────
+def get_rl_metrics() -> RLMetrics:
+    """Return the metrics computed by the experience store's get_metrics()."""
+    return get_metrics()
+def get_bandit_state() -> dict:
+    b = _get_bandit()
+    return {
+        "action_counts": b.get_action_counts(),
+        "total_updates": b.get_total_updates(),
+        "alpha": b.get_alpha(),
+        "action_distribution": b.get_action_distribution(),
+    }
+def is_episode_active() -> bool:
+    """Return True while an episode context exists (reset() called, not yet ended)."""
+    return _current_episode is not None
+def reset_rl() -> None:
+    """Reset the entire RL system — bandit weights and experience store."""
+    global _bandit, _current_episode
+    if _bandit:
+        _bandit.reset()
+    reset_experience()
+    _current_episode = None

backend/rl/error_classifier.py ADDED Viewed

	@@ -0,0 +1,98 @@

+"""
+SQL error classifier: maps raw SQLite error messages to one of 8
+canonical ErrorClass values.
+Severity ordering (lower = less severe / closer to correct):
+  OTHER=5, SYNTAX_ERROR=4, NO_SUCH_FUNCTION=3, NO_SUCH_TABLE=3,
+  DATATYPE_MISMATCH=2, AGGREGATION_ERROR=2,
+  NO_SUCH_COLUMN=1, AMBIGUOUS_COLUMN=1
+"""
+import re
+from typing import Optional
+from rl.types import ErrorClass
+# Severity rank per error class; a lower number means the query is closer to
+# being correct. Looked up by error_severity().
+_SEVERITY: dict[ErrorClass, int] = {
+    ErrorClass.OTHER:             5,
+    ErrorClass.SYNTAX_ERROR:      4,
+    ErrorClass.NO_SUCH_FUNCTION:  3,
+    ErrorClass.NO_SUCH_TABLE:     3,
+    ErrorClass.DATATYPE_MISMATCH: 2,
+    ErrorClass.AGGREGATION_ERROR: 2,
+    ErrorClass.NO_SUCH_COLUMN:    1,
+    ErrorClass.AMBIGUOUS_COLUMN:  1,
+}
+def error_severity(error_class: ErrorClass) -> int:
+    """Return the severity rank of ``error_class`` (lower = closer to correct)."""
+    return _SEVERITY[error_class]
+def classify_error(error_message: str) -> ErrorClass:
+    """
+    Classify a raw SQLite error message into one of 8 canonical classes.
+    Patterns are ordered most-specific-first to avoid false matches.
+    """
+    msg = error_message.lower()
+    # Column-level errors
+    if "no such column" in msg:
+        return ErrorClass.NO_SUCH_COLUMN
+    if "ambiguous column" in msg:
+        return ErrorClass.AMBIGUOUS_COLUMN
+    # Table-level errors
+    if "no such table" in msg:
+        return ErrorClass.NO_SUCH_TABLE
+    # Function errors
+    if "no such function" in msg:
+        return ErrorClass.NO_SUCH_FUNCTION
+    # Aggregation / GROUP BY
+    if (
+        "not an aggregate" in msg
+        or "misuse of aggregate" in msg
+        or ("group by" in msg and "must appear" in msg)
+        or "must be an aggregate" in msg
+    ):
+        return ErrorClass.AGGREGATION_ERROR
+    # Syntax errors (broad — must come after more specific patterns)
+    if "syntax error" in msg or re.search(r'near\s+"', msg):
+        return ErrorClass.SYNTAX_ERROR
+    # Type errors
+    if "datatype mismatch" in msg or "type mismatch" in msg:
+        return ErrorClass.DATATYPE_MISMATCH
+    return ErrorClass.OTHER
+def extract_offending_token(error_message: str) -> Optional[str]:
+    """
+    Extract the offending token from a SQLite error message.
+    Returns None if no specific token can be identified.
+    """
+    # "no such column: X"
+    m = re.search(r"no such column:\s*(\S+)", error_message, re.IGNORECASE)
+    if m:
+        return m.group(1)
+    # "no such table: X"
+    m = re.search(r"no such table:\s*(\S+)", error_message, re.IGNORECASE)
+    if m:
+        return m.group(1)
+    # 'near "X": syntax error'
+    m = re.search(r'near\s+"([^"]+)"', error_message, re.IGNORECASE)
+    if m:
+        return m.group(1)
+    # "no such function: X"
+    m = re.search(r"no such function:\s*(\S+)", error_message, re.IGNORECASE)
+    if m:
+        return m.group(1)
+    return None

backend/rl/experience.py ADDED Viewed

	@@ -0,0 +1,208 @@

+"""
+Experience store: logs episodes, persists to disk, and implements
+Hindsight Experience Replay (HER) for reward relabeling.
+HER (Andrychowicz et al., 2017): If a later attempt in the same episode
+succeeded, relabel earlier failed steps with partial credit that shrinks
+with their distance from the success step (closer steps earn more). This
+multiplies the effective training signal from sparse rewards.
+"""
+from __future__ import annotations
+import json
+import os
+import time
+import random
+from pathlib import Path
+from typing import Optional
+from rl.types import (
+    Episode,
+    EpisodeStep,
+    Experience,
+    RLMetrics,
+    RepairAction,
+    REPAIR_ACTION_NAMES,
+    ERROR_CLASS_NAMES,
+)
+from rl.grader import compute_episode_reward
+# Storage directory: the DATA_DIR environment variable when set, otherwise
+# the "data" folder beside this file's parent directory.
+_DATA_DIR = Path(os.environ.get("DATA_DIR", Path(__file__).parent.parent / "data"))
+# JSON file holding the persisted episodes.
+EXPERIENCE_PATH = _DATA_DIR / "rl_experiences.json"
+# Upper bound on episodes kept in memory and written to disk.
+MAX_EPISODES = 500
+# In-memory episode log, oldest first.
+_episodes: list[Episode] = []
+# True once a load from disk has been attempted (or the store was reset).
+_loaded: bool = False
+def _ensure_loaded() -> None:
+    global _loaded, _episodes
+    if _loaded:
+        return
+    _loaded = True
+    try:
+        if EXPERIENCE_PATH.exists():
+            raw = json.loads(EXPERIENCE_PATH.read_text())
+            _episodes = [Episode(**ep) for ep in raw]
+    except Exception:
+        _episodes = []
+def _persist() -> None:
+    try:
+        EXPERIENCE_PATH.parent.mkdir(parents=True, exist_ok=True)
+        data = [ep.model_dump() for ep in _episodes[-MAX_EPISODES:]]
+        EXPERIENCE_PATH.write_text(json.dumps(data, default=str))
+    except Exception:
+        pass
+def record_episode(
+    question: str,
+    steps: list[EpisodeStep],
+    success: bool,
+) -> tuple[Episode, list[Experience]]:
+    """
+    Record a completed episode, run HER relabeling, and persist.
+    Returns (episode, relabeled_experiences).
+    """
+    _ensure_loaded()
+    step_rewards = [s.reward for s in steps]
+    total_reward = compute_episode_reward(step_rewards, success)
+    episode = Episode(
+        id=f"ep-{int(time.time() * 1000)}-{random.randint(1000, 9999)}",
+        question=question,
+        steps=steps,
+        total_reward=total_reward,
+        success=success,
+        timestamp=time.time(),
+    )
+    _episodes.append(episode)
+    if len(_episodes) > MAX_EPISODES:
+        _episodes[:] = _episodes[-MAX_EPISODES:]
+    _persist()
+    relabeled = _apply_her(episode)
+    return episode, relabeled
+def _apply_her(episode: Episode) -> list[Experience]:
+    """
+    Hindsight Experience Replay.
+    If the episode eventually succeeded at step T, relabel earlier
+    failed steps with a hindsight bonus:
+      bonus(t) = 0.3 * (1 - (T - t) / T)
+    Steps closer to the eventual success receive more credit.
+    """
+    experiences: list[Experience] = []
+    success_step_idx = next(
+        (i for i, s in enumerate(episode.steps) if s.success), -1
+    )
+    for t, step in enumerate(episode.steps):
+        reward = step.reward
+        if success_step_idx > t:
+            distance = success_step_idx - t
+            total_steps = len(episode.steps)
+            her_bonus = 0.3 * (1.0 - distance / total_steps)
+            reward += her_bonus
+        next_step = episode.steps[t + 1] if t < len(episode.steps) - 1 else None
+        experiences.append(
+            Experience(
+                state=step.featurized,
+                action=step.action,
+                reward=reward,
+                next_state=next_step.featurized if next_step else None,
+                done=(t == len(episode.steps) - 1),
+                timestamp=episode.timestamp,
+                metadata={
+                    "question": episode.question,
+                    "error_message": step.error_message,
+                    "sql": step.sql,
+                    "error_class": int(step.state.error_class),
+                    "attempt_number": step.state.attempt_number,
+                },
+            )
+        )
+    return experiences
+def replay_all(bandit) -> int:
+    """
+    Replay all stored experiences through the bandit to rebuild weights.
+    Useful after a reset or if weights are lost.
+    """
+    _ensure_loaded()
+    count = 0
+    for ep in _episodes:
+        relabeled = _apply_her(ep)
+        for exp in relabeled:
+            bandit.update(exp.state, exp.action, exp.reward)
+            count += 1
+    return count
+def get_metrics() -> RLMetrics:
+    _ensure_loaded()
+    recent_window = 50
+    recent = _episodes[-recent_window:]
+    all_steps = [s for ep in _episodes for s in ep.steps]
+    action_dist: dict[str, int] = {}
+    error_dist: dict[str, int] = {}
+    for step in all_steps:
+        a_name = REPAIR_ACTION_NAMES[step.action]
+        action_dist[a_name] = action_dist.get(a_name, 0) + 1
+        e_name = ERROR_CLASS_NAMES[step.state.error_class]
+        error_dist[e_name] = error_dist.get(e_name, 0) + 1
+    return RLMetrics(
+        total_episodes=len(_episodes),
+        total_steps=len(all_steps),
+        cumulative_reward=sum(ep.total_reward for ep in _episodes),
+        success_rate=(
+            sum(1 for ep in recent if ep.success) / len(recent)
+            if recent
+            else 0.0
+        ),
+        avg_attempts=(
+            sum(len(ep.steps) for ep in recent) / len(recent)
+            if recent
+            else 0.0
+        ),
+        action_distribution=action_dist,
+        error_distribution=error_dist,
+        reward_history=[ep.total_reward for ep in _episodes],
+    )
+def get_episodes() -> list[Episode]:
+    _ensure_loaded()
+    return list(_episodes)
+def get_recent_episodes(n: int) -> list[Episode]:
+    _ensure_loaded()
+    return _episodes[-n:]
+def reset_experience() -> None:
+    global _episodes, _loaded
+    _episodes = []
+    _loaded = True
+    try:
+        EXPERIENCE_PATH.unlink(missing_ok=True)
+    except Exception:
+        pass

backend/rl/grader.py ADDED Viewed

	@@ -0,0 +1,99 @@

+"""
+Shaped reward function for the SQL debug RL environment.
+Reward components:
+  +1.0   base reward on success
+  -0.1   per earlier attempt on success, i.e. -0.1 * (attempt - 1)
+  -0.1   base penalty for a failed step
+  -0.05  per attempt on a failed step, i.e. -0.05 * attempt
+  +0.2   if error severity decreased (progress signal)
+  -0.1   if error severity increased (regression signal)
+  +0.1   if error class changed at all (exploration signal)
+The shaping is inspired by potential-based reward shaping (Ng et al., 1999),
+which aims to accelerate learning without changing the optimal policy.
+"""
+from __future__ import annotations
+from typing import Optional
+from dataclasses import dataclass
+from rl.types import ErrorClass
+from rl.error_classifier import error_severity
+@dataclass
+class GraderInput:
+    """Everything compute_reward() needs to grade a single repair step."""
+    success: bool                              # True if the step succeeded
+    attempt_number: int                        # 1-indexed
+    current_error_class: Optional[ErrorClass]  # None if success
+    previous_error_class: Optional[ErrorClass] # None on first attempt
+@dataclass
+class RewardBreakdown:
+    """Individual terms that compute_reward() sums into the step reward."""
+    base: float             # 1.0 on success, -0.1 on failure
+    attempt_penalty: float  # non-positive term that grows with the attempt number
+    severity_bonus: float   # 0.2 if severity decreased, -0.1 if it increased, else 0.0
+    change_bonus: float     # 0.1 if the error class changed, else 0.0
+@dataclass
+class GraderOutput:
+    """Result of compute_reward(): the scalar reward and its component terms."""
+    reward: float
+    breakdown: RewardBreakdown
+def compute_reward(inp: GraderInput) -> GraderOutput:
+    if inp.success:
+        base = 1.0
+        attempt_penalty = -0.1 * (inp.attempt_number - 1)
+        return GraderOutput(
+            reward=base + attempt_penalty,
+            breakdown=RewardBreakdown(
+                base=base,
+                attempt_penalty=attempt_penalty,
+                severity_bonus=0.0,
+                change_bonus=0.0,
+            ),
+        )
+    # Failed step — base penalty + potential shaping
+    base = -0.1
+    attempt_penalty = -0.05 * inp.attempt_number
+    severity_bonus = 0.0
+    change_bonus = 0.0
+    if inp.previous_error_class is not None and inp.current_error_class is not None:
+        prev_sev = error_severity(inp.previous_error_class)
+        curr_sev = error_severity(inp.current_error_class)
+        if curr_sev < prev_sev:
+            severity_bonus = 0.2    # Progress toward solution
+        elif curr_sev > prev_sev:
+            severity_bonus = -0.1   # Regression
+        if inp.current_error_class != inp.previous_error_class:
+            change_bonus = 0.1      # At least something different happened
+    reward = base + attempt_penalty + severity_bonus + change_bonus
+    return GraderOutput(
+        reward=reward,
+        breakdown=RewardBreakdown(
+            base=base,
+            attempt_penalty=attempt_penalty,
+            severity_bonus=severity_bonus,
+            change_bonus=change_bonus,
+        ),
+    )
+def compute_episode_reward(step_rewards: list[float], success: bool) -> float:
+    """
+    Compute total episode reward from individual step rewards.
+    Includes a terminal bonus/penalty based on final outcome.
+    """
+    total = sum(step_rewards)
+    terminal = 0.5 if success else -0.5
+    return total + terminal

backend/rl/linucb.py ADDED Viewed

	@@ -0,0 +1,190 @@

+"""
+LinUCB Contextual Bandit (Li et al., 2010).
+Maintains per-action inverse covariance matrices using the
+Sherman-Morrison rank-1 update formula for O(d^2) updates.
+For each action a in {0..K-1}:
+  A_inv[a]  — d×d inverse covariance (starts as I_d)
+  b[a]      — d reward-weighted feature accumulator
+  theta[a]  = A_inv[a] @ b[a]   (ridge regression estimate)
+  UCB_a(x)  = theta[a] @ x + alpha * sqrt(max(0, x @ A_inv[a] @ x))
+Action selection: argmax_a UCB_a(x)
+"""
+from __future__ import annotations
+import json
+import os
+import random
+from pathlib import Path
+from typing import List, Optional, Tuple
+import numpy as np
+from rl.types import FEATURE_DIM, NUM_ACTIONS, RepairAction, REPAIR_ACTION_NAMES
+# Default path — can be overridden by DATA_DIR env var. Without the
+# override, the "data" folder beside this file's parent directory is used.
+_DATA_DIR = Path(os.environ.get("DATA_DIR", Path(__file__).parent.parent / "data"))
+# JSON file the bandit weights are saved to and loaded from.
+WEIGHTS_PATH = _DATA_DIR / "rl_weights.json"
+class LinUCB:
+    """
+    LinUCB contextual bandit with Sherman-Morrison updates and alpha decay.
+    Weights are persisted to JSON after every 10 updates.
+    """
+    def __init__(
+        self,
+        d: int = FEATURE_DIM,
+        K: int = NUM_ACTIONS,
+        alpha: float = 1.5,
+    ) -> None:
+        self.d = d
+        self.K = K
+        self.alpha = alpha
+        self.total_updates = 0
+        loaded = self._load_weights()
+        if loaded is not None:
+            self.A_inv = loaded["A_inv"]
+            self.b = loaded["b"]
+            self.counts = loaded["counts"]
+            self.total_updates = loaded["total_updates"]
+        else:
+            self.A_inv: List[np.ndarray] = [np.eye(d) for _ in range(K)]
+            self.b: List[np.ndarray] = [np.zeros(d) for _ in range(K)]
+            self.counts: List[int] = [0] * K
+    # ─── Core Interface ──────────────────────────────────────────
+    def select_action(self, x: List[float]) -> Tuple[RepairAction, List[float]]:
+        """
+        Select the action with highest UCB score.
+        Returns (action, scores_for_all_actions).
+        """
+        xv = np.array(x, dtype=np.float64)
+        scores = []
+        for a in range(self.K):
+            theta = self.A_inv[a] @ self.b[a]
+            exploit = float(theta @ xv)
+            quad = float(xv @ self.A_inv[a] @ xv)
+            explore = self.alpha * float(np.sqrt(max(0.0, quad)))
+            scores.append(exploit + explore)
+        # Argmax with random tie-breaking
+        best_action = 0
+        best_score = scores[0]
+        for a in range(1, self.K):
+            if scores[a] > best_score or (
+                scores[a] == best_score and random.random() > 0.5
+            ):
+                best_score = scores[a]
+                best_action = a
+        return RepairAction(best_action), scores
+    def update(self, x: List[float], action: RepairAction, reward: float) -> None:
+        """
+        Update the model after observing a reward.
+        Uses Sherman-Morrison: (A + xx^T)^{-1} = A^{-1} - (A^{-1}xx^T A^{-1}) / (1 + x^T A^{-1} x)
+        """
+        a = int(action)
+        xv = np.array(x, dtype=np.float64)
+        A_inv_x = self.A_inv[a] @ xv          # shape (d,)
+        denom = 1.0 + float(xv @ A_inv_x)     # scalar
+        # Rank-1 downdate
+        self.A_inv[a] -= np.outer(A_inv_x, A_inv_x) / denom
+        # Reward-weighted feature accumulation
+        self.b[a] += reward * xv
+        self.counts[a] += 1
+        self.total_updates += 1
+        if self.total_updates % 10 == 0:
+            self.save_weights()
+    def get_estimated_rewards(self, x: List[float]) -> List[float]:
+        """
+        Return theta^T x for each action (no exploration bonus).
+        Useful for understanding learned policy.
+        """
+        xv = np.array(x, dtype=np.float64)
+        return [float((self.A_inv[a] @ self.b[a]) @ xv) for a in range(self.K)]
+    def get_action_counts(self) -> List[int]:
+        return list(self.counts)
+    def get_total_updates(self) -> int:
+        return self.total_updates
+    def get_alpha(self) -> float:
+        return self.alpha
+    def decay_alpha(self, min_alpha: float = 0.3) -> None:
+        """Decay exploration coefficient toward exploitation."""
+        self.alpha = max(min_alpha, self.alpha * 0.995)
+    def get_action_distribution(self) -> dict:
+        total = sum(self.counts) or 1
+        return {
+            REPAIR_ACTION_NAMES[RepairAction(a)]: self.counts[a] / total
+            for a in range(self.K)
+        }
+    # ─── Persistence ─────────────────────────────────────────────
+    def save_weights(self) -> None:
+        try:
+            WEIGHTS_PATH.parent.mkdir(parents=True, exist_ok=True)
+            data = {
+                "A_inv": [m.tolist() for m in self.A_inv],
+                "b": [v.tolist() for v in self.b],
+                "counts": self.counts,
+                "total_updates": self.total_updates,
+                "alpha": self.alpha,
+            }
+            WEIGHTS_PATH.write_text(json.dumps(data))
+        except Exception:
+            pass  # Non-fatal
+    def _load_weights(self) -> Optional[dict]:
+        try:
+            if not WEIGHTS_PATH.exists():
+                return None
+            raw = json.loads(WEIGHTS_PATH.read_text())
+            A_inv = [np.array(m, dtype=np.float64) for m in raw["A_inv"]]
+            b = [np.array(v, dtype=np.float64) for v in raw["b"]]
+            # Validate dimensions
+            if (
+                len(A_inv) == self.K
+                and A_inv[0].shape == (self.d, self.d)
+                and len(b) == self.K
+                and b[0].shape == (self.d,)
+            ):
+                return {
+                    "A_inv": A_inv,
+                    "b": b,
+                    "counts": raw["counts"],
+                    "total_updates": raw["total_updates"],
+                }
+            return None
+        except Exception:
+            return None
+    def reset(self) -> None:
+        self.A_inv = [np.eye(self.d) for _ in range(self.K)]
+        self.b = [np.zeros(self.d) for _ in range(self.K)]
+        self.counts = [0] * self.K
+        self.total_updates = 0
+        self.alpha = 1.5
+        try:
+            WEIGHTS_PATH.unlink(missing_ok=True)
+        except Exception:
+            pass

backend/rl/repair_strategies.py ADDED Viewed

	@@ -0,0 +1,219 @@

+"""
+Repair strategy prompt templates for each of the 8 RepairAction values.
+Each strategy provides:
+  - system_suffix: appended to the base system prompt
+  - user_template: callable that builds the user message given a RepairContext
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Optional, Callable
+from rl.types import RepairAction
+@dataclass
+class RepairContext:
+    """Inputs a repair prompt template can draw on."""
+    schema: str                     # schema text, inserted after "Schema:"
+    question: str                   # question text, inserted after "Question:"
+    failing_sql: str                # SQL of the failed attempt
+    error_message: str              # error text reported for the failed attempt
+    offending_token: Optional[str]  # token extracted from the error, if any
+@dataclass
+class RepairStrategy:
+    """Prompt material for one repair action."""
+    action: RepairAction                            # action this strategy belongs to
+    name: str                                       # display name
+    system_suffix: str                              # text appended to the base system prompt
+    user_template: Callable[[RepairContext], str]   # builds the user message
+def _tmpl_rewrite_full(ctx: RepairContext) -> str:
+    return (
+        f"Schema:\n{ctx.schema}\n\n"
+        f"Question: {ctx.question}\n\n"
+        f"A previous attempt failed with: {ctx.error_message}\n\n"
+        "Write a completely new SQL query from scratch. Do NOT reference the previous attempt."
+    )
+def _tmpl_fix_column(ctx: RepairContext) -> str:
+    token_hint = f"\n\nThe problematic column is: {ctx.offending_token}" if ctx.offending_token else ""
+    return (
+        f"Schema:\n{ctx.schema}\n\n"
+        f"Question: {ctx.question}\n\n"
+        f"Previous SQL:\n{ctx.failing_sql}\n\n"
+        f"Error: {ctx.error_message}"
+        f"{token_hint}\n\n"
+        "Fix ONLY the column name issue. Check the schema for correct column names."
+    )
+def _tmpl_fix_table(ctx: RepairContext) -> str:
+    token_hint = f"\n\nThe problematic table is: {ctx.offending_token}" if ctx.offending_token else ""
+    return (
+        f"Schema:\n{ctx.schema}\n\n"
+        f"Question: {ctx.question}\n\n"
+        f"Previous SQL:\n{ctx.failing_sql}\n\n"
+        f"Error: {ctx.error_message}"
+        f"{token_hint}\n\n"
+        "Fix the table name or JOIN issue. Verify all table names exist in the schema."
+    )
+def _tmpl_add_groupby(ctx: RepairContext) -> str:
+    return (
+        f"Schema:\n{ctx.schema}\n\n"
+        f"Question: {ctx.question}\n\n"
+        f"Previous SQL:\n{ctx.failing_sql}\n\n"
+        f"Error: {ctx.error_message}\n\n"
+        "Fix the GROUP BY / aggregation issue. Ensure every non-aggregate column in SELECT is in GROUP BY."
+    )
+def _tmpl_rewrite_cte(ctx: RepairContext) -> str:
+    return (
+        f"Schema:\n{ctx.schema}\n\n"
+        f"Question: {ctx.question}\n\n"
+        f"Previous SQL:\n{ctx.failing_sql}\n\n"
+        f"Error: {ctx.error_message}\n\n"
+        "Restructure the CTEs or subqueries. Break the query into clear, named WITH clauses."
+    )
+def _tmpl_fix_syntax(ctx: RepairContext) -> str:
+    token_hint = f"\n\nSyntax error near: {ctx.offending_token}" if ctx.offending_token else ""
+    return (
+        f"Schema:\n{ctx.schema}\n\n"
+        f"Question: {ctx.question}\n\n"
+        f"Previous SQL:\n{ctx.failing_sql}\n\n"
+        f"Error: {ctx.error_message}"
+        f"{token_hint}\n\n"
+        "Fix the syntax error. Check for typos, missing commas, unmatched parentheses."
+    )
+def _tmpl_change_dialect(ctx: RepairContext) -> str:
+    return (
+        f"Schema:\n{ctx.schema}\n\n"
+        f"Question: {ctx.question}\n\n"
+        f"Previous SQL:\n{ctx.failing_sql}\n\n"
+        f"Error: {ctx.error_message}\n\n"
+        "The SQL uses functions or syntax not supported by SQLite. "
+        "Rewrite using SQLite-compatible alternatives."
+    )
+def _tmpl_relax_filter(ctx: RepairContext) -> str:
+    return (
+        f"Schema:\n{ctx.schema}\n\n"
+        f"Question: {ctx.question}\n\n"
+        f"Previous SQL:\n{ctx.failing_sql}\n\n"
+        f"Error: {ctx.error_message}\n\n"
+        "Review and relax the WHERE/HAVING conditions. "
+        "Check date formats, value ranges, and filter logic."
+    )
+# Registry with one RepairStrategy per RepairAction. Every system_suffix
+# starts with a blank line so it can be appended directly to a base system
+# prompt; user_template points at the matching _tmpl_* builder.
+_STRATEGIES: dict[RepairAction, RepairStrategy] = {
+    RepairAction.REWRITE_FULL: RepairStrategy(
+        action=RepairAction.REWRITE_FULL,
+        name="Full Rewrite",
+        system_suffix=(
+            "\n\nIMPORTANT: The previous SQL attempt was fundamentally flawed. "
+            "Discard it entirely and write a new query from scratch based only on "
+            "the schema and question. Do NOT try to patch the previous SQL."
+        ),
+        user_template=_tmpl_rewrite_full,
+    ),
+    RepairAction.FIX_COLUMN: RepairStrategy(
+        action=RepairAction.FIX_COLUMN,
+        name="Fix Column",
+        system_suffix=(
+            "\n\nIMPORTANT: The previous SQL referenced a wrong column name. "
+            "Carefully check the schema for the exact column names in each table. "
+            "Pay attention to singular vs plural, underscores, and exact spelling."
+        ),
+        user_template=_tmpl_fix_column,
+    ),
+    RepairAction.FIX_TABLE: RepairStrategy(
+        action=RepairAction.FIX_TABLE,
+        name="Fix Table",
+        system_suffix=(
+            "\n\nIMPORTANT: The previous SQL referenced a wrong table name or had "
+            "incorrect JOIN relationships. Check the schema for exact table names "
+            "and foreign key relationships."
+        ),
+        user_template=_tmpl_fix_table,
+    ),
+    RepairAction.ADD_GROUPBY: RepairStrategy(
+        action=RepairAction.ADD_GROUPBY,
+        name="Fix GROUP BY",
+        system_suffix=(
+            "\n\nIMPORTANT: The previous SQL has an aggregation error. Every column "
+            "in SELECT that is not inside an aggregate function (COUNT, SUM, AVG, etc.) "
+            "MUST appear in the GROUP BY clause. Check all selected columns."
+        ),
+        user_template=_tmpl_add_groupby,
+    ),
+    RepairAction.REWRITE_CTE: RepairStrategy(
+        action=RepairAction.REWRITE_CTE,
+        name="Rewrite CTE/Subquery",
+        system_suffix=(
+            "\n\nIMPORTANT: The previous SQL had issues with CTEs or subqueries. "
+            "Restructure the query — consider using WITH clauses for clarity, or "
+            "flatten nested subqueries. Ensure CTE column names are explicitly defined if needed."
+        ),
+        user_template=_tmpl_rewrite_cte,
+    ),
+    RepairAction.FIX_SYNTAX: RepairStrategy(
+        action=RepairAction.FIX_SYNTAX,
+        name="Fix Syntax",
+        system_suffix=(
+            "\n\nIMPORTANT: The previous SQL has a syntax error. Check for: "
+            "missing commas, unmatched parentheses, misspelled keywords, "
+            "incorrect operator usage, missing AS aliases."
+        ),
+        user_template=_tmpl_fix_syntax,
+    ),
+    RepairAction.CHANGE_DIALECT: RepairStrategy(
+        action=RepairAction.CHANGE_DIALECT,
+        name="Fix Dialect",
+        system_suffix=(
+            "\n\nIMPORTANT: The previous SQL used functions or syntax not available in SQLite. "
+            "Key SQLite rules:\n"
+            "- Use strftime() for date formatting, NOT DATE_FORMAT or EXTRACT\n"
+            "- No FULL OUTER JOIN or RIGHT JOIN — use LEFT JOIN with UNION\n"
+            "- Use CAST(x AS INTEGER), not CONVERT()\n"
+            "- No ILIKE — use LIKE (case-insensitive by default for ASCII)\n"
+            "- String concatenation uses || not CONCAT()\n"
+            "- No LIMIT inside subqueries with IN (use CTE instead)"
+        ),
+        user_template=_tmpl_change_dialect,
+    ),
+    RepairAction.RELAX_FILTER: RepairStrategy(
+        action=RepairAction.RELAX_FILTER,
+        name="Relax Filter",
+        system_suffix=(
+            "\n\nIMPORTANT: The previous SQL may have overly restrictive WHERE conditions, "
+            "incorrect date ranges, or wrong filter values causing empty results or errors. "
+            "Review the filter conditions and broaden them to capture the intended data."
+        ),
+        user_template=_tmpl_relax_filter,
+    ),
+}
+def get_repair_system_suffix(action: RepairAction) -> str:
+    """Return the system-prompt suffix registered for ``action``."""
+    return _STRATEGIES[action].system_suffix
+def build_repair_user_message(action: RepairAction, ctx: RepairContext) -> str:
+    """Render the user message for ``action`` by applying its template to ``ctx``."""
+    return _STRATEGIES[action].user_template(ctx)
+def get_repair_name(action: RepairAction) -> str:
+    """Return the display name registered for ``action``."""
+    return _STRATEGIES[action].name

backend/rl/types.py ADDED Viewed

	@@ -0,0 +1,161 @@

+"""
+RL type definitions and feature engineering.
+Mirrors the TypeScript types.ts exactly:
+  - 8 error classes, 8 repair actions
+  - FEATURE_DIM = 20
+  - featurize() builds the state vector
+"""
+from __future__ import annotations
+from enum import IntEnum
+from typing import Optional, List, Dict, Any
+from pydantic import BaseModel
+# ─── Error Taxonomy ─────────────────────────────────────────────
+class ErrorClass(IntEnum):
+    """Canonical SQL error categories, numbered consecutively from 0 to 7."""
+    NO_SUCH_COLUMN    = 0
+    NO_SUCH_TABLE     = 1
+    SYNTAX_ERROR      = 2
+    AMBIGUOUS_COLUMN  = 3
+    DATATYPE_MISMATCH = 4
+    NO_SUCH_FUNCTION  = 5
+    AGGREGATION_ERROR = 6
+    OTHER             = 7
+# Lower-case snake_case label for every ErrorClass member.
+ERROR_CLASS_NAMES: Dict[ErrorClass, str] = {
+    ErrorClass.NO_SUCH_COLUMN:    "no_such_column",
+    ErrorClass.NO_SUCH_TABLE:     "no_such_table",
+    ErrorClass.SYNTAX_ERROR:      "syntax_error",
+    ErrorClass.AMBIGUOUS_COLUMN:  "ambiguous_column",
+    ErrorClass.DATATYPE_MISMATCH: "datatype_mismatch",
+    ErrorClass.NO_SUCH_FUNCTION:  "no_such_function",
+    ErrorClass.AGGREGATION_ERROR: "aggregation_error",
+    ErrorClass.OTHER:             "other",
+}
+# Count of ErrorClass members; equals the width of the error one-hot
+# (slots 0..7) in the feature vector. Keep in sync with the enum by hand.
+NUM_ERROR_CLASSES = 8
+# ─── Repair Actions ─────────────────────────────────────────────
+class RepairAction(IntEnum):
+    """Integer-valued repair actions (8 members, values 0-7).
+
+    ``featurize`` one-hot encodes the previous action at index
+    ``9 + int(action)``, so the numbering must stay dense and start at 0.
+    """
+    REWRITE_FULL   = 0
+    FIX_COLUMN     = 1
+    FIX_TABLE      = 2
+    ADD_GROUPBY    = 3
+    REWRITE_CTE    = 4
+    FIX_SYNTAX     = 5
+    CHANGE_DIALECT = 6
+    RELAX_FILTER   = 7
+# Lower-case snake_case label for every RepairAction member.
+REPAIR_ACTION_NAMES: Dict[RepairAction, str] = {
+    RepairAction.REWRITE_FULL:   "rewrite_full",
+    RepairAction.FIX_COLUMN:     "fix_column",
+    RepairAction.FIX_TABLE:      "fix_table",
+    RepairAction.ADD_GROUPBY:    "add_groupby",
+    RepairAction.REWRITE_CTE:    "rewrite_cte",
+    RepairAction.FIX_SYNTAX:     "fix_syntax",
+    RepairAction.CHANGE_DIALECT: "change_dialect",
+    RepairAction.RELAX_FILTER:   "relax_filter",
+}
+# Inverse map: name → enum
+REPAIR_ACTION_BY_NAME: Dict[str, RepairAction] = {v: k for k, v in REPAIR_ACTION_NAMES.items()}
+# Count of RepairAction members; equals the width of the previous-action
+# one-hot (slots 9..16) in the feature vector. Keep in sync with the enum by hand.
+NUM_ACTIONS = 8
+# Feature vector:
+#   [0..7]  error class one-hot  (8)
+#   [8]     attempt / 5.0        (1)
+#   [9..16] prev action one-hot  (8)
+#   [17]    error_changed        (1)
+#   [18]    consec_count / 5.0   (1)
+#   [19]    bias = 1.0           (1)
+#   total = 20
+# featurize() writes to these slots by literal index, so changing the layout
+# means changing both this constant and featurize().
+FEATURE_DIM = 20
+# ─── State ──────────────────────────────────────────────────────
+# Pydantic model holding the inputs that featurize() encodes into the
+# FEATURE_DIM-long feature vector.
+class RLState(BaseModel):
+    error_class: ErrorClass               # one-hot encoded into slots [0..7]
+    attempt_number: int                   # 1-indexed
+    previous_action: Optional[RepairAction] = None   # None leaves slots [9..16] all zero
+    error_changed: bool = False           # written to slot [17] as 1.0 / 0.0
+    consecutive_same_error: int = 1       # capped at 5 by featurize() before scaling
+def featurize(state: RLState) -> List[float]:
+    """Build the 20-dimensional feature vector from an RLState."""
+    x = [0.0] * FEATURE_DIM
+    # Error class one-hot [0..7]
+    x[state.error_class] = 1.0
+    # Attempt number normalized [8]
+    x[8] = state.attempt_number / 5.0
+    # Previous action one-hot [9..16]
+    if state.previous_action is not None:
+        x[9 + int(state.previous_action)] = 1.0
+    # Error changed flag [17]
+    x[17] = 1.0 if state.error_changed else 0.0
+    # Consecutive same error normalized [18]
+    x[18] = min(state.consecutive_same_error, 5) / 5.0
+    # Bias term [19]
+    x[19] = 1.0
+    return x
+# ─── Experience / Episode ────────────────────────────────────────
+# One step of an episode: the state, its feature vector, the action, and the
+# reward / error message / SQL / success flag recorded alongside it.
+# NOTE(review): whether `sql` and `error_message` describe the query before or
+# after the repair is not visible in this module; confirm against the producer.
+class EpisodeStep(BaseModel):
+    state: RLState
+    featurized: List[float]
+    action: RepairAction
+    reward: float
+    error_message: str
+    sql: str
+    success: bool
+# A question together with its ordered EpisodeStep list, a total reward,
+# an overall success flag, and a float timestamp.
+# NOTE(review): timestamp unit (presumably epoch seconds) is not shown here; verify.
+class Episode(BaseModel):
+    id: str
+    question: str
+    steps: List[EpisodeStep]
+    total_reward: float
+    success: bool
+    timestamp: float
+# Flat transition record: feature-vector state, action, reward, optional
+# next-state vector (defaults to None), done flag, timestamp, free-form metadata.
+class Experience(BaseModel):
+    state: List[float]
+    action: RepairAction
+    reward: float
+    next_state: Optional[List[float]] = None
+    done: bool
+    timestamp: float
+    metadata: Dict[str, Any]
+# ─── Metrics ────────────────────────────────────────────────────
+# Aggregate counters, rates, per-name distributions and a reward history.
+# NOTE(review): the distribution keys are presumably the labels from
+# REPAIR_ACTION_NAMES / ERROR_CLASS_NAMES; verify against the code that fills them.
+class RLMetrics(BaseModel):
+    total_episodes: int
+    total_steps: int
+    cumulative_reward: float
+    success_rate: float
+    avg_attempts: float
+    action_distribution: Dict[str, int]
+    error_distribution: Dict[str, int]
+    reward_history: List[float]

frontend/index.html ADDED Viewed

	@@ -0,0 +1,14 @@

+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <link rel="icon" type="image/svg+xml" href="/favicon.svg" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>SQL Agent OpenEnv — RL Environment</title>
+    <meta name="description" content="SQL Agent with Reinforcement Learning and GEPA prompt evolution" />
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.tsx"></script>
+  </body>
+</html>

frontend/package-lock.json ADDED Viewed

The diff for this file is too large to render. See raw diff

frontend/package.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "name": "sql-openenv-ui",
+  "private": true,
+  "version": "0.1.0",
+  "type": "module",
+  "scripts": {
+    "dev": "vite --port 5173",
+    "build": "vite build",
+    "preview": "vite preview"
+  },
+  "dependencies": {
+    "react": "^19.0.0",
+    "react-dom": "^19.0.0",
+    "framer-motion": "^11.0.0",
+    "lucide-react": "^0.400.0",
+    "recharts": "^2.12.0",
+    "zustand": "^4.5.0",
+    "react-markdown": "^9.0.0"
+  },
+  "devDependencies": {
+    "@types/react": "^19.0.0",
+    "@types/react-dom": "^19.0.0",
+    "@vitejs/plugin-react": "^4.3.0",
+    "typescript": "^5.5.0",
+    "vite": "^5.4.0",
+    "tailwindcss": "^3.4.0",
+    "autoprefixer": "^10.4.0",
+    "postcss": "^8.4.0"
+  }
+}

frontend/postcss.config.js ADDED Viewed

	@@ -0,0 +1,6 @@

+// PostCSS configuration: registers the tailwindcss and autoprefixer plugins,
+// each with an empty options object.
+export default {
+  plugins: {
+    tailwindcss: {},
+    autoprefixer: {},
+  },
+}

frontend/src/App.tsx ADDED Viewed

	@@ -0,0 +1,179 @@

+import { useState, useEffect } from 'react'
+import { motion, AnimatePresence } from 'framer-motion'
+import { MessageSquare, Target, GitFork, X } from 'lucide-react'
+import { Header } from './components/Header'
+import { LeftSidebar } from './components/LeftSidebar'
+import { ChatPanel } from './components/ChatPanel'
+import { BenchmarkPanel } from './components/BenchmarkPanel'
+import { ERDiagram } from './components/ERDiagram'
+import { RightSidebar } from './components/RightSidebar'
+import { useStore } from './store/useStore'
+import { fetchInit } from './lib/api'
+// Identifier of the panel shown in the centre column.
+type Tab = 'chat' | 'benchmark' | 'er'
+// Tab-bar entries, rendered in this order by App.
+const TABS: { id: Tab; label: string; icon: React.ReactNode }[] = [
+  { id: 'chat', label: 'Chat', icon: <MessageSquare size={12} /> },
+  { id: 'benchmark', label: 'Benchmark', icon: <Target size={12} /> },
+  { id: 'er', label: 'ER Diagram', icon: <GitFork size={12} /> },
+]
+/**
+ * Application shell: header, collapsible left/right sidebars and a tabbed
+ * centre panel (chat, benchmark, ER diagram).
+ *
+ * On mount it restores the saved theme from localStorage, calls fetchInit()
+ * to mark the database as seeded and load the table list, and then requests
+ * `/api/schema-graph`. Failures of either request are deliberately ignored.
+ */
+export default function App() {
+  const [activeTab, setActiveTab] = useState<Tab>('chat')
+  const [leftOpen, setLeftOpen] = useState(false)
+  const [rightOpen, setRightOpen] = useState(false)
+  const { theme, setDbSeeded, setTables, setSchemaGraph } = useStore()
+  // Apply theme on mount / change
+  useEffect(() => {
+    document.documentElement.setAttribute('data-theme', theme)
+  }, [theme])
+  // Restore theme from storage on mount
+  useEffect(() => {
+    try {
+      const saved = localStorage.getItem('theme') as 'dark' | 'light' | null
+      if (saved) {
+        document.documentElement.setAttribute('data-theme', saved)
+        useStore.setState({ theme: saved })
+      }
+    } catch { /* noop */ }
+  }, [])
+  // Fetch init data
+  useEffect(() => {
+    fetchInit()
+      .then((d) => {
+        setDbSeeded(true)
+        setTables(d.tables)
+        // Lazy-load schema graph
+        fetch('/api/schema-graph')
+          .then((r) => {
+            // A non-2xx reply carries an error payload, not a schema graph;
+            // reject so it is never stored as one.
+            if (!r.ok) throw new Error(`schema-graph request failed: ${r.status}`)
+            return r.json()
+          })
+          .then((g) => setSchemaGraph(g))
+          .catch(() => { /* noop */ })
+      })
+      .catch(() => { /* noop */ })
+  }, [setDbSeeded, setTables, setSchemaGraph])
+  // Close mobile sidebars on tab change
+  useEffect(() => {
+    setLeftOpen(false)
+    setRightOpen(false)
+  }, [activeTab])
+  return (
+    <div
+      className="h-screen flex flex-col overflow-hidden theme-bg-primary theme-text-primary"
+      style={{ fontFamily: 'ui-monospace,"SF Mono",Consolas,"Liberation Mono",monospace' }}
+    >
+      <Header
+        onToggleLeft={() => { setLeftOpen((v) => !v); setRightOpen(false) }}
+        onToggleRight={() => { setRightOpen((v) => !v); setLeftOpen(false) }}
+      />
+      <div className="flex flex-1 overflow-hidden relative">
+        {/* Overlay backdrop (mobile) */}
+        {(leftOpen || rightOpen) && (
+          <div
+            className="fixed inset-0 bg-black/50 z-30 lg:hidden"
+            onClick={() => { setLeftOpen(false); setRightOpen(false) }}
+          />
+        )}
+        {/* LEFT SIDEBAR */}
+        <aside
+          className={`
+            fixed top-[53px] bottom-0 left-0 z-40 w-60 border-r theme-border flex flex-col overflow-y-auto
+            transition-transform duration-200 ease-out
+            lg:static lg:w-60 lg:shrink-0 lg:translate-x-0 lg:z-auto
+            ${leftOpen ? 'translate-x-0' : '-translate-x-full'}
+          `}
+          style={{ background: 'var(--bg-secondary)' }}
+        >
+          <div className="flex items-center justify-between px-4 pt-3 pb-1 lg:hidden">
+            <span className="text-[10px] font-semibold text-gray-500 uppercase tracking-wider">
+              Dataset & Tasks
+            </span>
+            <button
+              onClick={() => setLeftOpen(false)}
+              className="p-1 rounded hover:bg-white/5 text-gray-500"
+            >
+              <X size={14} />
+            </button>
+          </div>
+          <div className="flex-1 px-4 py-3">
+            <LeftSidebar />
+          </div>
+        </aside>
+        {/* CENTER: Tabbed panel */}
+        <main className="flex-1 flex flex-col overflow-hidden min-w-0">
+          {/* Tab bar */}
+          <div
+            className="flex items-center gap-1 px-2 sm:px-4 py-2.5 border-b theme-border shrink-0 overflow-x-auto scrollbar-none"
+            style={{ background: 'var(--bg-secondary)' }}
+          >
+            {TABS.map((tab) => (
+              <button
+                key={tab.id}
+                onClick={() => setActiveTab(tab.id)}
+                className={`flex items-center gap-1.5 px-2.5 sm:px-3 py-1.5 rounded-lg text-xs font-medium transition-all whitespace-nowrap shrink-0 ${
+                  activeTab === tab.id
+                    ? 'bg-violet-600/20 text-violet-300 border border-violet-500/30'
+                    : 'text-gray-500 hover:text-gray-300 hover:bg-white/5 border border-transparent'
+                }`}
+              >
+                {tab.icon}
+                <span>{tab.label}</span>
+              </button>
+            ))}
+          </div>
+          {/* Tab content */}
+          <div className="flex-1 overflow-hidden relative">
+            <AnimatePresence mode="wait">
+              <motion.div
+                key={activeTab}
+                initial={{ opacity: 0, y: 4 }}
+                animate={{ opacity: 1, y: 0 }}
+                exit={{ opacity: 0 }}
+                transition={{ duration: 0.15 }}
+                className="absolute inset-0 flex flex-col overflow-hidden"
+              >
+                {activeTab === 'chat' && <ChatPanel />}
+                {activeTab === 'benchmark' && <BenchmarkPanel />}
+                {activeTab === 'er' && <ERDiagram />}
+              </motion.div>
+            </AnimatePresence>
+          </div>
+        </main>
+        {/* RIGHT SIDEBAR */}
+        <aside
+          className={`
+            fixed top-[53px] bottom-0 right-0 z-40 w-72 border-l theme-border flex flex-col overflow-hidden
+            transition-transform duration-200 ease-out
+            lg:static lg:w-72 lg:shrink-0 lg:translate-x-0 lg:z-auto
+            ${rightOpen ? 'translate-x-0' : 'translate-x-full'}
+          `}
+          style={{ background: 'var(--bg-secondary)' }}
+        >
+          <div className="flex items-center justify-between px-4 pt-3 pb-1 lg:hidden">
+            <span className="text-[10px] font-semibold text-gray-500 uppercase tracking-wider">
+              GEPA & RL
+            </span>
+            <button
+              onClick={() => setRightOpen(false)}
+              className="p-1 rounded hover:bg-white/5 text-gray-500"
+            >
+              <X size={14} />
+            </button>
+          </div>
+          <RightSidebar />
+        </aside>
+      </div>
+    </div>
+  )
+}

frontend/src/components/BenchmarkPanel.tsx ADDED Viewed

	@@ -0,0 +1,384 @@

+import { useState, useCallback } from 'react'
+import { motion, AnimatePresence } from 'framer-motion'
+import {
+  Target, Play, Loader2, CheckCircle2, XCircle,
+  ChevronDown, RotateCcw, Zap,
+} from 'lucide-react'
+import { useStore } from '../store/useStore'
+import { streamBenchmark } from '../lib/api'
+import type { BenchmarkResult, Difficulty } from '../lib/types'
+// Difficulty selector entries, rendered in this order by BenchmarkPanel.
+const DIFFICULTY_TABS: { id: Difficulty; label: string }[] = [
+  { id: 'easy', label: 'Easy' },
+  { id: 'medium', label: 'Medium' },
+  { id: 'hard', label: 'Hard' },
+]
+/**
+ * One benchmark query row: status icon, id, difficulty chip, score, attempt
+ * count and question, plus a collapsible detail section with the generated
+ * SQL, reference vs. agent row counts, the grader's reason and a re-run link.
+ *
+ * - `isActive` switches the row to the highlighted border/background.
+ * - `onToggleExpand` fires when the row header is clicked.
+ * - `onRunSingle` fires from the play / re-run buttons; both stop click
+ *   propagation so they do not also toggle the row.
+ * - The play button is shown only for `pending` rows, and the re-run link only
+ *   for rows that are neither `pending` nor `running`; both also require
+ *   `dbSeeded` and `!isRunning`.
+ */
+function QueryRow({
+  result,
+  isActive,
+  isExpanded,
+  onToggleExpand,
+  onRunSingle,
+  isRunning,
+  dbSeeded,
+}: {
+  result: BenchmarkResult
+  isActive: boolean
+  isExpanded: boolean
+  onToggleExpand: () => void
+  onRunSingle: () => void
+  isRunning: boolean
+  dbSeeded: boolean
+}) {
+  // Icon for the four statuses handled here; any other value renders nothing.
+  const statusIcon = () => {
+    switch (result.status) {
+      case 'pending': return <span className="w-2 h-2 rounded-full bg-gray-600 shrink-0" />
+      case 'running': return <Loader2 size={12} className="text-violet-400 animate-spin shrink-0" />
+      case 'pass': return <CheckCircle2 size={12} className="text-green-400 shrink-0" />
+      case 'fail': return <XCircle size={12} className="text-red-400 shrink-0" />
+    }
+  }
+  // hard → red, medium → amber, anything else → blue.
+  const difficultyColor =
+    result.difficulty === 'hard'
+      ? 'text-red-400 bg-red-500/10 border-red-500/25'
+      : result.difficulty === 'medium'
+      ? 'text-amber-400 bg-amber-500/10 border-amber-500/25'
+      : 'text-blue-400 bg-blue-500/10 border-blue-500/25'
+  return (
+    <div
+      className={`rounded-xl border transition-all duration-150 ${
+        isActive
+          ? 'border-violet-500/40 bg-violet-500/5'
+          : 'border-white/5 bg-white/[0.02] hover:bg-white/[0.04]'
+      }`}
+    >
+      <div
+        className="flex items-start gap-2 px-3 py-2.5 cursor-pointer"
+        onClick={onToggleExpand}
+      >
+        <div className="mt-0.5 shrink-0">{statusIcon()}</div>
+        <div className="flex-1 min-w-0">
+          <div className="flex items-center gap-2 mb-0.5 flex-wrap">
+            <span className="text-[10px] font-mono text-gray-600">{result.id}</span>
+            <span className={`text-[9px] font-semibold px-1.5 py-0.5 rounded-full border ${difficultyColor}`}>
+              {result.difficulty}
+            </span>
+            {result.score !== null && (
+              <span className={`text-[10px] font-mono font-bold ${result.status === 'pass' ? 'text-green-400' : 'text-red-400'}`}>
+                {result.score.toFixed(2)}
+              </span>
+            )}
+            {result.attempts !== null && (
+              <span className="text-[9px] text-gray-600 font-mono">
+                {result.attempts} attempt{result.attempts !== 1 ? 's' : ''}
+              </span>
+            )}
+          </div>
+          <div className="text-xs text-gray-300 leading-relaxed line-clamp-2">
+            {result.question}
+          </div>
+          {result.reason && result.status !== 'pending' && (
+            <div className={`text-[10px] mt-1 ${result.status === 'pass' ? 'text-green-500/70' : 'text-red-400/70'}`}>
+              {result.reason.length > 120 ? result.reason.slice(0, 120) + '…' : result.reason}
+            </div>
+          )}
+        </div>
+        <div className="flex items-center gap-1.5 shrink-0">
+          {result.status === 'pending' && dbSeeded && !isRunning && (
+            <button
+              onClick={(e) => { e.stopPropagation(); onRunSingle() }}
+              className="p-1 rounded-lg hover:bg-white/10 transition-colors"
+              title="Run this query"
+            >
+              <Play size={10} className="text-gray-500 hover:text-violet-400" />
+            </button>
+          )}
+          <ChevronDown
+            size={11}
+            className={`text-gray-600 transition-transform duration-150 ${isExpanded ? 'rotate-180' : ''}`}
+          />
+        </div>
+      </div>
+      {/* Expanded detail */}
+      <AnimatePresence>
+        {isExpanded && (
+          <motion.div
+            initial={{ height: 0, opacity: 0 }}
+            animate={{ height: 'auto', opacity: 1 }}
+            exit={{ height: 0, opacity: 0 }}
+            transition={{ duration: 0.15 }}
+            className="overflow-hidden"
+          >
+            <div className="px-3 pb-3 flex flex-col gap-2 border-t border-white/5 pt-2">
+              <p className="text-xs text-gray-400 leading-relaxed">{result.question}</p>
+              {result.sql && (
+                <div>
+                  <div className="text-[10px] text-gray-600 mb-1 font-semibold uppercase tracking-wider">
+                    Generated SQL
+                  </div>
+                  <pre className="text-[10px] font-mono text-violet-200/70 bg-black/40 rounded-lg p-2.5 border border-white/5 whitespace-pre-wrap leading-relaxed max-h-40 overflow-y-auto">
+                    {result.sql}
+                  </pre>
+                </div>
+              )}
+              {(result.refRowCount !== null || result.reason) && (
+                <div className="flex flex-col gap-1.5">
+                  {result.refRowCount !== null && (
+                    <div className="flex items-center gap-3 text-[10px] font-mono">
+                      <span className="text-gray-600">reference:</span>
+                      <span className="text-blue-400">{result.refRowCount} rows</span>
+                      <span className="text-gray-600">agent:</span>
+                      <span className={
+                        result.agentRowCount === result.refRowCount
+                          ? 'text-green-400'
+                          : result.agentRowCount === 0
+                          ? 'text-red-400'
+                          : 'text-amber-400'
+                      }>
+                        {result.agentRowCount ?? 0} rows
+                      </span>
+                    </div>
+                  )}
+                  {result.reason && (
+                    <div className={`text-[10px] leading-relaxed ${result.status === 'pass' ? 'text-green-400/80' : 'text-red-400/80'}`}>
+                      {result.reason}
+                    </div>
+                  )}
+                </div>
+              )}
+              {result.status !== 'pending' && result.status !== 'running' && !isRunning && dbSeeded && (
+                <button
+                  onClick={(e) => { e.stopPropagation(); onRunSingle() }}
+                  className="flex items-center gap-1 text-[10px] text-violet-400 hover:text-violet-300 transition-colors self-start mt-1"
+                >
+                  <RotateCcw size={9} />
+                  Re-run
+                </button>
+              )}
+            </div>
+          </motion.div>
+        )}
+      </AnimatePresence>
+    </div>
+  )
+}
+/**
+ * Benchmark tab: header with pass count, "Run All" / "Reset" controls,
+ * overall score, score and progress bars, difficulty selector, and the list
+ * of QueryRow entries.
+ *
+ * `runBenchmark(queryIds?)` streams events from `streamBenchmark` and writes
+ * each one into the store. When `queryIds` is omitted every row currently in
+ * the store is targeted.
+ */
+export function BenchmarkPanel() {
+  const {
+    benchmarkResults, isBenchmarking, overallScore,
+    activeBenchmarkId, dbSeeded,
+    setIsBenchmarking, updateBenchmarkResult, setOverallScore,
+    setActiveBenchmarkId, resetBenchmark,
+    taskDifficulty, setTaskDifficulty,
+  } = useStore()
+  const [expandedIds, setExpandedIds] = useState<Set<string>>(new Set())
+  const toggleExpand = (id: string) => {
+    setExpandedIds((prev) => {
+      const next = new Set(prev)
+      if (next.has(id)) next.delete(id)
+      else next.add(id)
+      return next
+    })
+  }
+  const runBenchmark = useCallback(
+    async (queryIds?: string[]) => {
+      if (isBenchmarking) return
+      setIsBenchmarking(true)
+      // Always look rows up in the live store. The `benchmarkResults` array
+      // captured by this closure is frozen at render time, so spreading it
+      // while events stream in would bring back fields (score / reason / sql)
+      // that were cleared a moment earlier.
+      const findResult = (id: unknown) =>
+        useStore.getState().benchmarkResults.find((r) => r.id === id)
+      const targetIds = queryIds ?? useStore.getState().benchmarkResults.map((r) => r.id)
+      for (const id of targetIds) {
+        const existing = findResult(id)
+        if (existing) {
+          updateBenchmarkResult({ ...existing, status: 'running', score: null, reason: null, sql: null })
+        }
+      }
+      try {
+        for await (const event of streamBenchmark(taskDifficulty, queryIds)) {
+          if (event.type === 'query_start') {
+            setActiveBenchmarkId(event.id as string)
+            const existing = findResult(event.id)
+            if (existing) updateBenchmarkResult({ ...existing, status: 'running' })
+          } else if (event.type === 'query_result') {
+            const existing = findResult(event.id)
+            if (existing) {
+              updateBenchmarkResult({
+                ...existing,
+                status: (event.pass as boolean) ? 'pass' : 'fail',
+                score: event.score as number,
+                reason: event.reason as string,
+                sql: event.sql as string,
+                attempts: (event.attempts as number) ?? null,
+                refRowCount: (event.refRowCount as number) ?? null,
+                agentRowCount: (event.agentRowCount as number) ?? null,
+              })
+            }
+          } else if (event.type === 'done') {
+            setOverallScore(event.overallScore as number)
+            setActiveBenchmarkId(null)
+            setIsBenchmarking(false)
+          } else if (event.type === 'error') {
+            setActiveBenchmarkId(null)
+            setIsBenchmarking(false)
+          }
+        }
+      } catch {
+        // Stream failures are not surfaced to the user; the finally block
+        // below restores the idle UI state.
+      } finally {
+        // Runs on every exit path, including a stream that simply ends without
+        // a `done` / `error` event, so the panel can never stay locked.
+        setIsBenchmarking(false)
+        setActiveBenchmarkId(null)
+        // Rows that never received a result would otherwise spin forever and
+        // offer neither the play nor the re-run button.
+        for (const id of targetIds) {
+          const row = findResult(id)
+          if (row && row.status === 'running') {
+            updateBenchmarkResult({ ...row, status: 'pending' })
+          }
+        }
+      }
+    },
+    [isBenchmarking, setIsBenchmarking, updateBenchmarkResult,
+      setOverallScore, setActiveBenchmarkId, taskDifficulty]
+  )
+  const passedCount = benchmarkResults.filter((r) => r.status === 'pass').length
+  const completedCount = benchmarkResults.filter((r) => r.status === 'pass' || r.status === 'fail').length
+  const totalScore = benchmarkResults.reduce((s, r) => s + (r.score ?? 0), 0)
+  const progressPct = benchmarkResults.length > 0 ? Math.round((completedCount / benchmarkResults.length) * 100) : 0
+  const scorePct = completedCount > 0 ? Math.round((totalScore / benchmarkResults.length) * 100) : 0
+  return (
+    <div className="flex flex-col h-full">
+      {/* Header */}
+      <div className="px-4 py-3 border-b border-white/[0.06] shrink-0">
+        <div className="flex items-center justify-between mb-2">
+          <div className="flex items-center gap-2">
+            <Target size={14} className="text-violet-400" />
+            <span className="text-xs font-semibold text-white">Benchmark</span>
+            {completedCount > 0 && (
+              <span className="text-xs text-gray-500 font-mono">
+                {passedCount}/{benchmarkResults.length} passed
+              </span>
+            )}
+          </div>
+          <div className="flex items-center gap-2">
+            {completedCount > 0 && (
+              <button
+                onClick={resetBenchmark}
+                disabled={isBenchmarking}
+                className="flex items-center gap-1 px-2 py-1 rounded-lg text-[10px] text-gray-500 hover:text-gray-300 hover:bg-white/5 transition-all disabled:opacity-40"
+              >
+                <RotateCcw size={10} />
+                Reset
+              </button>
+            )}
+            <button
+              onClick={() => void runBenchmark()}
+              disabled={isBenchmarking || !dbSeeded}
+              className="flex items-center gap-1.5 px-3 py-1.5 rounded-lg bg-violet-600 hover:bg-violet-500 disabled:opacity-40 disabled:cursor-not-allowed transition-all text-white text-xs font-semibold"
+            >
+              {isBenchmarking ? (
+                <Loader2 size={11} className="animate-spin" />
+              ) : (
+                <Play size={11} />
+              )}
+              Run All
+            </button>
+          </div>
+        </div>
+        {/* Overall score */}
+        {overallScore !== null && (
+          <motion.div
+            initial={{ opacity: 0, scale: 0.95 }}
+            animate={{ opacity: 1, scale: 1 }}
+            className="mb-2 flex items-center gap-3 px-3 py-2 rounded-xl border border-violet-500/20 bg-violet-500/5"
+          >
+            <Zap size={14} className="text-violet-400 shrink-0" />
+            <div>
+              <div className="text-[10px] text-gray-500 uppercase tracking-wider">Overall Score</div>
+              <div className="text-xl font-bold font-mono text-violet-300">
+                {(overallScore * 100).toFixed(0)}%
+              </div>
+            </div>
+          </motion.div>
+        )}
+        {/* Score bar */}
+        {completedCount > 0 && (
+          <div className="flex flex-col gap-1">
+            <div className="flex items-center justify-between text-[10px]">
+              <span className="text-gray-500">
+                Score: {totalScore.toFixed(1)}/{benchmarkResults.length}
+              </span>
+              <span className="text-violet-400 font-mono">{scorePct}%</span>
+            </div>
+            <div className="h-1.5 bg-white/5 rounded-full overflow-hidden">
+              <motion.div
+                className="h-full rounded-full bg-gradient-to-r from-violet-600 to-violet-400"
+                initial={{ width: 0 }}
+                animate={{ width: `${scorePct}%` }}
+                transition={{ duration: 0.5, ease: 'easeOut' }}
+              />
+            </div>
+          </div>
+        )}
+        {/* Progress */}
+        {isBenchmarking && (
+          <div className="mt-1.5">
+            <div className="h-1 bg-white/5 rounded-full overflow-hidden">
+              <motion.div
+                className="h-full rounded-full bg-violet-500/60"
+                initial={{ width: 0 }}
+                animate={{ width: `${progressPct}%` }}
+                transition={{ duration: 0.3 }}
+              />
+            </div>
+          </div>
+        )}
+      </div>
+      {/* Difficulty tabs */}
+      <div className="flex items-center gap-1 px-4 py-2 border-b border-white/[0.06] shrink-0">
+        {DIFFICULTY_TABS.map((tab) => (
+          <button
+            key={tab.id}
+            onClick={() => setTaskDifficulty(tab.id)}
+            className={`px-3 py-1 rounded-lg text-xs font-medium transition-all ${
+              taskDifficulty === tab.id
+                ? 'bg-violet-600/20 text-violet-300 border border-violet-500/30'
+                : 'text-gray-500 hover:text-gray-300 hover:bg-white/5 border border-transparent'
+            }`}
+          >
+            {tab.label}
+          </button>
+        ))}
+      </div>
+      {/* Query list */}
+      <div className="flex-1 overflow-y-auto">
+        <div className="p-2 flex flex-col gap-1">
+          {benchmarkResults.map((result) => (
+            <QueryRow
+              key={result.id}
+              result={result}
+              isActive={activeBenchmarkId === result.id}
+              isExpanded={expandedIds.has(result.id)}
+              onToggleExpand={() => toggleExpand(result.id)}
+              onRunSingle={() => void runBenchmark([result.id])}
+              isRunning={isBenchmarking}
+              dbSeeded={dbSeeded}
+            />
+          ))}
+        </div>
+      </div>
+      {!dbSeeded && (
+        <div className="px-4 py-2 border-t border-white/[0.06] text-[10px] text-gray-600 text-center shrink-0">
+          Waiting for database initialization...
+        </div>
+      )}
+    </div>
+  )
+}

frontend/src/components/ChatPanel.tsx ADDED Viewed

	@@ -0,0 +1,599 @@

+import { useState, useRef, useEffect, useCallback } from 'react'
+import { motion, AnimatePresence } from 'framer-motion'
+import {
+  Send, CheckCircle2, XCircle, ChevronDown, ChevronUp,
+  Loader2, MessageSquare, Zap, RefreshCw, Trash2,
+} from 'lucide-react'
+import { useStore } from '../store/useStore'
+import { streamExecuteQuery, submitFeedback } from '../lib/api'
+import { ResultsTable } from './ResultsTable'
+import type { ChatMessage, AttemptStep } from '../lib/types'
+// ─── SQL Syntax Highlighter ───────────────────────────────────────
+// Case-insensitive, whole-word pattern for the SQL tokens SqlBlock highlights.
+// SqlBlock builds its own RegExp from `.source`, so this object is not iterated directly there.
+const SQL_KEYWORDS = /\b(SELECT|FROM|WHERE|JOIN|LEFT|RIGHT|INNER|OUTER|FULL|ON|GROUP\s+BY|ORDER\s+BY|HAVING|LIMIT|OFFSET|UNION|ALL|DISTINCT|AS|AND|OR|NOT|IN|IS|NULL|LIKE|BETWEEN|CASE|WHEN|THEN|ELSE|END|WITH|CTE|INSERT|UPDATE|DELETE|CREATE|DROP|ALTER|TABLE|INDEX|VIEW|SET|VALUES|INTO|EXISTS|COUNT|SUM|AVG|MIN|MAX|COALESCE|NULLIF|CAST|OVER|PARTITION\s+BY|ROW_NUMBER|RANK|DENSE_RANK|LAG|LEAD|DATE|STRFTIME|JULIANDAY|ROUND|ABS|LENGTH|SUBSTR|UPPER|LOWER|TRIM|REPLACE|IFNULL)\b/gi
+/**
+ * Renders `sql` inside a <pre>, wrapping every SQL_KEYWORDS match in a
+ * `sql-keyword` span and leaving the text in between as plain spans.
+ * When `streaming` is truthy a blinking cursor span is appended.
+ */
+function SqlBlock({ sql, streaming }: { sql: string; streaming?: boolean }) {
+  // A fresh RegExp is built from the shared pattern's source, so the
+  // iteration state (lastIndex) is local to this call.
+  const keywordRe = new RegExp(SQL_KEYWORDS.source, 'gi')
+  const nodes: React.ReactNode[] = []
+  let cursor = 0
+  for (let hit = keywordRe.exec(sql); hit !== null; hit = keywordRe.exec(sql)) {
+    const start = hit.index
+    const token = hit[0]
+    // Plain text between the previous keyword and this one.
+    if (start > cursor) {
+      nodes.push(<span key={`t-${cursor}`}>{sql.slice(cursor, start)}</span>)
+    }
+    nodes.push(
+      <span key={`k-${start}`} className="sql-keyword">
+        {token}
+      </span>
+    )
+    cursor = start + token.length
+  }
+  // Trailing text after the last keyword.
+  if (cursor < sql.length) {
+    nodes.push(<span key="t-end">{sql.slice(cursor)}</span>)
+  }
+  return (
+    <pre
+      className="px-3 py-2.5 text-xs font-mono bg-violet-950/20 whitespace-pre-wrap overflow-x-auto leading-relaxed border-t border-white/[0.04]"
+      style={{ color: 'rgba(221, 214, 254, 0.8)' }}
+    >
+      {nodes}
+      {streaming && <span className="cursor-blink" />}
+    </pre>
+  )
+}
+// ─── Attempt badge ────────────────────────────────────────────────
+/**
+ * Pill reading "Attempt {attempt}/{total}". Attempts 1, 2 and 3 get gray,
+ * amber and orange styling; every other value gets red.
+ */
+function AttemptBadge({ attempt, total }: { attempt: number; total: number }) {
+  let colors: string
+  switch (attempt) {
+    case 1:
+      colors = 'text-gray-400 bg-white/5 border-white/10'
+      break
+    case 2:
+      colors = 'text-amber-400 bg-amber-500/10 border-amber-500/20'
+      break
+    case 3:
+      colors = 'text-orange-400 bg-orange-500/10 border-orange-500/20'
+      break
+    default:
+      colors = 'text-red-400 bg-red-500/10 border-red-500/20'
+  }
+  return (
+    <span className={`text-[10px] font-semibold px-2 py-0.5 rounded-full border ${colors}`}>
+      Attempt {attempt}/{total}
+    </span>
+  )
+}
+// ─── RL Action badge ──────────────────────────────────────────────
+/**
+ * Orange pill showing a Zap icon and the action label, followed by the score
+ * to two decimals when a score is supplied.
+ */
+function RLActionBadge({ action, score }: { action: string; score?: number }) {
+  const scoreLabel =
+    score === undefined ? null : (
+      <span className="text-orange-400/60 ml-0.5">{score.toFixed(2)}</span>
+    )
+  return (
+    <span className="inline-flex items-center gap-1 text-[10px] font-semibold px-2 py-0.5 rounded-full border border-orange-500/30 bg-orange-500/10 text-orange-400">
+      <Zap size={9} />
+      {action}
+      {scoreLabel}
+    </span>
+  )
+}
+// ─── Reward display ──────────────────────────────────────────────
+/**
+ * Animated reward figure with two decimals: green with a leading "+" when
+ * `reward >= 0`, red with no added sign otherwise.
+ */
+function RewardBadge({ reward }: { reward: number }) {
+  const isGain = reward >= 0
+  const sign = isGain ? '+' : ''
+  const tone = isGain ? 'text-green-400' : 'text-red-400'
+  return (
+    <motion.span
+      initial={{ scale: 0.8, opacity: 0 }}
+      animate={{ scale: 1, opacity: 1 }}
+      transition={{ type: 'spring', stiffness: 300 }}
+      className={`inline-flex items-center gap-0.5 text-[11px] font-bold tabular-nums reward-pulse ${tone}`}
+    >
+      {sign}{reward.toFixed(2)}
+    </motion.span>
+  )
+}
+// ─── Attempt steps collapsible ────────────────────────────────────
+/**
+ * Collapsible list of every attempt made for one answer. Renders nothing when
+ * there is at most one step. Each entry shows the attempt badge, the optional
+ * action badge and reward, the optional error text, and the attempt's SQL.
+ */
+function AttemptSteps({ steps }: { steps: AttemptStep[] }) {
+  const [expanded, setExpanded] = useState(false)
+  if (steps.length <= 1) return null
+  const toggle = () => setExpanded((prev) => !prev)
+  const chevron = expanded ? <ChevronUp size={11} /> : <ChevronDown size={11} />
+  return (
+    <div className="border border-white/[0.05] rounded-xl overflow-hidden">
+      <button
+        onClick={toggle}
+        className="w-full flex items-center justify-between px-3 py-2 bg-white/[0.02] hover:bg-white/[0.04] transition-colors text-[10px] text-gray-500"
+      >
+        <span>{steps.length} attempts to solve</span>
+        {chevron}
+      </button>
+      <AnimatePresence>
+        {expanded && (
+          <motion.div
+            initial={{ height: 0, opacity: 0 }}
+            animate={{ height: 'auto', opacity: 1 }}
+            exit={{ height: 0, opacity: 0 }}
+            transition={{ duration: 0.15 }}
+            className="overflow-hidden"
+          >
+            <div className="flex flex-col divide-y divide-white/[0.04]">
+              {steps.map((entry) => (
+                <div key={entry.attempt} className="px-3 py-2">
+                  <div className="flex items-center gap-2 mb-1.5">
+                    <AttemptBadge attempt={entry.attempt} total={steps.length} />
+                    {entry.action && (
+                      <RLActionBadge action={entry.action} score={entry.actionScore} />
+                    )}
+                    {entry.reward !== undefined && <RewardBadge reward={entry.reward} />}
+                  </div>
+                  {entry.error && (
+                    <div className="text-[10px] text-red-400/70 mb-1 bg-red-500/5 rounded px-2 py-1 border border-red-500/15">
+                      {entry.error}
+                    </div>
+                  )}
+                  <SqlBlock sql={entry.sql} />
+                </div>
+              ))}
+            </div>
+          </motion.div>
+        )}
+      </AnimatePresence>
+    </div>
+  )
+}
+// ─── Suggested query chips ────────────────────────────────────────
+/** Example questions offered as one-click chips, keyed by task difficulty. */
+const SUGGESTED: Record<string, string[]> = {
+  easy: [
+    'Show all products',
+    'List users from USA',
+    'What categories exist?',
+  ],
+  medium: [
+    'Top 5 sellers by revenue',
+    'Average order value by country',
+    'Products with low stock',
+  ],
+  hard: [
+    'Rolling 7-day revenue',
+    'Seller ranking with rank change',
+    'Cohort retention analysis',
+  ],
+}
+/**
+ * Placeholder shown while the chat has no messages: a short explanation plus
+ * the suggested questions for the current difficulty. Clicking a suggestion
+ * calls `onSelect` with its text.
+ */
+function EmptyState({ onSelect }: { onSelect: (q: string) => void }) {
+  const difficulty = useStore().taskDifficulty
+  // Unknown difficulty keys fall back to the "easy" suggestions.
+  const chips = SUGGESTED[difficulty] ?? SUGGESTED.easy
+  const iconTileStyle = { background: '#1e3a5f', boxShadow: '0 8px 24px rgba(30,58,95,0.4)' }
+  const chipButtons = chips.map((text) => (
+    <button
+      key={text}
+      onClick={() => onSelect(text)}
+      className="flex items-center gap-2 px-3 py-2.5 rounded-xl border border-white/[0.06] bg-white/[0.02] hover:bg-white/[0.05] hover:border-violet-500/30 transition-all text-left group"
+    >
+      <span className="text-violet-500 shrink-0 group-hover:text-violet-400">›</span>
+      <span className="text-xs text-gray-300">{text}</span>
+    </button>
+  ))
+  return (
+    <div className="flex flex-col items-center justify-center h-full gap-6 px-8 text-center">
+      <div>
+        <div
+          className="w-12 h-12 rounded-2xl flex items-center justify-center mx-auto mb-4"
+          style={iconTileStyle}
+        >
+          <MessageSquare size={22} className="text-white" />
+        </div>
+        <h2 className="text-base font-semibold text-white mb-1">Ask about your data</h2>
+        <p className="text-xs text-gray-500 max-w-xs">
+          Type a question in natural language. The agent will generate SQL, execute it,
+          and self-repair on errors using reinforcement learning.
+        </p>
+      </div>
+      <div className="flex flex-col gap-2 w-full max-w-sm">
+        <div className="text-[10px] text-gray-600 uppercase tracking-wider mb-0.5">
+          Try these queries
+        </div>
+        {chipButtons}
+      </div>
+    </div>
+  )
+}
+// ─── Message Card ─────────────────────────────────────────────────
+/**
+ * Renders one question/answer exchange: the user's question bubble followed
+ * by the agent's progress indicators, attempt history, final SQL, RL badges,
+ * result table or error box, and a footer with feedback and Retry controls.
+ *
+ * @param msg        The chat message to display.
+ * @param onFeedback Called with the message id and `true`/`false` when the
+ *                   user marks the result as correct or wrong.
+ * @param onRetry    Called with the original question when Retry is clicked.
+ */
+function MessageCard({
+  msg,
+  onFeedback,
+  onRetry,
+}: {
+  msg: ChatMessage
+  onFeedback: (id: string, correct: boolean) => Promise<void>
+  onRetry: (q: string) => void
+}) {
+  const [sqlOpen, setSqlOpen] = useState(true)
+  // A successful run: status is 'done' and at least one attempt was recorded.
+  const succeeded = msg.status === 'done' && msg.attempts > 0
+  // Retry is offered for every finished message, including failed ones.
+  // (Previously the button was nested inside the success-only footer, so it
+  // could never appear for messages with status 'error'.)
+  const canRetry = msg.status === 'done' || msg.status === 'error'
+  return (
+    <div className="flex flex-col gap-2.5">
+      {/* User question bubble */}
+      <div className="flex justify-end">
+        <div className="max-w-[80%] bg-violet-600/20 border border-violet-500/25 rounded-2xl rounded-tr-sm px-4 py-2.5">
+          <p className="text-sm text-white leading-relaxed">{msg.question}</p>
+        </div>
+      </div>
+      {/* Agent response */}
+      <div className="flex flex-col gap-2">
+        {/* Streaming thinking */}
+        {msg.status === 'streaming' && !msg.sql && (
+          <div className="flex items-center gap-2 text-xs text-gray-500 px-1">
+            <Loader2 size={11} className="animate-spin text-violet-400" />
+            Generating SQL...
+          </div>
+        )}
+        {/* Multiple attempts */}
+        <AttemptSteps steps={msg.steps} />
+        {/* Final SQL block */}
+        {msg.sql && (
+          <div className="border border-white/[0.06] rounded-xl overflow-hidden">
+            <button
+              onClick={() => setSqlOpen((v) => !v)}
+              className="w-full flex items-center justify-between px-3 py-2 bg-white/[0.02] hover:bg-white/[0.04] transition-colors"
+            >
+              <div className="flex items-center gap-2">
+                <span className="text-[10px] font-semibold text-gray-500 uppercase tracking-wider">
+                  SQL
+                </span>
+                {msg.status === 'streaming' && (
+                  <Loader2 size={10} className="animate-spin text-violet-400" />
+                )}
+                {msg.attempts > 1 && (
+                  <AttemptBadge attempt={msg.attempts} total={msg.attempts} />
+                )}
+              </div>
+              {sqlOpen ? (
+                <ChevronUp size={11} className="text-gray-600" />
+              ) : (
+                <ChevronDown size={11} className="text-gray-600" />
+              )}
+            </button>
+            {sqlOpen && (
+              <SqlBlock sql={msg.sql} streaming={msg.status === 'streaming'} />
+            )}
+          </div>
+        )}
+        {/* Executing indicator */}
+        {msg.status === 'streaming' && msg.sql && msg.rows.length === 0 && !msg.errorMsg && (
+          <div className="flex items-center gap-2 text-xs text-gray-500 px-1">
+            <Loader2 size={11} className="animate-spin text-violet-400" />
+            Executing...
+          </div>
+        )}
+        {/* RL badges row */}
+        {(msg.rlAction || msg.reward !== undefined) && (
+          <div className="flex items-center gap-2 flex-wrap">
+            {msg.rlAction && (
+              <RLActionBadge action={msg.rlAction} score={msg.rlActionScore} />
+            )}
+            {msg.reward !== undefined && <RewardBadge reward={msg.reward} />}
+          </div>
+        )}
+        {/* Result table */}
+        {succeeded && (
+          <div className="flex flex-col gap-1.5">
+            <div className="flex items-center gap-2 text-[10px] px-0.5">
+              <CheckCircle2 size={11} className="text-green-400" />
+              <span className="text-green-400 font-semibold">Success</span>
+              <span className="text-gray-600">
+                · {msg.rowCount} row{msg.rowCount !== 1 ? 's' : ''}
+              </span>
+              {msg.attempts > 1 && (
+                <span className="text-amber-400/60">{msg.attempts} attempts</span>
+              )}
+            </div>
+            <ResultsTable rows={msg.rows} rowCount={msg.rowCount} />
+          </div>
+        )}
+        {/* Error */}
+        {msg.status === 'error' && (
+          <div className="flex items-start gap-2 bg-red-500/10 border border-red-500/20 rounded-xl px-3 py-2.5 text-xs text-red-300">
+            <XCircle size={12} className="shrink-0 mt-0.5" />
+            <div>
+              <p className="font-semibold mb-0.5">Query failed</p>
+              <p className="opacity-80">{msg.errorMsg ?? 'Agent exhausted all repair attempts'}</p>
+            </div>
+          </div>
+        )}
+        {/* Footer: feedback controls (successful runs only) and Retry */}
+        {canRetry && (
+          <div className="flex items-center gap-2">
+            {succeeded &&
+              (msg.feedback ? (
+                <div
+                  className={`text-xs flex items-center gap-1.5 ${
+                    msg.feedback === 'correct' ? 'text-green-400' : 'text-red-400'
+                  }`}
+                >
+                  {msg.feedback === 'correct' ? (
+                    <CheckCircle2 size={12} />
+                  ) : (
+                    <XCircle size={12} />
+                  )}
+                  Marked as {msg.feedback}
+                </div>
+              ) : (
+                <>
+                  <span className="text-[10px] text-gray-600 mr-0.5">Was this correct?</span>
+                  <button
+                    disabled={msg.feedbackSending}
+                    onClick={() => void onFeedback(msg.id, true)}
+                    className="flex items-center gap-1 px-2 py-1 text-[10px] font-medium rounded-lg border border-green-500/25 bg-green-500/8 text-green-400 hover:bg-green-500/15 transition-all disabled:opacity-40"
+                  >
+                    <CheckCircle2 size={10} />
+                    Correct
+                  </button>
+                  <button
+                    disabled={msg.feedbackSending}
+                    onClick={() => void onFeedback(msg.id, false)}
+                    className="flex items-center gap-1 px-2 py-1 text-[10px] font-medium rounded-lg border border-red-500/25 bg-red-500/8 text-red-400 hover:bg-red-500/15 transition-all disabled:opacity-40"
+                  >
+                    <XCircle size={10} />
+                    Wrong
+                  </button>
+                </>
+              ))}
+            <button
+              onClick={() => onRetry(msg.question)}
+              className="ml-auto flex items-center gap-1 text-[10px] text-gray-600 hover:text-gray-400 transition-colors"
+            >
+              <RefreshCw size={10} />
+              Retry
+            </button>
+          </div>
+        )}
+      </div>
+    </div>
+  )
+}
+// ─── Chat Panel ───────────────────────────────────────────────────
+/**
+ * Main chat surface: message list (or empty state), the "optimizing" banner,
+ * suggestion chips and the question input.
+ *
+ * Submitting a question appends a placeholder message in the 'streaming'
+ * state and then applies each event yielded by `streamExecuteQuery` to that
+ * message until the stream ends.
+ */
+export function ChatPanel() {
+  const {
+    messages, addMessage, updateMessage, clearMessages,
+    isExecuting, setIsExecuting,
+    taskId, taskDifficulty,
+    optimizingBanner, setOptimizingBanner,
+    promptGeneration,
+  } = useStore()
+  const [input, setInput] = useState('')
+  const bottomRef = useRef<HTMLDivElement>(null)
+  const inputRef = useRef<HTMLTextAreaElement>(null)
+  // Scroll the end-of-list sentinel into view whenever the number of messages changes.
+  useEffect(() => {
+    bottomRef.current?.scrollIntoView({ behavior: 'smooth' })
+  }, [messages.length])
+  // Submit a correct/wrong verdict for one message. `feedbackSending` is set
+  // while the request is in flight; on failure it is cleared and the verdict
+  // is left unset.
+  const handleFeedback = useCallback(
+    async (id: string, correct: boolean) => {
+      const msg = messages.find((m) => m.id === id)
+      if (!msg) return
+      updateMessage(id, { feedbackSending: true })
+      try {
+        await submitFeedback(msg.question, msg.sql, correct)
+        updateMessage(id, { feedback: correct ? 'correct' : 'wrong', feedbackSending: false })
+      } catch {
+        updateMessage(id, { feedbackSending: false })
+      }
+    },
+    [messages, updateMessage]
+  )
+  // Run one question end to end. Ignores blank input and calls made while
+  // another execution is in progress.
+  const execute = useCallback(
+    async (question: string) => {
+      if (!question.trim() || isExecuting) return
+      setIsExecuting(true)
+      // NOTE(review): ids are derived from the millisecond clock; presumably
+      // unique because executions are serialized by `isExecuting` — confirm.
+      const msgId = `msg-${Date.now()}`
+      const newMsg: ChatMessage = {
+        id: msgId,
+        question,
+        status: 'streaming',
+        sql: '',
+        rows: [],
+        rowCount: 0,
+        attempts: 0,
+        steps: [],
+        feedback: null,
+        promptGeneration,
+      }
+      addMessage(newMsg)
+      try {
+        // Dispatch on the event's `type`; unrecognized types are ignored.
+        for await (const event of streamExecuteQuery(question, taskId)) {
+          if (event.type === 'sql') {
+            updateMessage(msgId, { sql: event.sql as string })
+          } else if (event.type === 'sql_chunk') {
+            // incremental SQL streaming — read current sql from store
+            const curSql = useStore.getState().messages.find((m) => m.id === msgId)?.sql ?? ''
+            updateMessage(msgId, { sql: curSql + (event.chunk as string) })
+          } else if (event.type === 'attempt') {
+            const step: AttemptStep = {
+              attempt: event.attempt as number,
+              sql: event.sql as string,
+              error: event.error as string | undefined,
+              action: event.action as string | undefined,
+              actionScore: event.action_score as number | undefined,
+              reward: event.reward as number | undefined,
+            }
+            // Read the latest steps from the store rather than the closed-over
+            // `messages`, then append this attempt and mirror its SQL/action
+            // onto the message itself.
+            const curSteps = useStore.getState().messages.find((m) => m.id === msgId)?.steps ?? []
+            updateMessage(msgId, {
+              attempts: event.attempt as number,
+              steps: [...curSteps, step],
+              sql: event.sql as string,
+              rlAction: event.action as string | undefined,
+              rlActionScore: event.action_score as number | undefined,
+            })
+          } else if (event.type === 'result') {
+            updateMessage(msgId, {
+              rows: (event.rows as Record<string, unknown>[]) ?? [],
+              rowCount: (event.row_count as number) ?? 0,
+              reward: event.reward as number | undefined,
+            })
+          } else if (event.type === 'done') {
+            updateMessage(msgId, {
+              status: 'done',
+              attempts: (event.attempts as number) ?? 1,
+              reward: event.reward as number | undefined,
+            })
+          } else if (event.type === 'error') {
+            updateMessage(msgId, {
+              status: 'error',
+              errorMsg: event.message as string,
+            })
+          } else if (event.type === 'gepa_start') {
+            setOptimizingBanner(true)
+          } else if (event.type === 'gepa_done') {
+            setOptimizingBanner(false)
+          }
+        }
+      } catch (err) {
+        // A throw from the stream marks the message as failed.
+        updateMessage(msgId, {
+          status: 'error',
+          errorMsg: err instanceof Error ? err.message : 'Network error',
+        })
+      } finally {
+        setIsExecuting(false)
+        // If still streaming after generator ends, mark done
+        // (or 'error' when no SQL was ever received).
+        const finalMsg = useStore.getState().messages.find((m) => m.id === msgId)
+        if (finalMsg?.status === 'streaming') {
+          updateMessage(msgId, { status: finalMsg.sql ? 'done' : 'error' })
+        }
+      }
+    },
+    [isExecuting, setIsExecuting, addMessage, updateMessage, taskId, promptGeneration, setOptimizingBanner]
+  )
+  // Send the trimmed input and clear the textarea.
+  const handleSend = () => {
+    if (!input.trim()) return
+    const q = input.trim()
+    setInput('')
+    void execute(q)
+  }
+  // Enter sends; Shift+Enter falls through to the textarea's default newline.
+  const handleKeyDown = (e: React.KeyboardEvent<HTMLTextAreaElement>) => {
+    if (e.key === 'Enter' && !e.shiftKey) {
+      e.preventDefault()
+      handleSend()
+    }
+  }
+  // Unknown difficulty keys fall back to the "easy" suggestions.
+  const suggestions = SUGGESTED[taskDifficulty] ?? SUGGESTED.easy
+  return (
+    <div className="flex flex-col h-full">
+      {/* Optimizing banner */}
+      <AnimatePresence>
+        {optimizingBanner && (
+          <motion.div
+            initial={{ height: 0, opacity: 0 }}
+            animate={{ height: 'auto', opacity: 1 }}
+            exit={{ height: 0, opacity: 0 }}
+            className="shrink-0 overflow-hidden"
+          >
+            <div className="shimmer-banner border-b border-violet-500/20 px-4 py-2 flex items-center gap-2">
+              <Loader2 size={12} className="animate-spin text-violet-400" />
+              <span className="text-xs text-violet-300 font-semibold">
+                Optimizing system prompt via GEPA...
+              </span>
+            </div>
+          </motion.div>
+        )}
+      </AnimatePresence>
+      {/* Messages */}
+      <div className="flex-1 overflow-y-auto px-4 py-4">
+        {messages.length === 0 ? (
+          <EmptyState onSelect={(q) => { setInput(q); inputRef.current?.focus() }} />
+        ) : (
+          <div className="flex flex-col gap-6 max-w-3xl mx-auto">
+            {messages.map((msg) => (
+              <MessageCard
+                key={msg.id}
+                msg={msg}
+                onFeedback={handleFeedback}
+                onRetry={(q) => { setInput(q); inputRef.current?.focus() }}
+              />
+            ))}
+            <div ref={bottomRef} />
+          </div>
+        )}
+      </div>
+      {/* Input area */}
+      <div
+        className="shrink-0 border-t border-white/[0.06] px-4 py-3"
+        style={{ background: 'var(--bg-secondary)' }}
+      >
+        {/* Suggested chips */}
+        {messages.length > 0 && (
+          <div className="flex gap-1.5 flex-wrap mb-2.5">
+            {suggestions.slice(0, 3).map((q) => (
+              <button
+                key={q}
+                onClick={() => { setInput(q); inputRef.current?.focus() }}
+                className="text-[10px] px-2.5 py-1 rounded-full border border-white/[0.06] text-gray-500 hover:text-gray-300 hover:border-violet-500/30 transition-all"
+              >
+                {q}
+              </button>
+            ))}
+          </div>
+        )}
+        <div className="flex items-end gap-2">
+          <div className="flex-1 relative">
+            <textarea
+              ref={inputRef}
+              value={input}
+              onChange={(e) => setInput(e.target.value)}
+              onKeyDown={handleKeyDown}
+              placeholder="Ask about products, orders, sellers..."
+              disabled={isExecuting}
+              rows={1}
+              className="w-full px-3 py-2.5 pr-10 text-sm text-white rounded-xl border border-white/[0.06] bg-white/[0.03] placeholder-gray-600 resize-none focus:outline-none focus:border-violet-500/40 focus:bg-white/[0.05] transition-all disabled:opacity-50"
+              style={{ minHeight: 40, maxHeight: 120, overflowY: 'auto' }}
+            />
+          </div>
+          <div className="flex flex-col gap-1.5 shrink-0">
+            <button
+              onClick={handleSend}
+              disabled={!input.trim() || isExecuting}
+              className="w-9 h-9 rounded-xl bg-violet-600 hover:bg-violet-500 disabled:opacity-40 disabled:cursor-not-allowed transition-all flex items-center justify-center"
+            >
+              {isExecuting ? (
+                <Loader2 size={14} className="animate-spin text-white" />
+              ) : (
+                <Send size={14} className="text-white" />
+              )}
+            </button>
+            {messages.length > 0 && (
+              <button
+                onClick={clearMessages}
+                disabled={isExecuting}
+                className="w-9 h-9 rounded-xl border border-white/[0.06] hover:bg-white/5 disabled:opacity-40 transition-all flex items-center justify-center text-gray-600 hover:text-gray-400"
+                title="Clear chat"
+              >
+                <Trash2 size={12} />
+              </button>
+            )}
+          </div>
+        </div>
+        <p className="text-[9px] text-gray-700 mt-1.5 text-center">
+          Enter to send · Shift+Enter for newline · Agent uses LinUCB + GEPA
+        </p>
+      </div>
+    </div>
+  )
+}

frontend/src/components/ERDiagram.tsx ADDED Viewed

	@@ -0,0 +1,234 @@

+import { useState, useEffect, useRef } from 'react'
+import { Loader2, GitFork } from 'lucide-react'
+import { useStore } from '../store/useStore'
+import { fetchSchemaGraph } from '../lib/api'
+import type { SchemaTable, SchemaRelationship } from '../lib/types'
+// ─── Table card ───────────────────────────────────────────────────
+/**
+ * SVG card for one table: a tinted header with the table name, followed by
+ * one 20px row per column showing a key/link marker, the column name and its
+ * type. The card is drawn at (`x`, `y`) via a group transform.
+ */
+function TableCard({ table, x, y }: { table: SchemaTable; x: number; y: number }) {
+  const WIDTH = 180
+  const HEADER_H = 28
+  const ROW_H = 20
+  const MONO = 'ui-monospace,monospace'
+  const HEADER_FILL = 'rgba(139,92,246,0.15)'
+  const cardHeight = HEADER_H + table.columns.length * ROW_H
+  const columnRows = table.columns.map((col, idx) => {
+    // Primary keys win over foreign keys for both the colour and the marker.
+    let tint = 'rgba(255,255,255,0.5)'
+    let marker = '   '
+    if (col.pk) {
+      tint = '#60a5fa'
+      marker = '🔑 '
+    } else if (col.fk) {
+      tint = '#34d399'
+      marker = '🔗 '
+    }
+    const stripe = idx % 2 === 0 ? 'rgba(255,255,255,0.01)' : 'transparent'
+    return (
+      <g key={col.name} transform={`translate(0,${HEADER_H + idx * ROW_H})`}>
+        <rect width={WIDTH} height={ROW_H} fill={stripe} />
+        <text x={10} y={14} fill={tint} fontSize={10} fontFamily={MONO}>
+          {marker}
+          {col.name}
+        </text>
+        <text
+          x={WIDTH - 10}
+          y={14}
+          fill="rgba(255,255,255,0.2)"
+          fontSize={9}
+          fontFamily={MONO}
+          textAnchor="end"
+        >
+          {col.type}
+        </text>
+      </g>
+    )
+  })
+  return (
+    <g transform={`translate(${x},${y})`}>
+      {/* Card bg */}
+      <rect
+        width={WIDTH}
+        height={cardHeight}
+        rx={8}
+        fill="#0e0e16"
+        stroke="rgba(255,255,255,0.08)"
+        strokeWidth={1}
+      />
+      {/* Header: rounded rect plus a square strip covering its bottom corners */}
+      <rect width={WIDTH} height={HEADER_H} rx={8} fill={HEADER_FILL} />
+      <rect y={HEADER_H - 8} width={WIDTH} height={8} fill={HEADER_FILL} />
+      <text x={10} y={18} fill="#a78bfa" fontSize={11} fontWeight="bold" fontFamily={MONO}>
+        {table.name}
+      </text>
+      {/* Columns */}
+      {columnRows}
+    </g>
+  )
+}
+// ─── Layout helpers ───────────────────────────────────────────────
+/**
+ * Computes a fixed three-column grid layout for the table cards.
+ *
+ * Each card is 180px wide and 28px + 20px per column tall. Cards in the same
+ * grid row share a top edge; the next row starts below the tallest card of
+ * the previous row plus a 30px gap. A 20px margin is applied on the left and
+ * top.
+ *
+ * @param tables Tables in display order.
+ * @returns A map from table name to its `{ x, y, w, h }` box. Later tables
+ *          with a duplicate name overwrite earlier ones.
+ */
+function layoutTables(tables: SchemaTable[]) {
+  const CARD_W = 180
+  const CARD_H_BASE = 28
+  const COL_H = 20
+  const GAP_X = 40
+  const GAP_Y = 30
+  const COLS_PER_ROW = 3
+  const MARGIN = 20
+  const positions: Record<string, { x: number; y: number; w: number; h: number }> = {}
+  // Single pass: carry the current row's top edge and its tallest card so far,
+  // instead of re-summing every previous row for each table.
+  let rowTop = 0
+  let rowMaxH = 0
+  tables.forEach((t, i) => {
+    const col = i % COLS_PER_ROW
+    if (col === 0 && i > 0) {
+      // Starting a new grid row: move below the tallest card of the last one.
+      rowTop += rowMaxH + GAP_Y
+      rowMaxH = 0
+    }
+    const h = CARD_H_BASE + t.columns.length * COL_H
+    rowMaxH = Math.max(rowMaxH, h)
+    positions[t.name] = {
+      x: col * (CARD_W + GAP_X) + MARGIN,
+      y: rowTop + MARGIN,
+      w: CARD_W,
+      h,
+    }
+  })
+  return positions
+}
+/**
+ * Dashed cubic curve linking two table cards: from the right edge of `from`
+ * to the left edge of `to`, both anchored 14px below each card's top.
+ * Renders nothing if either table has no entry in `positions`.
+ */
+function RelationshipLine({
+  from,
+  to,
+  positions,
+}: {
+  from: string
+  to: string
+  positions: Record<string, { x: number; y: number; w: number; h: number }>
+}) {
+  const source = positions[from]
+  const target = positions[to]
+  if (!source || !target) return null
+  const startX = source.x + source.w
+  const startY = source.y + 14
+  const endX = target.x
+  const endY = target.y + 14
+  // Both control points sit at the horizontal midpoint, giving an S-shaped curve.
+  const midX = (startX + endX) / 2
+  const curve = `M${startX},${startY} C${midX},${startY} ${midX},${endY} ${endX},${endY}`
+  return (
+    <path
+      d={curve}
+      stroke="rgba(139,92,246,0.3)"
+      strokeWidth={1.5}
+      fill="none"
+      strokeDasharray="4 3"
+    />
+  )
+}
+// ─── ER Diagram component ─────────────────────────────────────────
+/**
+ * Entity-relationship view of the schema graph held in the store.
+ * Fetches the graph on mount when the store has none, shows a spinner while
+ * loading, an empty state with a Retry link when there are no tables, and
+ * otherwise an SVG with relationship curves drawn beneath the table cards.
+ */
+export function ERDiagram() {
+  const { schemaGraph, setSchemaGraph } = useStore()
+  const [loading, setLoading] = useState(false)
+  const svgRef = useRef<SVGSVGElement>(null)
+  const load = async () => {
+    setLoading(true)
+    try {
+      const graph = await fetchSchemaGraph()
+      setSchemaGraph(graph)
+    } catch {
+      // noop
+    } finally {
+      setLoading(false)
+    }
+  }
+  const retry = () => void load()
+  useEffect(() => {
+    if (!schemaGraph) void load()
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [])
+  if (loading) {
+    return (
+      <div className="flex items-center justify-center h-full gap-2 text-gray-500">
+        <Loader2 size={16} className="animate-spin" />
+        <span className="text-sm">Loading schema...</span>
+      </div>
+    )
+  }
+  const hasTables = Boolean(schemaGraph) && schemaGraph.tables.length > 0
+  if (!schemaGraph || !hasTables) {
+    return (
+      <div className="flex flex-col items-center justify-center h-full gap-3 text-gray-600">
+        <GitFork size={32} className="text-gray-700" />
+        <p className="text-sm">Schema will appear after database connects</p>
+        <button
+          onClick={retry}
+          className="text-xs text-violet-400 hover:text-violet-300 transition-colors"
+        >
+          Retry
+        </button>
+      </div>
+    )
+  }
+  const { tables, relationships } = schemaGraph
+  const positions = layoutTables(tables)
+  // Canvas size: furthest right/bottom card edge plus a 40px margin.
+  const boxes = Object.values(positions)
+  const svgW = Math.max(...boxes.map((box) => box.x + box.w)) + 40
+  const svgH = Math.max(...boxes.map((box) => box.y + box.h)) + 40
+  const links = relationships as SchemaRelationship[]
+  return (
+    <div className="h-full overflow-auto p-4">
+      <div className="text-[10px] text-gray-500 uppercase tracking-widest mb-3 flex items-center gap-1.5">
+        <GitFork size={10} className="text-violet-400" />
+        Entity Relationship Diagram
+        <span className="text-gray-700">· {tables.length} tables</span>
+      </div>
+      <svg
+        ref={svgRef}
+        width={svgW}
+        height={svgH}
+        style={{ minWidth: svgW }}
+      >
+        {/* FK lines */}
+        {links.map((rel, i) => (
+          <RelationshipLine
+            key={i}
+            from={rel.from}
+            to={rel.to}
+            positions={positions}
+          />
+        ))}
+        {/* Tables */}
+        {tables.map((t: SchemaTable) => {
+          const spot = positions[t.name]
+          return (
+            <TableCard
+              key={t.name}
+              table={t}
+              x={spot?.x ?? 0}
+              y={spot?.y ?? 0}
+            />
+          )
+        })}
+      </svg>
+    </div>
+  )
+}

frontend/src/components/Header.tsx ADDED Viewed

	@@ -0,0 +1,110 @@

+import { Database, Sun, Moon, PanelLeftOpen, PanelRightOpen, Cpu } from 'lucide-react'
+import { useStore } from '../store/useStore'
+import type { Difficulty } from '../lib/types'
+/** Props for the Header component. */
+interface HeaderProps {
+  // Invoked by the left sidebar toggle button (the one labelled "Data").
+  onToggleLeft: () => void
+  // Invoked by the right sidebar toggle button (the one labelled "GEPA").
+  onToggleRight: () => void
+}
+/** Difficulty options in display order, each with the classes used when selected. */
+const DIFFICULTIES: { id: Difficulty; label: string; color: string }[] = [
+  {
+    id: 'easy',
+    label: 'Easy',
+    color: 'text-green-400 border-green-500/30 bg-green-500/10',
+  },
+  {
+    id: 'medium',
+    label: 'Medium',
+    color: 'text-amber-400 border-amber-500/30 bg-amber-500/10',
+  },
+  {
+    id: 'hard',
+    label: 'Hard',
+    color: 'text-red-400 border-red-500/30 bg-red-500/10',
+  },
+]
+/**
+ * Sticky top bar: sidebar toggles, logo and title, database connection
+ * status, the LinUCB indicator, the difficulty selector and the theme switch.
+ */
+export function Header({ onToggleLeft, onToggleRight }: HeaderProps) {
+  const { theme, toggleTheme, dbSeeded, taskDifficulty, setTaskDifficulty } = useStore()
+  const isDark = theme === 'dark'
+  const ThemeIcon = isDark ? Sun : Moon
+  const themeTitle = isDark ? 'Switch to light' : 'Switch to dark'
+  const sidebarToggleClass =
+    'lg:hidden flex items-center gap-1 px-2 py-1.5 rounded-lg hover:bg-white/5 text-gray-400 hover:text-white transition-colors text-[10px]'
+  // Green steady dot once the database is seeded, pulsing amber dot until then.
+  const connectionStatus = dbSeeded ? (
+    <div className="hidden sm:flex items-center gap-1.5 text-[10px] text-green-400">
+      <span className="w-1.5 h-1.5 rounded-full bg-green-400 inline-block" />
+      benchmark db
+    </div>
+  ) : (
+    <div className="hidden sm:flex items-center gap-1.5 text-[10px] text-amber-400">
+      <span className="w-1.5 h-1.5 rounded-full bg-amber-400 inline-block animate-pulse" />
+      connecting...
+    </div>
+  )
+  const difficultyButtons = DIFFICULTIES.map((option) => {
+    const stateClass =
+      taskDifficulty === option.id
+        ? `${option.color} border`
+        : 'text-gray-500 hover:text-gray-300 border border-transparent'
+    return (
+      <button
+        key={option.id}
+        onClick={() => setTaskDifficulty(option.id)}
+        className={`text-[10px] font-semibold px-2 py-1 rounded transition-all ${stateClass}`}
+      >
+        {option.label}
+      </button>
+    )
+  })
+  return (
+    <header
+      className="border-b px-3 sm:px-5 py-3 flex items-center justify-between shrink-0 backdrop-blur-sm sticky top-0 z-50 theme-border"
+      style={{ background: 'var(--bg-secondary)' }}
+    >
+      <div className="flex items-center gap-2 sm:gap-3">
+        {/* Mobile sidebar toggle */}
+        <button onClick={onToggleLeft} className={sidebarToggleClass}>
+          <PanelLeftOpen size={14} />
+          <span className="hidden sm:inline">Data</span>
+        </button>
+        {/* Logo */}
+        <div
+          className="w-7 h-7 rounded-lg flex items-center justify-center shadow-lg shrink-0"
+          style={{ background: '#1e3a5f', boxShadow: '0 4px 12px rgba(30,58,95,0.4)' }}
+        >
+          <Database size={13} className="text-white" />
+        </div>
+        {/* Title */}
+        <div>
+          <h1 className="text-sm font-bold text-white tracking-tight leading-none">
+            SQL Agent OpenEnv
+          </h1>
+          <p className="text-[10px] text-gray-600 hidden sm:block mt-0.5">
+            Reinforcement Learning Environment
+          </p>
+        </div>
+      </div>
+      <div className="flex items-center gap-2 sm:gap-3">
+        {/* Connection status */}
+        {connectionStatus}
+        {/* RL indicator */}
+        <div className="hidden md:flex items-center gap-1.5 text-[10px] text-violet-400 border border-violet-500/20 rounded-full px-2 py-0.5">
+          <Cpu size={10} />
+          LinUCB Active
+        </div>
+        {/* Difficulty selector */}
+        <div className="flex items-center gap-1 border border-white/[0.06] rounded-lg p-0.5">
+          {difficultyButtons}
+        </div>
+        {/* Theme toggle */}
+        <button
+          onClick={toggleTheme}
+          className="p-1.5 rounded-lg hover:bg-white/5 transition-colors theme-text-muted"
+          title={themeTitle}
+        >
+          <ThemeIcon size={14} />
+        </button>
+        {/* Mobile right sidebar toggle */}
+        <button onClick={onToggleRight} className={sidebarToggleClass}>
+          <span className="hidden sm:inline">GEPA</span>
+          <PanelRightOpen size={14} />
+        </button>
+      </div>
+    </header>
+  )
+}

frontend/src/components/LeftSidebar.tsx ADDED Viewed

	@@ -0,0 +1,157 @@

+import { useState } from 'react'
+import { motion, AnimatePresence } from 'framer-motion'
+import { Database, Table2, ChevronDown, ChevronRight, GitFork, ShoppingCart } from 'lucide-react'
+import { useStore } from '../store/useStore'
+import type { Difficulty } from '../lib/types'
+/** Label and colour classes for each difficulty, used by the sidebar selector. */
+const DIFFICULTY_CONFIG: Record<Difficulty, { label: string; bg: string; text: string; border: string }> = {
+  easy: {
+    label: 'Easy',
+    bg: 'bg-green-500/10',
+    text: 'text-green-400',
+    border: 'border-green-500/30',
+  },
+  medium: {
+    label: 'Medium',
+    bg: 'bg-amber-500/10',
+    text: 'text-amber-400',
+    border: 'border-amber-500/30',
+  },
+  hard: {
+    label: 'Hard',
+    bg: 'bg-red-500/10',
+    text: 'text-red-400',
+    border: 'border-red-500/30',
+  },
+}
+/**
+ * Left sidebar with four sections: the task-difficulty picker, a collapsible
+ * list of database tables, a static business-context card, and a badge that
+ * describes the currently selected difficulty.
+ *
+ * `tables`, `taskDifficulty`, `setTaskDifficulty` and `dbSeeded` come from the
+ * store; whether the table list is expanded is local component state.
+ */
+export function LeftSidebar() {
+  const { tables, taskDifficulty, setTaskDifficulty, dbSeeded } = useStore()
+  const [tablesExpanded, setTablesExpanded] = useState(true)
+  // Colours and label of the selected difficulty, used by the "Current Task" badge.
+  const cfg = DIFFICULTY_CONFIG[taskDifficulty]
+  return (
+    <div className="flex flex-col gap-4 py-1">
+      {/* Task Difficulty: one button per DIFFICULTY_CONFIG key */}
+      <section>
+        <div className="text-[10px] font-semibold text-gray-500 uppercase tracking-widest mb-2 flex items-center gap-1.5">
+          <GitFork size={10} className="text-violet-400" />
+          Task Difficulty
+        </div>
+        <div className="flex flex-col gap-1">
+          {(Object.keys(DIFFICULTY_CONFIG) as Difficulty[]).map((d) => {
+            const c = DIFFICULTY_CONFIG[d]
+            // Only the selected button gets its difficulty colours and the "selected" tag.
+            const active = d === taskDifficulty
+            return (
+              <button
+                key={d}
+                onClick={() => setTaskDifficulty(d)}
+                className={`flex items-center justify-between px-3 py-2 rounded-lg border text-xs font-medium transition-all ${
+                  active
+                    ? `${c.bg} ${c.text} ${c.border}`
+                    : 'border-transparent text-gray-500 hover:text-gray-300 hover:bg-white/5'
+                }`}
+              >
+                <span>{c.label}</span>
+                {active && (
+                  <span className={`text-[9px] font-mono ${c.text} opacity-70`}>selected</span>
+                )}
+              </button>
+            )
+          })}
+        </div>
+      </section>
+      {/* Schema tables: collapsible list; pulsing placeholder rows are shown unless dbSeeded is true and tables is non-empty */}
+      <section>
+        <button
+          className="text-[10px] font-semibold text-gray-500 uppercase tracking-widest mb-2 flex items-center gap-1.5 w-full"
+          onClick={() => setTablesExpanded((v) => !v)}
+        >
+          <Database size={10} className="text-blue-400" />
+          <span className="flex-1 text-left">Database Schema</span>
+          {tablesExpanded ? <ChevronDown size={10} /> : <ChevronRight size={10} />}
+        </button>
+        <AnimatePresence>
+          {tablesExpanded && (
+            <motion.div
+              initial={{ opacity: 0, height: 0 }}
+              animate={{ opacity: 1, height: 'auto' }}
+              exit={{ opacity: 0, height: 0 }}
+              className="overflow-hidden"
+            >
+              {dbSeeded && tables.length > 0 ? (
+                <div className="flex flex-col gap-1">
+                  {tables.map((t) => (
+                    <div
+                      key={t.name}
+                      className="flex items-center justify-between px-2.5 py-1.5 rounded-lg border border-white/[0.04] bg-white/[0.02] hover:bg-white/[0.04] transition-colors"
+                    >
+                      <div className="flex items-center gap-1.5">
+                        <Table2 size={10} className="text-blue-400 shrink-0" />
+                        <span className="text-xs text-gray-300 font-mono">{t.name}</span>
+                      </div>
+                      <span className="text-[9px] text-gray-600 font-mono tabular-nums">
+                        {t.rows.toLocaleString()}
+                      </span>
+                    </div>
+                  ))}
+                </div>
+              ) : (
+                <div className="flex flex-col gap-1">
+                  {[120, 80, 95, 60, 70].map((w, i) => (
+                    <div
+                      key={i}
+                      className="flex items-center justify-between px-2.5 py-1.5 rounded-lg border border-white/[0.04] bg-white/[0.02]"
+                    >
+                      <div
+                        className="h-2 rounded bg-white/10 animate-pulse"
+                        style={{ width: w }}
+                      />
+                      <div className="h-2 w-8 rounded bg-white/10 animate-pulse" />
+                    </div>
+                  ))}
+                </div>
+              )}
+            </motion.div>
+          )}
+        </AnimatePresence>
+      </section>
+      {/* Business Context: static description text and tag list */}
+      <section>
+        <div className="text-[10px] font-semibold text-gray-500 uppercase tracking-widest mb-2 flex items-center gap-1.5">
+          <ShoppingCart size={10} className="text-orange-400" />
+          Business Context
+        </div>
+        <div
+          className="rounded-xl border border-white/[0.05] p-3 text-[11px] text-gray-500 leading-relaxed"
+          style={{ background: 'var(--bg-card)' }}
+        >
+          <p className="mb-2 text-gray-400 font-medium">E-Commerce Marketplace</p>
+          <p>
+            Multi-vendor marketplace with products, orders, sellers, users, and reviews.
+            Supports complex analytical queries across sales, inventory, and user behavior.
+          </p>
+          <div className="mt-2 flex flex-wrap gap-1">
+            {['Products', 'Orders', 'Sellers', 'Users', 'Reviews', 'Categories'].map((t) => (
+              <span
+                key={t}
+                className="text-[9px] px-1.5 py-0.5 rounded border border-white/[0.06] text-gray-600"
+              >
+                {t}
+              </span>
+            ))}
+          </div>
+        </div>
+      </section>
+      {/* Current task badge: coloured with the selected difficulty's classes, plus a one-line description of that difficulty */}
+      <section>
+        <div
+          className={`rounded-xl border ${cfg.border} ${cfg.bg} p-3 flex flex-col gap-1.5`}
+        >
+          <div className="flex items-center justify-between">
+            <span className={`text-[10px] font-semibold uppercase tracking-wider ${cfg.text}`}>
+              Current Task
+            </span>
+            <span className={`text-[10px] font-mono ${cfg.text}`}>{cfg.label}</span>
+          </div>
+          <p className="text-[11px] text-gray-400 leading-relaxed">
+            {taskDifficulty === 'easy'
+              ? 'Simple SELECT queries, basic filtering and aggregation'
+              : taskDifficulty === 'medium'
+              ? 'Multi-table JOINs, GROUP BY, subqueries, window functions'
+              : 'Complex CTEs, rolling aggregations, cohort analysis, ranking'}
+          </p>
+        </div>
+      </section>
+    </div>
+  )
+}

frontend/src/components/PerformanceGraph.tsx ADDED Viewed

	@@ -0,0 +1,175 @@

+import { useEffect } from 'react'
+import {
+  LineChart, Line, BarChart, Bar,
+  XAxis, YAxis, CartesianGrid, Tooltip,
+  ResponsiveContainer, ReferenceLine,
+} from 'recharts'
+import { TrendingUp, Loader2, RefreshCw } from 'lucide-react'
+import { useStore } from '../store/useStore'
+import { fetchRLState } from '../lib/api'
+/**
+ * Tooltip content used by the charts in this file: the hovered x value
+ * prefixed with "#", followed by one "name: value" line per series in that
+ * series' colour. Renders nothing unless the tooltip is active and has entries.
+ */
+const CustomTooltip = (props: {
+  active?: boolean
+  payload?: { value: number; name: string; color: string }[]
+  label?: string | number
+}) => {
+  const { active, payload, label } = props
+  // Guard first: inactive tooltip, or no series entries to list.
+  if (!active || !payload?.length) {
+    return null
+  }
+  const series = payload.map(({ name, color, value }) => (
+    <p key={name} style={{ color }}>
+      {name}: <span className="font-semibold">{value}</span>
+    </p>
+  ))
+  return (
+    <div
+      className="border border-white/10 rounded-lg px-3 py-2 text-xs"
+      style={{ background: '#1a1a2e' }}
+    >
+      <p className="text-gray-400 mb-1">#{label}</p>
+      {series}
+    </div>
+  )
+}
+/**
+ * RL metrics panel: three summary stats plus reward, action-distribution and
+ * success-rate charts, all driven by the `rlState` held in the store.
+ *
+ * The state is fetched on mount and re-fetched every 10 seconds. Fetch errors
+ * are ignored, so the previous state (or the placeholder) stays on screen.
+ */
+export function PerformanceGraph() {
+  const { rlState, setRlState } = useStore()
+  // Fetch the latest RL state and push it into the store.
+  const load = async () => {
+    try {
+      const data = await fetchRLState()
+      setRlState(data)
+    } catch {
+      // noop — backend might not be up
+    }
+  }
+  useEffect(() => {
+    // Load immediately, then poll; the cleanup function stops the polling on unmount.
+    void load()
+    const interval = setInterval(() => void load(), 10_000)
+    return () => clearInterval(interval)
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [])
+  // Placeholder shown while the store holds no RL state.
+  if (!rlState) {
+    return (
+      <div className="flex flex-col items-center justify-center h-40 text-gray-600 gap-2">
+        <TrendingUp size={24} className="text-gray-700" />
+        <p className="text-[11px] text-center">
+          RL metrics appear after agent episodes
+        </p>
+        <Loader2 size={14} className="animate-spin text-gray-700" />
+      </div>
+    )
+  }
+  const { totalEpisodes, successRate, currentAlpha, episodes, actionDistribution } = rlState
+  return (
+    <div className="flex flex-col gap-3">
+      {/* Stats row: successRate is shown as a whole-number percentage, currentAlpha with three decimals */}
+      <div className="grid grid-cols-3 gap-1.5">
+        {[
+          { label: 'Episodes', value: totalEpisodes, color: 'text-blue-400' },
+          { label: 'Success', value: `${(successRate * 100).toFixed(0)}%`, color: 'text-green-400' },
+          { label: 'Alpha', value: currentAlpha.toFixed(3), color: 'text-orange-400' },
+        ].map((s) => (
+          <div
+            key={s.label}
+            className="bg-white/5 rounded-xl p-2 text-center"
+          >
+            <div className={`text-sm font-bold font-mono ${s.color}`}>{s.value}</div>
+            <div className="text-[9px] text-gray-500 mt-0.5">{s.label}</div>
+          </div>
+        ))}
+      </div>
+      {/* Reward per episode (rendered once there is at least one episode); includes the manual refresh button */}
+      {episodes.length > 0 && (
+        <div>
+          <div className="flex items-center justify-between mb-1.5">
+            <div className="text-[10px] text-gray-500 font-medium">Reward per Episode</div>
+            <button
+              onClick={() => void load()}
+              className="p-1 rounded hover:bg-white/5 text-gray-600 hover:text-gray-400 transition-colors"
+              title="Refresh"
+            >
+              <RefreshCw size={10} />
+            </button>
+          </div>
+          <ResponsiveContainer width="100%" height={110}>
+            <LineChart data={episodes} margin={{ top: 4, right: 4, bottom: 0, left: -20 }}>
+              <CartesianGrid strokeDasharray="3 3" stroke="#ffffff08" />
+              <XAxis dataKey="episode" tick={{ fontSize: 9, fill: '#6b7280' }} />
+              <YAxis domain={[-1, 1]} tick={{ fontSize: 9, fill: '#6b7280' }} />
+              <Tooltip content={<CustomTooltip />} />
+              <ReferenceLine y={0} stroke="#ffffff20" strokeDasharray="3 3" />
+              <Line
+                type="monotone"
+                dataKey="totalReward"
+                name="Reward"
+                stroke="#f97316"
+                strokeWidth={2}
+                dot={episodes.length < 30 ? { fill: '#f97316', r: 2 } : false}
+                activeDot={{ r: 4 }}
+              />
+            </LineChart>
+          </ResponsiveContainer>
+        </div>
+      )}
+      {/* Action distribution (rendered once there is at least one entry); tick labels drop the "FIX_" prefix and keep six characters */}
+      {actionDistribution.length > 0 && (
+        <div>
+          <div className="text-[10px] text-gray-500 mb-1.5 font-medium">
+            LinUCB Action Distribution
+          </div>
+          <ResponsiveContainer width="100%" height={90}>
+            <BarChart
+              data={actionDistribution}
+              margin={{ top: 4, right: 4, bottom: 0, left: -20 }}
+            >
+              <CartesianGrid strokeDasharray="3 3" stroke="#ffffff08" />
+              <XAxis
+                dataKey="action"
+                tick={{ fontSize: 8, fill: '#6b7280' }}
+                tickFormatter={(v: string) => v.replace('FIX_', '').slice(0, 6)}
+              />
+              <YAxis tick={{ fontSize: 9, fill: '#6b7280' }} />
+              <Tooltip content={<CustomTooltip />} />
+              <Bar dataKey="count" name="Uses" fill="#8b5cf6" radius={[3, 3, 0, 0]} />
+            </BarChart>
+          </ResponsiveContainer>
+        </div>
+      )}
+      {/* Success rate line (rendered once there are at least three episodes) */}
+      {episodes.length >= 3 && (
+        <div>
+          <div className="text-[10px] text-gray-500 mb-1.5 font-medium">
+            Rolling Success Rate
+          </div>
+          <ResponsiveContainer width="100%" height={80}>
+            <LineChart data={episodes} margin={{ top: 4, right: 4, bottom: 0, left: -20 }}>
+              <CartesianGrid strokeDasharray="3 3" stroke="#ffffff08" />
+              <XAxis dataKey="episode" tick={{ fontSize: 9, fill: '#6b7280' }} />
+              <YAxis domain={[0, 1]} tick={{ fontSize: 9, fill: '#6b7280' }} />
+              <Tooltip content={<CustomTooltip />} />
+              <Line
+                type="monotone"
+                dataKey="successRate"
+                name="Success"
+                stroke="#22c55e"
+                strokeWidth={2}
+                dot={false}
+              />
+            </LineChart>
+          </ResponsiveContainer>
+        </div>
+      )}
+    </div>
+  )
+}

frontend/src/components/PromptEvolution.tsx ADDED Viewed

	@@ -0,0 +1,148 @@

+import { useState, useEffect } from 'react'
+import { motion, AnimatePresence } from 'framer-motion'
+import { Brain, ChevronDown, ChevronUp, Zap, History } from 'lucide-react'
+import { useStore } from '../store/useStore'
+import { fetchPromptHistory } from '../lib/api'
+// Prompt text displayed by PromptEvolution when the store has no current prompt.
+// NOTE(review): presumably mirrors the backend's initial system prompt — confirm they stay in sync.
+const SEED_PROMPT = `You are a SQL expert. Given a natural language question and a SQLite database schema, write a correct SQL query.
+Rules:
+- Output ONLY the SQL query, nothing else
+- No markdown, no code fences, no explanation
+- Use SQLite syntax
+- Always qualify column names with table aliases when using JOINs`
+/**
+ * Collapsible panel showing the current system prompt and, when the store has
+ * any, the list of prompt generations with their scores.
+ *
+ * Prompt data is fetched once on mount with `fetchPromptHistory` and handed to
+ * `setPromptData`. While the store's `currentPrompt` is empty, SEED_PROMPT is
+ * displayed instead; the header badge reads "Seed" until `promptGeneration`
+ * is greater than zero.
+ */
+export function PromptEvolution() {
+  const { currentPrompt, promptGeneration, promptHistory, setPromptData } = useStore()
+  const [expanded, setExpanded] = useState(false)
+  const [historyExpanded, setHistoryExpanded] = useState(false)
+  const [loading, setLoading] = useState(false)
+  const prompt = currentPrompt || SEED_PROMPT
+  const generation = promptGeneration
+  // Fetch prompt data into the store, toggling the local loading flag around the request.
+  const loadHistory = async () => {
+    setLoading(true)
+    try {
+      const data = await fetchPromptHistory()
+      setPromptData(data)
+    } catch {
+      // noop — a failed request leaves the store's prompt data untouched
+    } finally {
+      setLoading(false)
+    }
+  }
+  useEffect(() => {
+    // Empty dependency list: load once when the component mounts.
+    void loadHistory()
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [])
+  return (
+    <div className="flex flex-col gap-2">
+      {/* Header: toggles the panel and shows the generation badge */}
+      <button
+        onClick={() => setExpanded((v) => !v)}
+        className="flex items-center justify-between w-full group"
+      >
+        <div className="flex items-center gap-2">
+          <Brain size={14} className="text-violet-400" />
+          <span className="text-xs font-semibold text-white/70">System Prompt</span>
+          {generation > 0 ? (
+            <span className="text-[10px] bg-violet-500/20 text-violet-300 border border-violet-500/30 rounded-full px-2 py-0.5">
+              Gen {generation} · Optimized
+            </span>
+          ) : (
+            <span className="text-[10px] bg-white/5 text-gray-500 rounded-full px-2 py-0.5">
+              Seed
+            </span>
+          )}
+        </div>
+        {expanded ? (
+          <ChevronUp size={13} className="text-gray-500" />
+        ) : (
+          <ChevronDown size={13} className="text-gray-500" />
+        )}
+      </button>
+      <AnimatePresence>
+        {expanded && (
+          <motion.div
+            initial={{ opacity: 0, height: 0 }}
+            animate={{ opacity: 1, height: 'auto' }}
+            exit={{ opacity: 0, height: 0 }}
+            transition={{ duration: 0.2 }}
+            className="overflow-hidden"
+          >
+            {/* Prompt preview */}
+            <div className="max-h-40 overflow-y-auto">
+              <pre className="text-[11px] font-mono text-violet-200/70 bg-violet-950/30 rounded-xl p-3 border border-violet-500/20 whitespace-pre-wrap leading-relaxed">
+                {prompt}
+              </pre>
+            </div>
+            {/* History button (only when the store has history entries) */}
+            {promptHistory.length > 0 && (
+              <button
+                onClick={() => setHistoryExpanded((v) => !v)}
+                className="mt-2 w-full flex items-center justify-center gap-2 px-3 py-2 text-xs font-medium bg-violet-600/15 text-violet-300 border border-violet-500/25 rounded-xl hover:bg-violet-600/25 hover:border-violet-500/40 transition-all"
+              >
+                <History size={12} />
+                {historyExpanded ? 'Hide' : 'View'} Evolution History
+                <span className="text-[10px] text-violet-400/60 ml-1">
+                  ({promptHistory.length} gen{promptHistory.length !== 1 ? 's' : ''})
+                </span>
+              </button>
+            )}
+            {/* Generation history: one card per entry with generation number, score as a percentage, summary and timestamp */}
+            <AnimatePresence>
+              {historyExpanded && promptHistory.length > 0 && (
+                <motion.div
+                  initial={{ height: 0, opacity: 0 }}
+                  animate={{ height: 'auto', opacity: 1 }}
+                  exit={{ height: 0, opacity: 0 }}
+                  transition={{ duration: 0.15 }}
+                  className="overflow-hidden mt-2"
+                >
+                  <div className="flex flex-col gap-1.5">
+                    <div className="text-[10px] text-gray-500 font-medium flex items-center gap-1">
+                      <Zap size={10} className="text-violet-400" />
+                      Optimization History
+                    </div>
+                    {promptHistory.map((snap) => (
+                      <div
+                        key={snap.generation}
+                        className="border border-white/5 rounded-xl p-2.5 hover:border-white/10 hover:bg-white/[0.02] transition-all"
+                      >
+                        <div className="flex items-center justify-between mb-1">
+                          <span className="text-[10px] font-semibold text-violet-400">
+                            Generation {snap.generation}
+                          </span>
+                          <span className="text-[10px] font-mono text-green-400">
+                            {(snap.score * 100).toFixed(0)}%
+                          </span>
+                        </div>
+                        <p className="text-[10px] text-gray-400 leading-relaxed line-clamp-2">
+                          {snap.summary}
+                        </p>
+                        <p className="text-[9px] text-gray-600 mt-1">{snap.timestamp}</p>
+                      </div>
+                    ))}
+                  </div>
+                </motion.div>
+              )}
+            </AnimatePresence>
+            {loading && (
+              <div className="flex items-center gap-2 text-[10px] text-gray-500 mt-2 px-1">
+                <span className="w-3 h-3 border border-violet-500/40 border-t-violet-400 rounded-full animate-spin inline-block" />
+                Loading history...
+              </div>
+            )}
+          </motion.div>
+        )}
+      </AnimatePresence>
+    </div>
+  )
+}

frontend/src/components/ResultsTable.tsx ADDED Viewed

	@@ -0,0 +1,78 @@

+// Display limits: rows rendered per table, and characters kept per cell.
+const MAX_ROWS = 10
+const MAX_CELL_LEN = 30
+/**
+ * Convert a cell value to display text. `null` and `undefined` become the
+ * literal "null"; text longer than MAX_CELL_LEN is cut to that length and
+ * suffixed with an ellipsis.
+ */
+function truncate(val: unknown): string {
+  const text = val == null ? 'null' : String(val)
+  if (text.length <= MAX_CELL_LEN) {
+    return text
+  }
+  return `${text.slice(0, MAX_CELL_LEN)}…`
+}
+/** Props of ResultsTable. */
+interface ResultsTableProps {
+  // Result rows; the keys of the first row are used as the column headers.
+  rows: Record<string, unknown>[]
+  // Total used in the "Showing X of Y rows" messages; may exceed rows.length.
+  rowCount: number
+}
+/**
+ * Render query result rows as a scrollable table.
+ *
+ * At most MAX_ROWS rows are displayed and the column headers are the keys of
+ * the first row. Cell text is shortened by `truncate`, with the full value
+ * available through the cell's `title`. `rowCount` is only used for the
+ * "Showing X of Y rows" messages.
+ *
+ * @param rows     Result rows to display; an empty list renders a "No rows returned." note.
+ * @param rowCount Total number of rows reported for the query.
+ */
+export function ResultsTable({ rows, rowCount }: ResultsTableProps) {
+  if (rows.length === 0) {
+    return (
+      <div className="text-xs text-gray-500 italic px-3 py-2 border border-white/[0.06] rounded-xl">
+        No rows returned.
+      </div>
+    )
+  }
+  const columns = Object.keys(rows[0])
+  const displayRows = rows.slice(0, MAX_ROWS)
+  // Rows are hidden whenever fewer are displayed than the query produced, whether
+  // from the MAX_ROWS cap or because `rows` holds only part of the result. Using
+  // the displayed count keeps this banner in agreement with the footer.
+  const isTruncated = rowCount > displayRows.length
+  return (
+    <div className="overflow-auto max-h-60 rounded-xl border border-white/[0.06]" style={{ fontSize: 11 }}>
+      {isTruncated && (
+        <div className="px-3 py-1 text-[10px] text-amber-400/70 bg-amber-500/5 border-b border-amber-500/10 shrink-0">
+          Showing {displayRows.length} of {rowCount} rows
+        </div>
+      )}
+      <table className="w-full font-mono border-collapse">
+        <thead>
+          <tr
+            className="border-b border-white/[0.06] sticky top-0"
+            style={{ background: 'var(--bg-tertiary)' }}
+          >
+            {columns.map((col) => (
+              <th
+                key={col}
+                className="px-3 py-1.5 text-left text-[10px] font-semibold text-gray-500 uppercase tracking-wider whitespace-nowrap"
+              >
+                {col}
+              </th>
+            ))}
+          </tr>
+        </thead>
+        <tbody>
+          {displayRows.map((row, i) => (
+            <tr
+              key={i}
+              className="border-b border-white/[0.03] hover:bg-white/[0.02] transition-colors"
+            >
+              {columns.map((col) => {
+                const value = row[col]
+                // `truncate` prints both null and undefined as "null", so the muted
+                // styling and the missing tooltip must apply to both as well.
+                const isNull = value === null || value === undefined
+                return (
+                  <td
+                    key={col}
+                    className={`px-3 py-1.5 whitespace-nowrap ${
+                      isNull ? 'text-gray-600 italic' : 'text-gray-300'
+                    }`}
+                    title={isNull ? undefined : String(value)}
+                  >
+                    {truncate(value)}
+                  </td>
+                )
+              })}
+            </tr>
+          ))}
+        </tbody>
+      </table>
+      <div
+        className="px-3 py-1 text-[10px] text-gray-600 border-t border-white/[0.04]"
+        style={{ background: 'var(--bg-tertiary)' }}
+      >
+        Showing {displayRows.length} of {rowCount} rows
+      </div>
+    </div>
+  )
+}

frontend/src/components/RightSidebar.tsx ADDED Viewed

	@@ -0,0 +1,27 @@

+import { Zap, Brain } from 'lucide-react'
+import { PromptEvolution } from './PromptEvolution'
+import { PerformanceGraph } from './PerformanceGraph'
+/**
+ * Right sidebar: the GEPA prompt-evolution panel stacked above the RL
+ * learning-progress charts, each under its own small heading.
+ */
+export function RightSidebar() {
+  // Both section headings use the same typography and icon colour.
+  const headingClass =
+    'flex items-center gap-2 text-[10px] font-semibold text-gray-500 uppercase tracking-widest mb-3'
+  const iconClass = 'text-violet-400'
+  return (
+    <div className="flex flex-col h-full overflow-y-auto">
+      {/* Prompt evolution (GEPA) */}
+      <div className="p-4 border-b border-white/[0.06] shrink-0">
+        <div className={headingClass}>
+          <Brain size={10} className={iconClass} />
+          GEPA Prompt Evolution
+        </div>
+        <PromptEvolution />
+      </div>
+      {/* RL charts take the remaining height */}
+      <div className="p-4 flex-1 overflow-y-auto">
+        <div className={headingClass}>
+          <Zap size={10} className={iconClass} />
+          RL Learning Progress
+        </div>
+        <PerformanceGraph />
+      </div>
+    </div>
+  )
+}

frontend/src/index.css ADDED Viewed

	@@ -0,0 +1,187 @@

+@tailwind base;
+@tailwind components;
+@tailwind utilities;
+/* ─── Theme Variables ──────────────────────────────────────────── */
+/* Default palette (dark colours). These values apply unless a
+   [data-theme="light"] rule overrides the same custom property. */
+:root {
+  --bg-primary: #08080d;
+  --bg-secondary: #09090f;
+  --bg-tertiary: #0a0a12;
+  --bg-card: #0e0e16;
+  --bg-input: rgba(255, 255, 255, 0.03);
+  --bg-hover: rgba(255, 255, 255, 0.02);
+  --bg-hover-strong: rgba(255, 255, 255, 0.05);
+  --text-primary: #ffffff;
+  --text-secondary: rgba(255, 255, 255, 0.7);
+  --text-muted: #6b7280;
+  --text-dim: #4b5563;
+  --border-color: rgba(255, 255, 255, 0.06);
+  --border-hover: rgba(255, 255, 255, 0.12);
+  --accent-violet: #8b5cf6;
+  --accent-green: #22c55e;
+  --accent-orange: #f97316;
+  --accent-red: #ef4444;
+  --accent-blue: #3b82f6;
+  /* Aliases of the primary background and text colours. */
+  --background: var(--bg-primary);
+  --foreground: var(--text-primary);
+}
+/* Light palette: overrides the background, text and border variables.
+   The --accent-* variables are not redefined here, so the :root values stay in effect. */
+[data-theme="light"] {
+  --bg-primary: #f5f6f8;
+  --bg-secondary: #ffffff;
+  --bg-tertiary: #eef0f3;
+  --bg-card: #ffffff;
+  --bg-input: rgba(0, 0, 0, 0.04);
+  --bg-hover: rgba(0, 0, 0, 0.02);
+  --bg-hover-strong: rgba(0, 0, 0, 0.05);
+  --text-primary: #111827;
+  --text-secondary: #374151;
+  --text-muted: #6b7280;
+  --text-dim: #9ca3af;
+  --border-color: rgba(0, 0, 0, 0.1);
+  --border-hover: rgba(0, 0, 0, 0.2);
+  --background: var(--bg-primary);
+  --foreground: var(--text-primary);
+}
+/* ─── Base ─────────────────────────────────────────────────────── */
+* {
+  box-sizing: border-box;
+  margin: 0;
+  padding: 0;
+}
+html,
+body,
+#root {
+  height: 100%;
+  width: 100%;
+}
+body {
+  background: var(--bg-primary);
+  color: var(--text-primary);
+  font-family: ui-monospace, 'SF Mono', Consolas, 'Liberation Mono', monospace;
+  -webkit-font-smoothing: antialiased;
+  -moz-osx-font-smoothing: grayscale;
+}
+/* ─── Theme Utility Classes ──────────────────────────────────────── */
+.theme-bg-primary   { background-color: var(--bg-primary) !important; }
+.theme-bg-secondary { background-color: var(--bg-secondary) !important; }
+.theme-bg-tertiary  { background-color: var(--bg-tertiary) !important; }
+.theme-bg-card      { background-color: var(--bg-card) !important; }
+.theme-text-primary   { color: var(--text-primary) !important; }
+.theme-text-secondary { color: var(--text-secondary) !important; }
+.theme-text-muted     { color: var(--text-muted) !important; }
+.theme-border         { border-color: var(--border-color) !important; }
+.theme-border-hover   { border-color: var(--border-hover) !important; }
+/* ─── Scrollbars ─────────────────────────────────────────────────── */
+.scrollbar-none {
+  -ms-overflow-style: none;
+  scrollbar-width: none;
+}
+.scrollbar-none::-webkit-scrollbar {
+  display: none;
+}
+::-webkit-scrollbar {
+  width: 4px;
+  height: 4px;
+}
+::-webkit-scrollbar-track {
+  background: transparent;
+}
+::-webkit-scrollbar-thumb {
+  background: rgba(255, 255, 255, 0.1);
+  border-radius: 4px;
+}
+[data-theme="light"] ::-webkit-scrollbar-thumb {
+  background: rgba(0, 0, 0, 0.15);
+}
+/* ─── SQL Syntax Highlighting ───────────────────────────────────── */
+.sql-keyword { color: #a78bfa; font-weight: 600; }
+.sql-function { color: #60a5fa; }
+.sql-string { color: #34d399; }
+.sql-number { color: #f97316; }
+.sql-comment { color: #6b7280; font-style: italic; }
+.sql-operator { color: #e5e7eb; }
+/* ─── Blinking cursor ──────────────────────────────────────────── */
+@keyframes blink {
+  0%, 100% { opacity: 1; }
+  50% { opacity: 0; }
+}
+.cursor-blink {
+  display: inline-block;
+  width: 2px;
+  height: 1em;
+  background: currentColor;
+  animation: blink 1s step-end infinite;
+  vertical-align: text-bottom;
+  margin-left: 1px;
+}
+/* ─── Reward pulse animation ─────────────────────────────────────── */
+@keyframes rewardPulse {
+  0% { transform: scale(1); opacity: 0.7; }
+  50% { transform: scale(1.15); opacity: 1; }
+  100% { transform: scale(1); opacity: 1; }
+}
+.reward-pulse {
+  animation: rewardPulse 0.5s ease-out;
+}
+/* ─── Optimizing banner ──────────────────────────────────────────── */
+@keyframes shimmer {
+  0% { background-position: -200% 0; }
+  100% { background-position: 200% 0; }
+}
+.shimmer-banner {
+  background: linear-gradient(
+    90deg,
+    rgba(139, 92, 246, 0.15) 0%,
+    rgba(139, 92, 246, 0.3) 50%,
+    rgba(139, 92, 246, 0.15) 100%
+  );
+  background-size: 200% 100%;
+  animation: shimmer 2s linear infinite;
+}
+/* ─── Light Mode Global Overrides ─────────────────────────────── */
+[data-theme="light"] .text-white { color: var(--text-primary) !important; }
+[data-theme="light"] .text-white\/70 { color: var(--text-secondary) !important; }
+[data-theme="light"] .text-gray-200 { color: #1f2937 !important; }
+[data-theme="light"] .text-gray-300 { color: #374151 !important; }
+[data-theme="light"] .text-gray-400 { color: #4b5563 !important; }
+[data-theme="light"] .text-gray-500 { color: #6b7280 !important; }
+[data-theme="light"] .text-gray-600 { color: #9ca3af !important; }
+[data-theme="light"] .text-violet-300 { color: #7c3aed !important; }
+[data-theme="light"] .text-violet-400 { color: #7c3aed !important; }
+[data-theme="light"] .text-green-400 { color: #15803d !important; }
+[data-theme="light"] .text-red-400 { color: #b91c1c !important; }
+[data-theme="light"] pre {
+  background-color: var(--bg-tertiary) !important;
+  color: #374151 !important;
+}
+[data-theme="light"] .recharts-cartesian-grid line {
+  stroke: rgba(0, 0, 0, 0.06) !important;
+}

frontend/src/lib/api.ts ADDED Viewed

	@@ -0,0 +1,97 @@

+import type { InitResponse, RLState, SchemaGraph, SSEEvent } from './types'
+const BASE_URL: string = import.meta.env.VITE_API_URL ?? ''
+/**
+ * Turn a streamed response body into a sequence of JSON events.
+ *
+ * The body is read as newline-separated text. Every line starting with
+ * `data:` (the space after the colon is optional) is JSON-parsed and yielded;
+ * other lines and lines that are not valid JSON are skipped. The literal
+ * payload `[DONE]` ends the stream early.
+ *
+ * @param response Response whose body carries the event stream. A response
+ *   without a body yields nothing.
+ * @returns Async generator of parsed events. When iteration stops — end of
+ *   stream, `[DONE]`, an error, or the consumer breaking out of its loop —
+ *   the underlying stream is cancelled and the reader lock released.
+ */
+async function* parseSSE(response: Response): AsyncGenerator<SSEEvent> {
+  // Response.body is null for body-less responses; there is nothing to read then.
+  if (!response.body) return
+  const reader = response.body.getReader()
+  const decoder = new TextDecoder()
+  let buffer = ''
+  try {
+    while (true) {
+      const { done, value } = await reader.read()
+      // At end of stream, flush the decoder and treat whatever is buffered as
+      // complete lines so a final event without a trailing newline is not lost.
+      buffer += done ? decoder.decode() : decoder.decode(value, { stream: true })
+      const lines = buffer.split('\n')
+      buffer = done ? '' : lines.pop() ?? ''
+      for (const line of lines) {
+        if (!line.startsWith('data:')) continue
+        // trim() also removes the optional leading space and a trailing '\r'.
+        const raw = line.slice(5).trim()
+        if (raw === '[DONE]') return
+        let event: SSEEvent
+        try {
+          event = JSON.parse(raw) as SSEEvent
+        } catch {
+          // ignore malformed lines
+          continue
+        }
+        // Yield outside the try so only JSON errors are swallowed.
+        yield event
+      }
+      if (done) return
+    }
+  } finally {
+    // Stop the transfer if we exit early; cancelling a finished stream is harmless,
+    // and a rejection here must not mask the original error.
+    await reader.cancel().catch(() => undefined)
+    reader.releaseLock()
+  }
+}
+/**
+ * POST a question to `/api/execute-query` and stream back the events parsed
+ * from the response by `parseSSE`.
+ *
+ * @throws Error with the HTTP status when the response is not OK.
+ */
+export async function* streamExecuteQuery(
+  question: string,
+  taskId: string
+): AsyncGenerator<SSEEvent> {
+  const payload = JSON.stringify({ question, task_id: taskId })
+  const response = await fetch(`${BASE_URL}/api/execute-query`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: payload,
+  })
+  if (response.ok) {
+    yield* parseSSE(response)
+    return
+  }
+  throw new Error(`HTTP ${response.status}: ${response.statusText}`)
+}
+/**
+ * POST to `/api/benchmark` and stream back the events parsed from the
+ * response by `parseSSE`. `queryIds` is added to the request body only when
+ * it is provided.
+ *
+ * @throws Error with the HTTP status when the response is not OK.
+ */
+export async function* streamBenchmark(
+  taskId: string,
+  queryIds?: string[]
+): AsyncGenerator<SSEEvent> {
+  // NOTE(review): `queryIds` is camelCase while `task_id` is snake_case —
+  // confirm the backend expects this exact key.
+  const payload: Record<string, unknown> = queryIds
+    ? { task_id: taskId, queryIds }
+    : { task_id: taskId }
+  const response = await fetch(`${BASE_URL}/api/benchmark`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(payload),
+  })
+  if (response.ok) {
+    yield* parseSSE(response)
+    return
+  }
+  throw new Error(`HTTP ${response.status}: ${response.statusText}`)
+}
+/** GET `/api/init` and return its JSON body; throws `Error("HTTP <status>")` on a non-OK response. */
+export async function fetchInit(): Promise<InitResponse> {
+  const response = await fetch(`${BASE_URL}/api/init`)
+  if (response.ok) {
+    return response.json() as Promise<InitResponse>
+  }
+  throw new Error(`HTTP ${response.status}`)
+}
+/** GET `/api/rl-state` and return its JSON body; throws `Error("HTTP <status>")` on a non-OK response. */
+export async function fetchRLState(): Promise<RLState> {
+  const response = await fetch(`${BASE_URL}/api/rl-state`)
+  if (response.ok) {
+    return response.json() as Promise<RLState>
+  }
+  throw new Error(`HTTP ${response.status}`)
+}
+/** GET `/api/schema-graph` and return its JSON body; throws `Error("HTTP <status>")` on a non-OK response. */
+export async function fetchSchemaGraph(): Promise<SchemaGraph> {
+  const response = await fetch(`${BASE_URL}/api/schema-graph`)
+  if (response.ok) {
+    return response.json() as Promise<SchemaGraph>
+  }
+  throw new Error(`HTTP ${response.status}`)
+}
+/**
+ * POST the user's verdict on a generated query to `/api/feedback`.
+ *
+ * @param question The question that was asked.
+ * @param sql      The SQL the feedback refers to.
+ * @param correct  Whether the user marked the result as correct.
+ * @throws Error when `fetch` itself fails or the server answers with a non-OK
+ *   status, in line with the other request helpers in this file.
+ */
+export async function submitFeedback(
+  question: string,
+  sql: string,
+  correct: boolean
+): Promise<void> {
+  const res = await fetch(`${BASE_URL}/api/feedback`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({ question, sql, correct }),
+  })
+  // fetch only rejects on network failure; without this check a feedback
+  // submission rejected by the server would be reported as a success.
+  if (!res.ok) {
+    throw new Error(`HTTP ${res.status}: ${res.statusText}`)
+  }
+}
+/**
+ * GET `/api/prompt-history` and return its JSON body (untyped); throws
+ * `Error("HTTP <status>")` on a non-OK response.
+ */
+export async function fetchPromptHistory() {
+  const response = await fetch(`${BASE_URL}/api/prompt-history`)
+  if (response.ok) {
+    return response.json()
+  }
+  throw new Error(`HTTP ${response.status}`)
+}

frontend/src/lib/types.ts ADDED Viewed

	@@ -0,0 +1,131 @@

+// ─── Chat Types ──────────────────────────────────────────────────
+export type MessageStatus = 'streaming' | 'done' | 'error'
+export type FeedbackType = 'correct' | 'wrong' | null
+/** Element type of `ChatMessage.steps`. */
+export interface AttemptStep {
+  attempt: number
+  sql: string
+  // Presumably the execution error of this attempt, absent on success — confirm against the backend events.
+  error?: string
+  // Presumably the RL repair action chosen after this attempt, with its score — confirm.
+  action?: string
+  actionScore?: number
+  reward?: number
+}
+/** One entry of the store's `messages` list. */
+export interface ChatMessage {
+  id: string
+  question: string
+  status: MessageStatus
+  sql: string
+  // Same types as the `rows` / `rowCount` props of ResultsTable.
+  rows: Record<string, unknown>[]
+  rowCount: number
+  errorMsg?: string
+  attempts: number
+  steps: AttemptStep[]
+  reward?: number
+  rlAction?: string
+  rlActionScore?: number
+  // `null` is part of FeedbackType — presumably "no feedback given yet"; confirm.
+  feedback: FeedbackType
+  feedbackSending?: boolean
+  promptGeneration: number
+  streamingCursor?: boolean
+}
+// ─── Benchmark Types ─────────────────────────────────────────────
+export type BenchmarkStatus = 'pending' | 'running' | 'pass' | 'fail'
+export type Difficulty = 'easy' | 'medium' | 'hard'
+export interface BenchmarkQuery {
+  id: string
+  question: string
+  difficulty: Difficulty
+}
+/** One entry of the store's `benchmarkResults` list. */
+export interface BenchmarkResult {
+  id: string
+  question: string
+  difficulty: Difficulty
+  status: BenchmarkStatus
+  // The remaining fields are nullable — presumably null until the query has been run; confirm.
+  score: number | null
+  sql: string | null
+  reason: string | null
+  attempts: number | null
+  refRowCount: number | null
+  agentRowCount: number | null
+}
+// ─── RL State ────────────────────────────────────────────────────
+/**
+ * One point of `RLState.episodes`. PerformanceGraph uses `episode` as the
+ * x-axis key and plots `totalReward` and `successRate` as separate lines.
+ */
+export interface RLEpisode {
+  episode: number
+  // Plotted on a y-axis fixed to [-1, 1].
+  totalReward: number
+  // Plotted on a y-axis fixed to [0, 1].
+  successRate: number
+}
+/**
+ * One bar of `RLState.actionDistribution`: `action` labels the bar (the chart
+ * strips a "FIX_" prefix from the tick label) and `count` is its height.
+ */
+export interface ActionCount {
+  action: string
+  count: number
+}
+/** Return type of `fetchRLState`; rendered by PerformanceGraph. */
+export interface RLState {
+  totalEpisodes: number
+  // Displayed as `successRate * 100` percent, i.e. treated as a fraction.
+  successRate: number
+  // Displayed with three decimals under the label "Alpha".
+  currentAlpha: number
+  episodes: RLEpisode[]
+  actionDistribution: ActionCount[]
+  currentGeneration: number
+}
+// ─── GEPA / Prompt ───────────────────────────────────────────────
+/**
+ * A recorded prompt generation.
+ * NOTE(review): presumably the element type of the store's `promptHistory`,
+ * which PromptEvolution renders as generation / score / summary / timestamp — confirm.
+ */
+export interface PromptSnapshot {
+  generation: number
+  prompt: string
+  score: number
+  summary: string
+  timestamp: string
+}
+// ─── Schema ──────────────────────────────────────────────────────
+/** Element type of the store's `tables` list and of `InitResponse.tables`. */
+export interface TableInfo {
+  name: string
+  // Shown next to the table name in LeftSidebar, formatted with toLocaleString().
+  rows: number
+}
+export interface ColumnInfo {
+  name: string
+  type: string
+  pk?: boolean
+  fk?: string
+}
+export interface SchemaTable {
+  name: string
+  columns: ColumnInfo[]
+}
+export interface SchemaRelationship {
+  from: string
+  fromCol: string
+  to: string
+  toCol: string
+}
+/** Return type of `fetchSchemaGraph`: the tables with their columns, plus the relationships between them. */
+export interface SchemaGraph {
+  tables: SchemaTable[]
+  relationships: SchemaRelationship[]
+}
+// ─── API Response Types ──────────────────────────────────────────
+/** Shape of an init response: a `seeded` flag plus the list of tables. */
+export interface InitResponse {
+  // NOTE(review): presumably whether the database has been seeded — confirm against the backend
+  seeded: boolean
+  tables: TableInfo[]
+}
+/**
+ * Loosely typed event record: a mandatory string `type` plus any number of
+ * additional keys whose values are `unknown` and must be narrowed before use.
+ */
+export interface SSEEvent {
+  type: string
+  [key: string]: unknown
+}

frontend/src/main.tsx ADDED Viewed

	@@ -0,0 +1,19 @@

+import React from 'react'
+import ReactDOM from 'react-dom/client'
+import App from './App'
+import './index.css'
+// Restore the persisted theme before the first render. Only 'light' and 'dark'
+// are valid: a missing, unrecognised or unreadable stored value falls back to
+// 'dark' instead of being written to the data-theme attribute verbatim.
+let initialTheme: 'dark' | 'light' = 'dark'
+try {
+  if (localStorage.getItem('theme') === 'light') initialTheme = 'light'
+} catch {
+  // Reading localStorage can throw; keep the dark default in that case.
+}
+document.documentElement.setAttribute('data-theme', initialTheme)
+// Mount the app; the non-null assertion assumes the host page provides #root.
+ReactDOM.createRoot(document.getElementById('root')!).render(
+  <React.StrictMode>
+    <App />
+  </React.StrictMode>
+)

frontend/src/store/useStore.ts ADDED Viewed

	@@ -0,0 +1,175 @@

+import { create } from 'zustand'
+import type {
+  ChatMessage,
+  BenchmarkResult,
+  RLState,
+  TableInfo,
+  SchemaGraph,
+  PromptSnapshot,
+  Difficulty,
+} from '../lib/types'
+/**
+ * Shape of the single application store: each state slice is declared next to
+ * the actions that modify it.
+ */
+interface Store {
+  // Theme
+  theme: 'dark' | 'light'
+  toggleTheme: () => void
+  // Task
+  taskId: string
+  taskDifficulty: Difficulty
+  setTaskId: (id: string) => void
+  setTaskDifficulty: (d: Difficulty) => void
+  // Init / DB
+  dbSeeded: boolean
+  setDbSeeded: (v: boolean) => void
+  tables: TableInfo[]
+  setTables: (tables: TableInfo[]) => void
+  // null until a schema graph has been set
+  schemaGraph: SchemaGraph | null
+  setSchemaGraph: (g: SchemaGraph) => void
+  // Chat
+  messages: ChatMessage[]
+  addMessage: (msg: ChatMessage) => void
+  // Merges `update` into the message whose id matches
+  updateMessage: (id: string, update: Partial<ChatMessage>) => void
+  clearMessages: () => void
+  isExecuting: boolean
+  setIsExecuting: (v: boolean) => void
+  optimizingBanner: boolean
+  setOptimizingBanner: (v: boolean) => void
+  // Benchmark
+  benchmarkResults: BenchmarkResult[]
+  setBenchmarkResults: (r: BenchmarkResult[]) => void
+  // Replaces the entry whose id matches `r.id`
+  updateBenchmarkResult: (r: BenchmarkResult) => void
+  // Returns every entry to 'pending' with null outcome fields and clears overallScore
+  resetBenchmark: () => void
+  isBenchmarking: boolean
+  setIsBenchmarking: (v: boolean) => void
+  activeBenchmarkId: string | null
+  setActiveBenchmarkId: (id: string | null) => void
+  // null until a score has been set
+  overallScore: number | null
+  setOverallScore: (s: number) => void
+  // RL State
+  rlState: RLState | null
+  setRlState: (s: RLState) => void
+  // GEPA / Prompt
+  currentPrompt: string
+  promptGeneration: number
+  promptHistory: PromptSnapshot[]
+  // Sets prompt text, generation and history together in one update
+  setPromptData: (data: { prompt: string; generation: number; history: PromptSnapshot[] }) => void
+}
+/** Builds a benchmark entry that has not been run yet: status 'pending', every outcome field null. */
+const pendingResult = (id: string, question: string, difficulty: Difficulty): BenchmarkResult => ({
+  id,
+  question,
+  difficulty,
+  status: 'pending',
+  score: null,
+  sql: null,
+  reason: null,
+  attempts: null,
+  refRowCount: null,
+  agentRowCount: null,
+})
+// Initial (not yet run) entries for the 'easy' tier.
+const EASY_QUERIES: BenchmarkResult[] = [
+  pendingResult('E1', 'Show all products', 'easy'),
+  pendingResult('E2', 'List all users from the USA', 'easy'),
+  pendingResult('E3', 'What product categories exist?', 'easy'),
+  pendingResult('E4', 'How many orders are in the database?', 'easy'),
+  pendingResult('E5', 'Show all sellers with their names', 'easy'),
+]
+// Initial (not yet run) entries for the 'medium' tier.
+const MEDIUM_QUERIES: BenchmarkResult[] = [
+  pendingResult('M1', 'Top 5 sellers by total revenue', 'medium'),
+  pendingResult('M2', 'Average order value by country', 'medium'),
+  pendingResult('M3', 'Products with stock below 10 units', 'medium'),
+  pendingResult('M4', 'Monthly order count for the last 12 months', 'medium'),
+  pendingResult('M5', 'Categories ranked by number of products', 'medium'),
+]
+// Initial (not yet run) entries for the 'hard' tier.
+const HARD_QUERIES: BenchmarkResult[] = [
+  pendingResult('H1', 'Rolling 7-day revenue for the past 30 days', 'hard'),
+  pendingResult('H2', 'Seller ranking with rank change from previous month', 'hard'),
+  pendingResult('H3', 'Cohort retention analysis by signup month', 'hard'),
+  pendingResult('H4', 'Identify top products contributing to 80% of revenue (Pareto)', 'hard'),
+  pendingResult('H5', 'Customer lifetime value segmented by acquisition channel', 'hard'),
+]
+/** Reads the theme that toggleTheme persists; anything other than 'light', or a storage error, yields 'dark'. */
+const readPersistedTheme = (): 'dark' | 'light' => {
+  try {
+    return localStorage.getItem('theme') === 'light' ? 'light' : 'dark'
+  } catch {
+    return 'dark'
+  }
+}
+/** Application store: initial state for every slice plus the actions that update it. */
+export const useStore = create<Store>((set) => ({
+  // Theme — seeded from the persisted value so the store agrees with the
+  // data-theme attribute applied at startup. With a hard-coded 'dark', a user
+  // who had saved 'light' got a first toggle after reload that changed nothing.
+  theme: readPersistedTheme(),
+  toggleTheme: () =>
+    set((s) => {
+      const next = s.theme === 'dark' ? 'light' : 'dark'
+      document.documentElement.setAttribute('data-theme', next)
+      // Persisting is best-effort; a storage failure must not block the toggle.
+      try { localStorage.setItem('theme', next) } catch { /* noop */ }
+      return { theme: next }
+    }),
+  // Task
+  taskId: 'easy',
+  taskDifficulty: 'easy',
+  setTaskId: (id) => set({ taskId: id }),
+  // Switching difficulty also switches taskId, loads that tier's pending
+  // entries and clears the overall score.
+  setTaskDifficulty: (d) =>
+    set({
+      taskDifficulty: d,
+      taskId: d,
+      benchmarkResults:
+        d === 'easy' ? EASY_QUERIES : d === 'medium' ? MEDIUM_QUERIES : HARD_QUERIES,
+      overallScore: null,
+    }),
+  // Init
+  dbSeeded: false,
+  setDbSeeded: (v) => set({ dbSeeded: v }),
+  tables: [],
+  setTables: (tables) => set({ tables }),
+  schemaGraph: null,
+  setSchemaGraph: (g) => set({ schemaGraph: g }),
+  // Chat
+  messages: [],
+  addMessage: (msg) => set((s) => ({ messages: [...s.messages, msg] })),
+  // Shallow-merges `update` into the message with the matching id.
+  updateMessage: (id, update) =>
+    set((s) => ({
+      messages: s.messages.map((m) => (m.id === id ? { ...m, ...update } : m)),
+    })),
+  clearMessages: () => set({ messages: [] }),
+  isExecuting: false,
+  setIsExecuting: (v) => set({ isExecuting: v }),
+  optimizingBanner: false,
+  setOptimizingBanner: (v) => set({ optimizingBanner: v }),
+  // Benchmark
+  benchmarkResults: EASY_QUERIES,
+  setBenchmarkResults: (r) => set({ benchmarkResults: r }),
+  // Replaces the entry with the matching id; builds a new array, so the
+  // shared *_QUERIES constants are never mutated.
+  updateBenchmarkResult: (r) =>
+    set((s) => ({
+      benchmarkResults: s.benchmarkResults.map((br) => (br.id === r.id ? r : br)),
+    })),
+  // Keeps each entry's id/question/difficulty and clears everything else.
+  resetBenchmark: () =>
+    set((s) => ({
+      benchmarkResults: s.benchmarkResults.map((r) => ({
+        ...r,
+        status: 'pending' as const,
+        score: null,
+        sql: null,
+        reason: null,
+        attempts: null,
+        refRowCount: null,
+        agentRowCount: null,
+      })),
+      overallScore: null,
+    })),
+  isBenchmarking: false,
+  setIsBenchmarking: (v) => set({ isBenchmarking: v }),
+  activeBenchmarkId: null,
+  setActiveBenchmarkId: (id) => set({ activeBenchmarkId: id }),
+  overallScore: null,
+  setOverallScore: (s) => set({ overallScore: s }),
+  // RL State
+  rlState: null,
+  setRlState: (s) => set({ rlState: s }),
+  // GEPA
+  currentPrompt: '',
+  promptGeneration: 0,
+  promptHistory: [],
+  setPromptData: (data) =>
+    set({
+      currentPrompt: data.prompt,
+      promptGeneration: data.generation,
+      promptHistory: data.history,
+    }),
+}))

frontend/src/vite-env.d.ts ADDED Viewed

	@@ -0,0 +1,9 @@

+/// <reference types="vite/client" />
+// Typed view of the environment variables exposed on import.meta.env.
+interface ImportMetaEnv {
+  // Optional, so consumers must handle it being undefined.
+  // NOTE(review): presumably the backend base URL — confirm where it is read.
+  readonly VITE_API_URL?: string
+}
+// Attaches the typed env object to import.meta.
+interface ImportMeta {
+  readonly env: ImportMetaEnv
+}

frontend/tailwind.config.js ADDED Viewed

	@@ -0,0 +1,20 @@

+// Tailwind configuration: scanned files, three extra background colours and the mono font stack.
+/** @type {import('tailwindcss').Config} */
+export default {
+  // Files Tailwind scans for class names.
+  content: [
+    './index.html',
+    './src/**/*.{js,ts,jsx,tsx}',
+  ],
+  theme: {
+    // Under `extend`, these entries are added to the default theme rather than replacing it.
+    extend: {
+      colors: {
+        'bg-primary': '#08080d',
+        'bg-secondary': '#09090f',
+        'bg-card': '#0e0e16',
+      },
+      fontFamily: {
+        mono: ['ui-monospace', '"SF Mono"', 'Consolas', '"Liberation Mono"', 'monospace'],
+      },
+    },
+  },
+  plugins: [],
+}

frontend/tsconfig.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "compilerOptions": {
+    "target": "ES2020",
+    "useDefineForClassFields": true,
+    "lib": ["ES2020", "DOM", "DOM.Iterable"],
+    "module": "ESNext",
+    "skipLibCheck": true,
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "resolveJsonModule": true,
+    "isolatedModules": true,
+    "noEmit": true,
+    "jsx": "react-jsx",
+    "strict": true,
+    "noUnusedLocals": true,
+    "noUnusedParameters": true,
+    "noFallthroughCasesInSwitch": true,
+    "baseUrl": ".",
+    "paths": {
+      "@/*": ["./src/*"]
+    }
+  },
+  "include": ["src"]
+}

frontend/vite.config.ts ADDED Viewed

	@@ -0,0 +1,28 @@

+import { defineConfig } from 'vite'
+import react from '@vitejs/plugin-react'
+import path from 'path'
+// Vite configuration: React plugin, '@' import alias, dev-server proxy rules and build output directory.
+export default defineConfig({
+  plugins: [react()],
+  resolve: {
+    alias: {
+      // '@' resolves to the src directory next to this config file.
+      '@': path.resolve(__dirname, './src'),
+    },
+  },
+  server: {
+    port: 5173,
+    // Requests whose path starts with /api or /env are forwarded to http://localhost:8000.
+    proxy: {
+      '/api': {
+        target: 'http://localhost:8000',
+        changeOrigin: true,
+      },
+      '/env': {
+        target: 'http://localhost:8000',
+        changeOrigin: true,
+      },
+    },
+  },
+  build: {
+    outDir: 'dist',
+  },
+})

inference.py ADDED Viewed

	@@ -0,0 +1,230 @@

+"""
+SQL Agent OpenEnv — Baseline Inference Script
+==============================================
+Runs a baseline LLM agent against all 3 tasks of the SQL Agent OpenEnv environment.
+Environment variables (required):
+  API_BASE_URL   — OpenAI-compatible base URL  (default: https://router.huggingface.co/v1)
+  MODEL_NAME     — Model identifier            (default: Qwen/Qwen2.5-72B-Instruct)
+  HF_TOKEN       — Hugging Face / API key
+STDOUT format (strictly enforced):
+  [START] task=<task_id> env=sql-agent-openenv model=<model>
+  [STEP]  step=<n> action=<action> reward=<0.00> done=<true|false> error=<msg|null>
+  [END]   success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...>
+"""
+from __future__ import annotations
+import asyncio
+import os
+import sys
+import textwrap
+from typing import List, Optional
+# ── Path setup (inference.py lives at repo root; backend is a subdirectory) ──
+_BACKEND = os.path.join(os.path.dirname(os.path.abspath(__file__)), "backend")
+if _BACKEND not in sys.path:
+    sys.path.insert(0, _BACKEND)
+from openai import OpenAI  # noqa: E402
+from env.sql_env import SQLAgentEnv, Action, Observation  # noqa: E402
+# ── Config ────────────────────────────────────────────────────────────────────
+# API key: first non-empty value among HF_TOKEN, API_KEY and OPENAI_API_KEY ("" if none is set).
+API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY") or os.getenv("OPENAI_API_KEY", "")
+API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
+# Printed as env=... on every [START] line.
+BENCHMARK = "sql-agent-openenv"
+# Task ids that main() runs in this order, one episode each.
+TASKS = ["simple_queries", "join_queries", "complex_queries"]
+# Upper bound on steps per episode; also the divisor run_episode uses to normalise the score.
+MAX_STEPS = 5
+# Settings for the action-selection LLM call in pick_action.
+TEMPERATURE = 0.2
+MAX_TOKENS = 50
+# Action names pick_action looks for in the model's reply, checked in this order.
+# "generate" is not listed: pick_action returns it directly without calling the model.
+REPAIR_ACTIONS = [
+    "rewrite_full",
+    "fix_column",
+    "fix_table",
+    "add_groupby",
+    "rewrite_cte",
+    "fix_syntax",
+    "change_dialect",
+    "relax_filter",
+]
+# System message sent with every LLM call made by pick_action. It lists nine
+# actions including "generate"; pick_action only matches the reply against
+# REPAIR_ACTIONS, which omits "generate".
+SYSTEM_PROMPT = textwrap.dedent("""
+    You are an expert SQL agent interacting with a SQL repair environment.
+    At each step you receive a natural language question, a database schema,
+    and optionally the last SQL attempt + error message.
+    Your job: pick ONE repair action from the list below that is most likely
+    to fix the SQL error on the next attempt.
+    Available actions:
+      generate       — write fresh SQL from scratch (use on first attempt)
+      rewrite_full   — completely rewrite the query from scratch
+      fix_column     — fix wrong column name references
+      fix_table      — fix wrong table name references
+      add_groupby    — add or fix GROUP BY / aggregation clauses
+      rewrite_cte    — restructure subqueries or CTEs
+      fix_syntax     — fix syntax errors (brackets, commas, keywords)
+      change_dialect — convert to SQLite-compatible functions
+      relax_filter   — broaden or remove overly strict WHERE conditions
+    Reply with ONLY the action name. No explanation. No punctuation.
+    Example: fix_column
+""").strip()
+# ── Logging ───────────────────────────────────────────────────────────────────
+def log_start(task: str, model: str) -> None:
+    print(f"[START] task={task} env={BENCHMARK} model={model}", flush=True)
+def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
+    """Emit one ``[STEP]`` line on stdout.
+    Newlines in ``error`` are replaced by spaces so the record stays on one line.
+    ``None``, an empty string, or text that is blank after that flattening is
+    reported as the literal ``null``, so the ``error=`` field is never left empty.
+    """
+    flattened = error.replace("\n", " ").strip() if error else ""
+    # A whitespace-only message would otherwise print "error=" with no value.
+    error_val = flattened or "null"
+    done_val = str(done).lower()
+    print(
+        f"[STEP] step={step} action={action} reward={reward:.2f} "
+        f"done={done_val} error={error_val}",
+        flush=True,
+    )
+def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
+    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
+    print(
+        f"[END] success={str(success).lower()} steps={steps} "
+        f"score={score:.3f} rewards={rewards_str}",
+        flush=True,
+    )
+# ── LLM helper ────────────────────────────────────────────────────────────────
+def pick_action(
+    client: OpenAI,
+    obs: Observation,
+    step: int,
+) -> str:
+    """Ask the LLM to pick a repair action given the current observation."""
+    if step == 1 or obs.current_sql is None:
+        return "generate"
+    user_msg = textwrap.dedent(f"""
+        Question: {obs.question}
+        Current SQL (failed):
+        {obs.current_sql}
+        Error: {obs.error_message or "unknown"}
+        Error class: {obs.error_class or "unknown"}
+        Attempt number: {obs.attempt_number} of {obs.max_attempts}
+        Which repair action should I use next?
+    """).strip()
+    try:
+        completion = client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": user_msg},
+            ],
+            temperature=TEMPERATURE,
+            max_tokens=MAX_TOKENS,
+        )
+        raw = (completion.choices[0].message.content or "").strip().lower()
+        # Normalise to valid action name
+        for action in REPAIR_ACTIONS:
+            if action in raw:
+                return action
+        return "rewrite_full"
+    except Exception as exc:
+        print(f"[DEBUG] LLM call failed: {exc}", flush=True)
+        return "rewrite_full"
+# ── Single-episode runner ─────────────────────────────────────────────────────
+async def run_episode(
+    env: SQLAgentEnv,
+    client: OpenAI,
+    task_id: str,
+) -> None:
+    """Run one full episode for a task, emitting structured stdout logs."""
+    log_start(task=task_id, model=MODEL_NAME)
+    rewards: List[float] = []
+    steps_taken = 0
+    score = 0.0
+    success = False
+    last_error: Optional[str] = None
+    try:
+        obs = env.reset(task_id)
+        for step in range(1, MAX_STEPS + 1):
+            action_name = pick_action(client, obs, step)
+            action = Action(repair_action=action_name)
+            try:
+                obs, reward_info = await env.step(action)
+            except RuntimeError as exc:
+                log_step(step=step, action=action_name, reward=0.0, done=True, error=str(exc))
+                rewards.append(0.0)
+                steps_taken = step
+                break
+            reward = reward_info.value
+            done = reward_info.done
+            last_error = obs.error_message
+            success = reward_info.success
+            rewards.append(reward)
+            steps_taken = step
+            log_step(
+                step=step,
+                action=action_name,
+                reward=reward,
+                done=done,
+                error=last_error,
+            )
+            if done:
+                break
+        # Score: clamp sum of rewards to [0, 1]
+        total = sum(rewards)
+        max_possible = MAX_STEPS * 1.0  # max reward per step is 1.0
+        score = min(max(total / max_possible, 0.0), 1.0)
+    finally:
+        log_end(
+            success=success,
+            steps=steps_taken,
+            score=score,
+            rewards=rewards,
+        )
+# ── Main ──────────────────────────────────────────────────────────────────────
+async def main() -> None:
+    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+    env = SQLAgentEnv()
+    for task_id in TASKS:
+        await run_episode(env, client, task_id)
+        # Small gap between tasks for readability
+        print("", flush=True)
+if __name__ == "__main__":
+    asyncio.run(main())

openenv.yaml ADDED Viewed

	@@ -0,0 +1,137 @@

+name: sql-agent-openenv
+version: "1.0.0"
+description: >
+  A SQL generation and repair environment where an AI agent learns to write
+  correct SQL queries through a self-debugging loop powered by a LinUCB
+  contextual bandit and GEPA prompt evolution. Models real-world data analyst
+  workflows — querying databases with natural language, handling errors, and
+  iteratively improving.
+author: sql-agent-openenv-team
+tags:
+  - openenv
+  - sql
+  - rl
+  - nlp
+  - contextual-bandit
+# ── Endpoints ────────────────────────────────────────────────────────────────
+api:
+  reset: /reset
+  step: /step
+  state: /state
+# ── Action Space ─────────────────────────────────────────────────────────────
+action_space:
+  type: discrete
+  n: 9
+  actions:
+    - name: generate
+      description: "Generate SQL from scratch (first attempt)"
+    - name: rewrite_full
+      description: "Completely rewrite the query from scratch"
+    - name: fix_column
+      description: "Fix wrong column name references using schema"
+    - name: fix_table
+      description: "Fix wrong table name references or JOIN structure"
+    - name: add_groupby
+      description: "Add or fix GROUP BY / aggregation clauses"
+    - name: rewrite_cte
+      description: "Restructure CTEs or subqueries"
+    - name: fix_syntax
+      description: "Fix syntax errors (brackets, commas, keywords)"
+    - name: change_dialect
+      description: "Convert to SQLite-compatible functions"
+    - name: relax_filter
+      description: "Broaden or remove overly strict WHERE conditions"
+# ── Observation Space ────────────────────────────────────────────────────────
+observation_space:
+  type: dict
+  fields:
+    - name: question
+      type: string
+      description: "Natural language question to answer with SQL"
+    - name: schema_info
+      type: string
+      description: "Full database schema (tables, columns, types, FK relationships)"
+    - name: current_sql
+      type: string
+      nullable: true
+      description: "The SQL generated on the last attempt (null on first step)"
+    - name: error_message
+      type: string
+      nullable: true
+      description: "SQLite error message from the last attempt (null on success)"
+    - name: error_class
+      type: string
+      nullable: true
+      description: "Classified error type (e.g. no_such_column, syntax_error)"
+    - name: attempt_number
+      type: integer
+      description: "Current attempt number (0 at reset, increments each step)"
+    - name: max_attempts
+      type: integer
+      description: "Maximum allowed attempts per episode (5)"
+    - name: task_id
+      type: string
+      description: "Active task identifier"
+    - name: task_difficulty
+      type: string
+      description: "Task difficulty level: easy | medium | hard"
+# ── Reward ───────────────────────────────────────────────────────────────────
+reward:
+  range: [-1.5, 1.5]
+  description: >
+    Shaped reward providing partial progress signals throughout the episode.
+    Success on attempt N: 1.0 - 0.1*(N-1).
+    Failure step: -0.1 - 0.05*N + severity_improvement_bonus + error_class_change_bonus.
+    Penalizes infinite loops (consecutive same error) and rewards convergence toward correct SQL.
+# ── Tasks ────────────────────────────────────────────────────────────────────
+tasks:
+  - id: simple_queries
+    name: Simple SQL Queries
+    difficulty: easy
+    description: >
+      Single-table SELECT queries. Agent must retrieve correct rows by applying
+      basic filters and projections on the marketplace database.
+    question_count: 5
+    grader: >
+      Checks that required output columns are present and row count falls
+      within expected bounds. Attempt penalty not applied.
+  - id: join_queries
+    name: SQL Join Queries
+    difficulty: medium
+    description: >
+      Multi-table JOIN queries with GROUP BY and aggregation. Agent must
+      correctly join tables and compute aggregates over the marketplace data.
+    question_count: 5
+    grader: >
+      Correct columns + row count score multiplied by (1.0 - 0.1*(attempts-1)).
+      Rewards efficient, first-try solutions.
+  - id: complex_queries
+    name: Complex SQL Queries
+    difficulty: hard
+    description: >
+      Advanced queries using CTEs, window functions, nested aggregations, and
+      multi-level joins. Requires precise SQLite syntax knowledge.
+    question_count: 5
+    grader: >
+      Strict correctness required. Score capped at 0.8 without first-attempt
+      bonus. Attempt penalty of 0.1*(attempts-1) applied. Hard tasks genuinely
+      challenge frontier models.
+# ── Environment Metadata ─────────────────────────────────────────────────────
+metadata:
+  max_steps_per_episode: 5
+  database: SQLite (marketplace schema — users, products, orders, reviews, sellers)
+  rl_algorithm: LinUCB contextual bandit (feature_dim=20, 8 repair actions)
+  prompt_optimizer: GEPA (Generative Evolutionary Prompt Adaptation)
+  runtime_estimate_minutes: 5
+  compute_requirements:
+    vcpu: 2
+    memory_gb: 4