abhinavthedev committed · verified
Commit aa3a171 · 1 Parent(s): 0a519f8

Upload folder using huggingface_hub
Dockerfile ADDED
@@ -0,0 +1,81 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # Multi-stage build using openenv-base
+ # This Dockerfile is flexible and works for both:
+ # - In-repo environments (with local OpenEnv sources)
+ # - Standalone environments (with openenv from PyPI/Git)
+ # The build script (openenv build) handles context detection and sets appropriate build args.
+
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+ FROM ${BASE_IMAGE} AS builder
+
+ WORKDIR /app
+
+ # Ensure git is available (required for installing dependencies from VCS)
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends git && \
+     rm -rf /var/lib/apt/lists/*
+
+ # Build argument to control whether we're building standalone or in-repo
+ ARG BUILD_MODE=in-repo
+ ARG ENV_NAME=sql_debug
+
+ # Copy environment code (always at root of build context)
+ COPY . /app/env
+
+ # For in-repo builds, openenv is already vendored in the build context
+ # For standalone builds, openenv will be installed via pyproject.toml
+ WORKDIR /app/env
+
+ # Ensure uv is available (for local builds where base image lacks it)
+ RUN if ! command -v uv >/dev/null 2>&1; then \
+         curl -LsSf https://astral.sh/uv/install.sh | sh && \
+         mv /root/.local/bin/uv /usr/local/bin/uv && \
+         mv /root/.local/bin/uvx /usr/local/bin/uvx; \
+     fi
+
+ # Install dependencies using uv sync
+ # If uv.lock exists, use it; otherwise resolve on the fly
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     if [ -f uv.lock ]; then \
+         uv sync --frozen --no-install-project --no-editable; \
+     else \
+         uv sync --no-install-project --no-editable; \
+     fi
+
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     if [ -f uv.lock ]; then \
+         uv sync --frozen --no-editable; \
+     else \
+         uv sync --no-editable; \
+     fi
+
+ # Final runtime stage
+ FROM ${BASE_IMAGE}
+
+ WORKDIR /app
+
+ # Copy the virtual environment from builder
+ COPY --from=builder /app/env/.venv /app/.venv
+
+ # Copy the environment code
+ COPY --from=builder /app/env /app/env
+
+ # Set PATH to use the virtual environment
+ ENV PATH="/app/.venv/bin:$PATH"
+
+ # Set PYTHONPATH so imports work correctly
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
+
+ # Health check
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+     CMD curl -f http://localhost:8000/health || exit 1
+
+ # Run the FastAPI server
+ # The module path is constructed to work with the /app/env structure
+ ENV ENABLE_WEB_INTERFACE=true
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
README.md CHANGED
@@ -1,10 +1,231 @@
- ---
- title: Sql Debug
- emoji: 🏃
- colorFrom: green
- colorTo: indigo
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: Sql Debug Environment Server
+ emoji: 🏒
+ colorFrom: pink
+ colorTo: red
+ sdk: docker
+ pinned: false
+ app_port: 8000
+ base_path: /web
+ tags:
+   - openenv
+   - sql
+   - debugging
+   - optimization
+ ---
+
+ # 🏒 OpenEnv: SQL Debug Environment
+
+ An [OpenEnv](https://openenv.dev)-compliant environment where AI agents fix broken SQL queries and optimize slow ones against in-memory SQLite databases.
+
+ > ✅ **Validator:** `openenv validate` passes against this environment
+ > 🚀 **Live API:** `https://abhinavthedev-sql-debug.hf.space`
+ > 📖 **Swagger UI:** `https://abhinavthedev-sql-debug.hf.space/docs`
+
+ ---
+
+ ## 🎯 Environment Description
+
+ This environment simulates the work of a SQL engineer who must repair syntax errors, correct logic bugs, and improve query performance. Agents receive a schema, a broken or slow query, and a natural-language target description. They submit SQL queries, observe the execution result and query plan, and are scored on correctness and efficiency.
+
+ The environment is intentionally practical: each task mirrors a real debugging pattern used in analytics, reporting, and data engineering workflows.
+
+ ---
+
+ ## 📋 Tasks
+
+ ### Task 1 - Syntax Fix *(Easy)*
+ **Task ID:** `syntax_fix_001`
+
+ **Objective:** Fix a malformed query so it returns all orders where `amount > 500`.
+
+ | Field | Description |
+ |---|---|
+ | `schema` | `orders` table with `id`, `customer`, `amount`, `order_date` |
+ | `broken_query` | `SELEC * FORM orders WERE amount > 500` |
+ | `target` | Return all orders where amount is greater than 500 |
+
+ **Max steps:** 5 | **Difficulty:** Easy
+
+ ---
+
+ ### Task 2 - Logic Fix *(Medium)*
+ **Task ID:** `logic_fix_001`
+
+ **Objective:** Correct a join bug so only employees in valid departments are returned.
+
+ | Field | Description |
+ |---|---|
+ | `schema` | `employees` and `departments` tables |
+ | `broken_query` | Query uses `LEFT JOIN` but should exclude missing departments |
+ | `target` | Return employees in departments with budget > 400000 |
+
+ **Max steps:** 8 | **Difficulty:** Medium
+
+ ---
+
+ ### Task 3 - Query Optimization *(Hard)*
+ **Task ID:** `optimize_001`
+
+ **Objective:** Rewrite a correlated subquery into an efficient CTE or grouped subquery.
+
+ | Field | Description |
+ |---|---|
+ | `schema` | `transactions` table with generated sample rows |
+ | `broken_query` | Correlated subquery that scans per row |
+ | `target` | Return completed transactions above the user's average amount |
+
+ **Max steps:** 10 | **Difficulty:** Hard
+
+ ---
+
+ ## 🔌 API Reference
+
+ ### Base URL
+ ```text
+ https://abhinavthedev-sql-debug.hf.space
+ ```
+
+ ### Core Endpoints
+
+ | Method | Endpoint | Description |
+ |---|---|---|
+ | `POST` | `/reset` | Start a new episode; pass `task_id` to choose a task |
+ | `POST` | `/step` | Submit a SQL query and receive the next observation |
+ | `GET` | `/state/{session_id}` | Inspect the current episode state |
+ | `GET` | `/schema` | View action, observation, and state schemas |
+ | `GET` | `/ws` | WebSocket endpoint for low-latency sessions |
+ | `GET` | `/health` | Health check |
+ | `GET` | `/docs` | Swagger UI |
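The endpoints above can be driven with any HTTP client. A minimal stdlib sketch follows; the request and response shapes here are assumptions based on the tables in this README, not the official OpenEnv client:

```python
import json
import urllib.request

BASE_URL = "https://abhinavthedev-sql-debug.hf.space"


def post_json(url: str, payload: dict) -> dict:
    """POST a JSON body and decode the JSON response."""
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())


def step_payload(query: str) -> dict:
    # Action body shape, taken from the Action Space section of this README
    return {"query": query}


if __name__ == "__main__":
    # /reset body shape is assumed from the endpoint description above
    obs = post_json(f"{BASE_URL}/reset", {"task_id": "syntax_fix_001"})
    result = post_json(f"{BASE_URL}/step", step_payload("SELECT * FROM orders WHERE amount > 500"))
    print(result.get("reward"), result.get("done"))
```

For real use, the typed `SQLDebugEnv` client in `client.py` is the intended interface; this sketch only shows the wire format.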
+
+ ---
+
+ ## 🎮 Action Space
+
+ The agent submits a single SQL query each step.
+
+ ```json
+ {
+   "query": "SELECT * FROM orders WHERE amount > 500"
+ }
+ ```
+
+ ### Example Actions
+
+ ```json
+ { "query": "SELECT * FROM orders WHERE amount > 500" }
+
+ { "query": "SELECT e.name, d.dept_name FROM employees e INNER JOIN departments d ON e.dept_id = d.id WHERE d.budget > 400000" }
+
+ { "query": "WITH avg_amount AS (SELECT user_id, AVG(amount) AS avg_amount FROM transactions GROUP BY user_id) SELECT t.* FROM transactions t JOIN avg_amount a ON t.user_id = a.user_id WHERE t.status = 'completed' AND t.amount > a.avg_amount" }
+ ```
+
+ ---
+
+ ## 📊 Observation Space
+
+ ```json
+ {
+   "task_id": "syntax_fix_001",
+   "schema_sql": "CREATE TABLE orders (...)",
+   "current_query": "SELEC * FORM orders WERE amount > 500",
+   "error_message": "near \"SELEC\": syntax error",
+   "query_result": [],
+   "execution_plan": "",
+   "step_count": 0,
+   "target_description": "Return all orders where amount is greater than 500",
+   "reward_so_far": 0.0,
+   "available_tasks": ["syntax_fix_001", "logic_fix_001", "optimize_001"],
+   "done": false,
+   "reward": 0.05
+ }
+ ```
+
+ ---
+
+ ## 💰 Reward Function
+
+ The reward is computed from syntax validity, result correctness, and query plan quality.
+
+ | Event | Reward |
+ |---|---|
+ | Query fails with syntax error | `0.05` |
+ | Query runs successfully | `0.2` base credit |
+ | Correct row match on easy and medium tasks | up to `0.6` of the score |
+ | Good query plan on hard task | up to `0.2` of the score |
+ | Uses correlated-subquery pattern on hard task | heavy plan penalty |
+ | Excessively long query | length penalty |
+
+ Final scores are clamped to the range `[0.0, 1.0]`.
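The table above maps onto a single scoring formula. A minimal sketch of the combination, with weights and penalty constants mirrored from `grader.py` in this diff:

```python
def sketch_score(syntax_ok: bool, result_match_pct: float,
                 plan_score: float, query_len: int = 0) -> float:
    """Mirror of the grader's rule: syntax 20% + correctness 60% + plan 20%."""
    if not syntax_ok:
        return 0.05  # tiny credit so the agent still gets a gradient signal
    base = 0.2 + 0.6 * result_match_pct + 0.2 * plan_score
    # Queries longer than 800 characters start losing score
    length_penalty = max(0.0, (query_len - 800) / 2000)
    return max(0.0, min(1.0, base - length_penalty))

# A fully correct easy-task query with no plan credit scores 0.2 + 0.6 = 0.8
```

Note that on easy and medium tasks `plan_score` stays `0.0`, so a perfect fix tops out at `0.8`; only the hard task can reach `1.0`.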
+
+ ---
+
+ ## 🚀 Setup & Usage
+
+ ### Option 1 - Run Locally
+
+ ```bash
+ pip install -e .
+ uvicorn server.app:app --host 0.0.0.0 --port 8000 --reload
+ # Open http://localhost:8000/docs
+ ```
+
+ ### Option 2 - Run with Docker
+
+ ```bash
+ docker build -t sql-debug-env -f server/Dockerfile .
+ docker run -p 8000:8000 sql-debug-env
+ curl http://localhost:8000/health
+ ```
+
+ ### Option 3 - Run the Inference Loop
+
+ ```bash
+ export SERVER_URL=https://abhinavthedev-sql-debug.hf.space
+ export API_KEY=sk-...
+ python inference.py
+ ```
+
+ The inference script defaults to `syntax_fix_001`, logs each step, and stops when the episode ends or the step budget is reached.
+
+ ---
+
+ ## 🏗️ Project Structure
+
+ ```text
+ sql_exp/
+ ├── client.py                    # OpenEnv client wrapper
+ ├── grader.py                    # Reward computation
+ ├── inference.py                 # LLM-driven inference loop
+ ├── models.py                    # Action and observation models
+ ├── openenv.yaml                 # OpenEnv manifest
+ ├── pyproject.toml               # Project metadata and dependencies
+ ├── runner.py                    # SQLite query runner
+ ├── server/
+ │   ├── app.py                   # FastAPI app and OpenEnv wiring
+ │   ├── Dockerfile               # Container definition
+ │   └── sql_debug_environment.py # Core environment logic
+ ├── tasks/
+ │   ├── task_easy.py             # Syntax-fix task
+ │   ├── task_medium.py           # Join logic task
+ │   └── task_hard.py             # Query optimization task
+ ├── test.py                      # Manual websocket smoke test
+ └── README.md                    # Project overview
+ ```
+
+ ---
+
+ ## 🛠️ Tech Stack
+
+ - **Python 3.10+** - Runtime
+ - **FastAPI** - HTTP framework
+ - **OpenEnv Core** - Environment server and client primitives
+ - **SQLite** - Query execution engine
+ - **Uvicorn** - ASGI server
+ - **Docker** - Containerization
+
+ ---
+
+ ## 📝 License
+
+ BSD-style license, matching the source headers in this repository.
__init__.py ADDED
@@ -0,0 +1,16 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """SQL Debug Environment."""
+
+ from .client import SQLDebugEnv
+ from .models import SQLDebugAction, SQLDebugObservation
+
+ __all__ = [
+     "SQLDebugAction",
+     "SQLDebugObservation",
+     "SQLDebugEnv",
+ ]
client.py ADDED
@@ -0,0 +1,79 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # client.py
+ """
+ SQL Debug Environment client.
+ This is what inference.py uses to talk to the running server.
+ """
+
+ from typing import Dict
+
+ from openenv.core import EnvClient
+ from openenv.core.client_types import StepResult
+ from openenv.core.env_server.types import State
+
+ from models import SQLDebugAction, SQLDebugObservation
+
+
+ class SQLDebugEnv(EnvClient[SQLDebugAction, SQLDebugObservation, State]):
+     """
+     Client for the SQL Debug & Optimizer environment.
+
+     Maintains a persistent WebSocket connection to the server.
+     Each instance gets its own dedicated environment session.
+
+     Usage (direct server):
+         with SQLDebugEnv(base_url="http://localhost:8000") as env:
+             result = env.reset()
+             print(result.observation.target_description)
+             result = env.step(SQLDebugAction(query="SELECT * FROM orders"))
+             print(result.reward)
+
+     Usage (Docker):
+         env = SQLDebugEnv.from_docker_image("sql-debug-env:latest")
+         try:
+             result = env.reset()
+             result = env.step(SQLDebugAction(query="SELECT * FROM orders WHERE amount > 500"))
+         finally:
+             env.close()
+     """
+
+     def _step_payload(self, action: SQLDebugAction) -> Dict:
+         """Convert SQLDebugAction to JSON payload."""
+         return {"query": action.query}
+
+     def _parse_result(self, payload: Dict) -> StepResult[SQLDebugObservation]:
+         """Parse server JSON response into a typed StepResult."""
+         obs_data = payload.get("observation", {})
+
+         observation = SQLDebugObservation(
+             task_id=obs_data.get("task_id", ""),
+             schema_sql=obs_data.get("schema_sql", ""),
+             current_query=obs_data.get("current_query", ""),
+             error_message=obs_data.get("error_message", ""),
+             query_result=obs_data.get("query_result", []),
+             execution_plan=obs_data.get("execution_plan", ""),
+             step_count=obs_data.get("step_count", 0),
+             target_description=obs_data.get("target_description", ""),
+             reward_so_far=obs_data.get("reward_so_far", 0.0),
+             available_tasks=obs_data.get("available_tasks", []),
+             done=payload.get("done", False),
+             reward=payload.get("reward", 0.0),
+         )
+
+         return StepResult(
+             observation=observation,
+             reward=payload.get("reward", 0.0),
+             done=payload.get("done", False),
+         )
+
+     def _parse_state(self, payload: Dict) -> State:
+         """Parse server JSON response into a State object."""
+         return State(
+             episode_id=payload.get("episode_id"),
+             step_count=payload.get("step_count", 0),
+         )
grader.py ADDED
@@ -0,0 +1,74 @@
+ def compute_reward(task: dict, agent_query: str, run_result: dict) -> dict:
+     """
+     task = one of TASK dicts from tasks/
+     agent_query = the SQL string the agent submitted
+     run_result = output from runner.run_query()
+
+     Returns a dict: { value, syntax_ok, result_match_pct, plan_score, message }
+     """
+
+     # ── Step 1: Did the query even run? ───────────────────────────────────────
+     syntax_ok = (run_result["error"] is None)
+
+     if not syntax_ok:
+         # Give tiny credit for trying (not zero, so agent gets gradient signal)
+         return {
+             "value": 0.05,
+             "syntax_ok": False,
+             "result_match_pct": 0.0,
+             "plan_score": 0.0,
+             "message": f"Syntax error: {run_result['error'][:100]}",
+         }
+
+     # ── Step 2: Did we get the right rows? ────────────────────────────────────
+     result_match_pct = 0.0
+
+     if task["expected_rows"] is not None:
+         expected = task["expected_rows"]
+         got = run_result["rows"]
+
+         # Count how many expected rows are present in the result
+         matches = sum(1 for row in expected if row in got)
+         result_match_pct = matches / max(len(expected), 1)
+
+         # Penalize extra rows (returned too many rows = wrong query)
+         if len(got) > len(expected) * 2:
+             result_match_pct *= 0.7  # 30% penalty for bloated results
+
+     else:
+         # Hard task: no fixed rows — give full match credit if query runs
+         result_match_pct = 1.0
+
+     # ── Step 3: Is the query plan good? (hard task only) ─────────────────────
+     plan_score = 0.0
+
+     if task.get("check_plan"):
+         query_upper = agent_query.upper()
+         good_patterns = task.get("good_patterns", [])
+
+         # Each good pattern found = partial credit
+         found = sum(1 for p in good_patterns if p.upper() in query_upper)
+         plan_score = found / max(len(good_patterns), 1)
+
+         # Also penalize if they still use correlated subquery pattern
+         if "WHERE" in query_upper and "SELECT AVG" in query_upper:
+             plan_score *= 0.3  # Heavy penalty — they didn't really optimize
+
+     # ── Step 4: Combine into final score ──────────────────────────────────────
+     # Weights: syntax 20% + correctness 60% + plan 20%
+     base_score = 0.2 + (0.6 * result_match_pct) + (0.2 * plan_score)
+
+     # Penalize absurdly long queries (e.g. agent spams SELECT *)
+     length_penalty = max(0.0, (len(agent_query) - 800) / 2000)
+     final = max(0.0, min(1.0, base_score - length_penalty))
+
+     status = "perfect" if final >= 0.99 else "partial" if final > 0.2 else "wrong"
+     msg = f"{status} | rows matched: {result_match_pct:.0%} | plan: {plan_score:.0%}"
+
+     return {
+         "value": round(final, 3),
+         "syntax_ok": True,
+         "result_match_pct": result_match_pct,
+         "plan_score": plan_score,
+         "message": msg,
+     }
inference.py ADDED
@@ -0,0 +1,194 @@
+ # inference.py
+ """
+ SQL Debug & Optimizer — OpenEnv Inference Script
+
+ Mandatory stdout format:
+     [START] task=<task_name> env=<benchmark> model=<model_name>
+     [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
+     [END] success=<true|false> steps=<n> score=<0.00> rewards=<r1,r2,...>
+ """
+
+ import asyncio
+ import os
+ import textwrap
+ from typing import List, Optional
+
+ from openai import OpenAI
+ from client import SQLDebugEnv, SQLDebugAction
+
+ # ── Mandatory env vars (injected by evaluator on submission) ──────────────────
+ IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://api.groq.com/openai/v1")
+ MODEL_NAME = os.getenv("MODEL_NAME", "llama-3.3-70b-versatile")
+
+ # ── Task + run config ─────────────────────────────────────────────────────────
+ TASK_NAME = os.getenv("SQL_ENV_TASK", "syntax_fix_001")
+ BENCHMARK = "sql-debug-optimizer"
+ MAX_STEPS = 8            # well under 20 min limit; each step is ~2s
+ TEMPERATURE = 0.0        # deterministic = reproducible scores
+ MAX_TOKENS = 400
+ SUCCESS_THRESHOLD = 0.5  # reward >= 0.5 = success
+
+
+ # ── Mandatory stdout loggers — DO NOT change field names or order ─────────────
+
+ def log_start(task: str, env: str, model: str) -> None:
+     print(f"[START] task={task} env={env} model={model}", flush=True)
+
+
+ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
+     # action must be single-line — newlines break log parsing
+     action_clean = action.replace("\n", " ").replace("\r", "").strip()
+     error_val = error if error else "null"
+     done_val = str(done).lower()
+     print(
+         f"[STEP] step={step} action={action_clean} reward={reward:.2f} "
+         f"done={done_val} error={error_val}",
+         flush=True,
+     )
+
+
+ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
+     rewards_str = ",".join(f"{r:.2f}" for r in rewards)
+     print(
+         f"[END] success={str(success).lower()} steps={steps} "
+         f"score={score:.2f} rewards={rewards_str}",
+         flush=True,
+     )
+
+
+ # ── Prompt design ─────────────────────────────────────────────────────────────
+
+ SYSTEM_PROMPT = textwrap.dedent("""
+     You are an expert SQL engineer helping debug and optimize SQL queries.
+
+     Rules (follow exactly):
+     - Respond with ONLY the corrected SQL query.
+     - No markdown, no code fences (no ```sql), no explanation.
+     - No comments inside the SQL.
+     - If the query has a syntax error, fix it first.
+     - If the query has a logic bug (wrong JOIN, wrong WHERE), fix the logic.
+     - If asked to optimize, replace correlated subqueries with CTEs using WITH.
+     - Output raw SQL only — it will be executed directly.
+ """).strip()
+
+
+ def build_prompt(obs) -> str:
+     """Build the user prompt from the current observation."""
+     result_preview = str(obs.query_result[:3]) if obs.query_result else "empty / error"
+     return textwrap.dedent(f"""
+         TASK: {obs.target_description}
+
+         DATABASE SCHEMA:
+         {obs.schema_sql.strip()[:800]}
+
+         CURRENT QUERY (this is broken or slow — fix it):
+         {obs.current_query.strip()}
+
+         ERROR: {obs.error_message or "none"}
+         CURRENT RESULT (first 3 rows): {result_preview}
+         STEP: {obs.step_count + 1} of {MAX_STEPS}
+
+         Write the corrected SQL query:
+     """).strip()
+
+
+ def call_llm(client: OpenAI, obs) -> str:
+     """Ask the LLM for a better SQL query. Returns clean SQL string."""
+     try:
+         completion = client.chat.completions.create(
+             model=MODEL_NAME,
+             messages=[
+                 {"role": "system", "content": SYSTEM_PROMPT},
+                 {"role": "user", "content": build_prompt(obs)},
+             ],
+             temperature=TEMPERATURE,
+             max_tokens=MAX_TOKENS,
+             stream=False,
+         )
+         raw = (completion.choices[0].message.content or "").strip()
+
+         # Strip markdown code fences if model adds them despite instructions
+         if "```" in raw:
+             lines = raw.split("\n")
+             raw = "\n".join(
+                 line for line in lines if not line.strip().startswith("```")
+             ).strip()
+
+         return raw if raw else "SELECT 1"
+
+     except Exception as exc:
+         print(f"[DEBUG] LLM call failed: {exc}", flush=True)
+         return "SELECT 1"
+
+
+ # ── Main loop ─────────────────────────────────────────────────────────────────
+
+ async def main() -> None:
+     client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+
+     # Connect to the environment (Docker or local server)
+     SERVER_URL = os.getenv("SERVER_URL", "http://localhost:8000")
+     env = SQLDebugEnv(base_url=SERVER_URL)
+
+     rewards: List[float] = []
+     steps_taken = 0
+     score = 0.0
+     success = False
+
+     log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
+
+     try:
+         # Reset — get the broken query and task info
+         result = await env.reset(task_id=TASK_NAME)
+         obs = result.observation
+
+         for step in range(1, MAX_STEPS + 1):
+             if result.done:
+                 break
+
+             # Ask LLM for a better query
+             sql_query = call_llm(client, obs)
+
+             # Submit to environment
+             result = await env.step(SQLDebugAction(query=sql_query))
+             obs = result.observation
+
+             reward = result.reward or 0.0
+             done = result.done
+             error = obs.error_message if obs.error_message else None
+
+             rewards.append(reward)
+             steps_taken = step
+
+             log_step(
+                 step=step,
+                 action=sql_query,
+                 reward=reward,
+                 done=done,
+                 error=error,
+             )
+
+             if done:
+                 break
+
+         # Score = best reward achieved (already 0.0–1.0 from grader)
+         score = max(rewards) if rewards else 0.0
+         score = min(max(score, 0.0), 1.0)
+         success = score >= SUCCESS_THRESHOLD
+
+     except Exception as exc:
+         print(f"[DEBUG] Episode error: {exc}", flush=True)
+
+     finally:
+         try:
+             await env.close()
+         except Exception as e:
+             print(f"[DEBUG] env.close() error: {e}", flush=True)
+
+     log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
models.py ADDED
@@ -0,0 +1,38 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ Data models for the SQL Debug & Optimizer Environment.
+ """
+
+ from typing import Any, Dict, List
+ from pydantic import Field
+ from openenv.core.env_server.types import Action, Observation
+
+
+ class SQLDebugAction(Action):
+     """
+     What the agent submits each step — just a SQL query string.
+     The environment will run it, grade it, and return a new observation.
+     """
+     query: str = Field(..., description="The SQL query the agent wants to try")
+
+
+ class SQLDebugObservation(Observation):
+     """
+     What the agent sees after each step.
+     Contains everything it needs to improve its next query.
+     """
+     task_id: str = Field(default="", description="Which task is active")
+     schema_sql: str = Field(default="", description="CREATE TABLE statements for this task")
+     current_query: str = Field(default="", description="Last query that was run")
+     error_message: str = Field(default="", description="SQLite error if query failed, else empty string")
+     query_result: List[Dict[str, Any]] = Field(default_factory=list, description="First 10 rows returned")
+     execution_plan: str = Field(default="", description="EXPLAIN QUERY PLAN output")
+     step_count: int = Field(default=0, description="How many steps taken so far")
+     target_description: str = Field(default="", description="Plain English goal for this task")
+     reward_so_far: float = Field(default=0.0, description="Best reward achieved this episode")
+     available_tasks: List[str] = Field(default_factory=list, description="All task IDs you can reset to")
openenv.yaml ADDED
@@ -0,0 +1,7 @@
+ spec_version: 1
+ name: sql_debug
+ type: space
+ runtime: fastapi
+ app: server.app:app
+ port: 8000
+
openenv_sql_debug.egg-info/PKG-INFO ADDED
@@ -0,0 +1,11 @@
+ Metadata-Version: 2.4
+ Name: openenv-sql_debug
+ Version: 0.1.0
+ Summary: Sql Debug environment for OpenEnv
+ Requires-Python: >=3.10
+ Requires-Dist: openenv-core[core]>=0.2.2
+ Requires-Dist: openai>=2.30.0
+ Requires-Dist: uvicorn>=0.43.0
+ Provides-Extra: dev
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
openenv_sql_debug.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,18 @@
+ README.md
+ pyproject.toml
+ ./__init__.py
+ ./client.py
+ ./grader.py
+ ./inference.py
+ ./models.py
+ ./runner.py
+ ./test.py
+ openenv_sql_debug.egg-info/PKG-INFO
+ openenv_sql_debug.egg-info/SOURCES.txt
+ openenv_sql_debug.egg-info/dependency_links.txt
+ openenv_sql_debug.egg-info/entry_points.txt
+ openenv_sql_debug.egg-info/requires.txt
+ openenv_sql_debug.egg-info/top_level.txt
+ server/__init__.py
+ server/app.py
+ server/sql_debug_environment.py
openenv_sql_debug.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
openenv_sql_debug.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ server = sql_debug.server.app:main
openenv_sql_debug.egg-info/requires.txt ADDED
@@ -0,0 +1,7 @@
+ openenv-core[core]>=0.2.2
+ openai>=2.30.0
+ uvicorn>=0.43.0
+
+ [dev]
+ pytest>=8.0.0
+ pytest-cov>=4.0.0
openenv_sql_debug.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ sql_debug
pyproject.toml ADDED
@@ -0,0 +1,39 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ [build-system]
+ requires = ["setuptools>=45", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "openenv-sql_debug"
+ version = "0.1.0"
+ description = "Sql Debug environment for OpenEnv"
+ requires-python = ">=3.10"
+ dependencies = [
+     # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
+     # install from github:
+     # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
+     "openenv-core[core]>=0.2.2",
+     "openai>=2.30.0",
+     "uvicorn>=0.43.0",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "pytest>=8.0.0",
+     "pytest-cov>=4.0.0",
+ ]
+
+ [project.scripts]
+ # Server entry point - enables running via: uv run --project . server
+ # or: python -m sql_debug.server.app
+ server = "sql_debug.server.app:main"
+
+ [tool.setuptools]
+ include-package-data = true
+ packages = ["sql_debug", "sql_debug.server"]
+ package-dir = { "sql_debug" = ".", "sql_debug.server" = "server" }
runner.py ADDED
@@ -0,0 +1,19 @@
+ import sqlite3
+
+ def run_query(schema_sql: str, query: str) -> dict:
+     """
+     Runs query against an in-memory SQLite DB seeded with schema_sql.
+     Returns: { "rows": [...], "error": str|None, "plan": str }
+     """
+     conn = sqlite3.connect(":memory:")
+     conn.row_factory = sqlite3.Row
+     try:
+         conn.executescript(schema_sql)
+         plan_rows = conn.execute(f"EXPLAIN QUERY PLAN {query}").fetchall()
+         plan = " | ".join(str(dict(r)) for r in plan_rows)
+         result_rows = [dict(r) for r in conn.execute(query).fetchall()]
+         return {"rows": result_rows, "error": None, "plan": plan}
+     except Exception as e:
+         return {"rows": [], "error": str(e), "plan": ""}
+     finally:
+         conn.close()
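For reference, `run_query` can be exercised like this. The function is re-declared so the snippet runs standalone, and the sample schema is illustrative rather than one of the real task schemas:

```python
import sqlite3


def run_query(schema_sql: str, query: str) -> dict:
    # Standalone copy of run_query above, same semantics
    conn = sqlite3.connect(":memory:")
    conn.row_factory = sqlite3.Row
    try:
        conn.executescript(schema_sql)
        plan_rows = conn.execute(f"EXPLAIN QUERY PLAN {query}").fetchall()
        plan = " | ".join(str(dict(r)) for r in plan_rows)
        rows = [dict(r) for r in conn.execute(query).fetchall()]
        return {"rows": rows, "error": None, "plan": plan}
    except Exception as e:
        return {"rows": [], "error": str(e), "plan": ""}
    finally:
        conn.close()


SCHEMA = """
CREATE TABLE orders (id INTEGER, customer TEXT, amount REAL);
INSERT INTO orders VALUES (1, 'ada', 900.0), (2, 'bob', 100.0);
"""

good = run_query(SCHEMA, "SELECT id FROM orders WHERE amount > 500")
bad = run_query(SCHEMA, "SELEC * FORM orders")
# good yields the matching row; bad reports a syntax error with empty rows
```

Note that the plan is captured before the query executes, so even a correct query that returns nothing still yields a usable `EXPLAIN QUERY PLAN` string.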
server/__init__.py ADDED
@@ -0,0 +1,11 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """SQL Debug environment server components."""
+
+ from .sql_debug_environment import SQLDebugEnvironment
+
+ __all__ = ["SQLDebugEnvironment"]
server/app.py ADDED
@@ -0,0 +1,58 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ FastAPI server for the SQL Debug & Optimizer Environment.
+
+ Exposes the environment over HTTP + WebSocket so inference.py
+ (and the OpenEnv evaluator) can interact with it remotely.
+
+ Endpoints created automatically by openenv:
+     POST /reset  — start new episode (optionally pass task_id in body)
+     POST /step   — submit an action, get observation + reward
+     GET  /state  — current episode state
+     GET  /schema — action/observation JSON schemas
+     WS   /ws     — WebSocket for persistent low-latency sessions
+
+ Run locally:
+     uvicorn server.app:app --host 0.0.0.0 --port 8000 --reload
+
+ Or via Docker (defined in Dockerfile):
+     docker build -t sql-debug-env .
+     docker run -p 8000:8000 sql-debug-env
+ """
+
+ try:
+     from openenv.core.env_server.http_server import create_app
+ except Exception as e:
+     raise ImportError(
+         "openenv-core is required. Install with: pip install openenv-core"
+     ) from e
+
+ try:
+     from models import SQLDebugAction, SQLDebugObservation
+     from .sql_debug_environment import SQLDebugEnvironment
+ except ModuleNotFoundError:
+     from models import SQLDebugAction, SQLDebugObservation
+     from sql_debug.server.sql_debug_environment import SQLDebugEnvironment
+
+
+ app = create_app(
+     SQLDebugEnvironment,
+     SQLDebugAction,
+     SQLDebugObservation,
+     env_name="sql_debug_optimizer",
+     max_concurrent_envs=4,  # one per task running in parallel
+ )
+
+
+ def main(host: str = "0.0.0.0", port: int = 8000):
+     import uvicorn
+     uvicorn.run(app, host=host, port=port)
+
+
+ if __name__ == "__main__":
+     main()
server/requirements.txt ADDED
@@ -0,0 +1,3 @@
+ openenv[core]>=0.2.0
+ fastapi>=0.115.0
+ uvicorn>=0.24.0
server/sql_debug_environment.py ADDED
@@ -0,0 +1,160 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ SQL Debug & Optimizer Environment — server-side implementation.
+
+ The server runs this. The agent never touches this file directly.
+ It loads tasks, runs queries in SQLite, grades them, and returns observations.
+ """
+
+ from uuid import uuid4
+
+ from openenv.core.env_server.interfaces import Environment
+ from openenv.core.env_server.types import State
+
+ try:
+     from ..models import SQLDebugAction, SQLDebugObservation
+ except ImportError:
+     from models import SQLDebugAction, SQLDebugObservation
+
+ from runner import run_query
+ from grader import compute_reward
+
+
+ def _load_all_tasks() -> dict:
+     """Load every task from the tasks/ folder into a dict keyed by task_id."""
+     from tasks.task_easy import TASK as EASY
+     from tasks.task_medium import TASK as MEDIUM
+     from tasks.task_hard import TASK as HARD
+     return {
+         EASY["task_id"]: EASY,
+         MEDIUM["task_id"]: MEDIUM,
+         HARD["task_id"]: HARD,
+     }
+
+
+ class SQLDebugEnvironment(Environment):
+     """
+     SQL Debug & Optimizer environment.
+
+     The agent receives a broken or slow SQL query and must fix/optimize it.
+     Each step the agent submits a new query — the environment runs it in
+     SQLite, grades it (0.0–1.0), and returns the result as an observation.
+
+     Three tasks:
+         syntax_fix_001 (easy)   — fix typos in SQL keywords
+         logic_fix_001  (medium) — fix a wrong JOIN type causing bad results
+         optimize_001   (hard)   — rewrite a correlated subquery as a CTE
+     """
+
+     SUPPORTS_CONCURRENT_SESSIONS: bool = True
+
+     def __init__(self):
+         self._all_tasks = _load_all_tasks()
+         self._current_task = None
+         self._state = State(episode_id=str(uuid4()), step_count=0)
+         self._best_reward = 0.0
+         self._current_query = ""
+
+     # ── reset ────────────────────────────────────────────────────────────
+
+     def reset(self, task_id: str | None = None) -> SQLDebugObservation:
+         """
+         Start a new episode.
+         Pass task_id to pick a specific task, or leave None for the default (easy).
+         """
+         if task_id is None:
+             task_id = next(iter(self._all_tasks))  # default: easy
+
+         if task_id not in self._all_tasks:
+             # Unknown task — return an error observation instead of crashing
+             return SQLDebugObservation(
+                 task_id=task_id,
+                 error_message=f"Unknown task_id '{task_id}'. Available: {list(self._all_tasks.keys())}",
+                 available_tasks=list(self._all_tasks.keys()),
+             )
+
+         self._current_task = self._all_tasks[task_id]
+         self._state = State(episode_id=str(uuid4()), step_count=0)
+         self._best_reward = 0.0
+         self._current_query = self._current_task["broken_query"]
+
+         # Run the broken query so the agent sees the starting error
+         run_result = run_query(
+             self._current_task["schema_sql"],
+             self._current_query,
+         )
+
+         return SQLDebugObservation(
+             task_id=task_id,
+             schema_sql=self._current_task["schema_sql"],
+             current_query=self._current_query,
+             error_message=run_result["error"] or "",
+             query_result=run_result["rows"][:10],
+             execution_plan=run_result["plan"],
+             step_count=0,
+             target_description=self._current_task["target_description"],
+             reward_so_far=0.0,
+             available_tasks=list(self._all_tasks.keys()),
+             done=False,
+             reward=0.0,
+         )
+
+     # ── step ──────────────────────────────────────────────────────────────
+
+     def step(self, action: SQLDebugAction) -> SQLDebugObservation:
+         """
+         Agent submits a query.
+         We run it, grade it, and return the new observation + reward.
+         """
+         if self._current_task is None:
+             return SQLDebugObservation(
+                 error_message="Call reset() before step()",
+                 available_tasks=list(self._all_tasks.keys()),
+                 done=True,
+                 reward=0.0,
+             )
+
+         self._state.step_count += 1
+         self._current_query = action.query
+
+         # Run the query in SQLite
+         run_result = run_query(
+             self._current_task["schema_sql"],
+             action.query,
+         )
+
+         # Grade it (returns dict with value, syntax_ok, result_match_pct, etc.)
+         reward_dict = compute_reward(self._current_task, action.query, run_result)
+         reward_value = reward_dict["value"]
+
+         # Track the best reward this episode
+         self._best_reward = max(self._best_reward, reward_value)
+
+         # Episode ends on a perfect score or at max steps
+         max_steps = self._current_task.get("max_steps", 8)
+         done = (reward_value >= 0.99) or (self._state.step_count >= max_steps)
+
+         return SQLDebugObservation(
+             task_id=self._current_task["task_id"],
+             schema_sql=self._current_task["schema_sql"],
+             current_query=action.query,
+             error_message=run_result["error"] or "",
+             query_result=run_result["rows"][:10],
+             execution_plan=run_result["plan"],
+             step_count=self._state.step_count,
+             target_description=self._current_task["target_description"],
+             reward_so_far=self._best_reward,
+             available_tasks=list(self._all_tasks.keys()),
+             done=done,
+             reward=reward_value,
+         )
+
+     # ── state ─────────────────────────────────────────────────────────────
+
+     @property
+     def state(self) -> State:
+         return self._state
tasks/__init__.py ADDED
File without changes
tasks/task_easy.py ADDED
@@ -0,0 +1,31 @@
+ TASK = {
+     "task_id": "syntax_fix_001",
+     "difficulty": "easy",
+     "max_steps": 5,
+
+     # This creates the database the agent works with
+     "schema_sql": """
+         CREATE TABLE orders (
+             id INTEGER, customer TEXT, amount REAL, order_date TEXT
+         );
+         INSERT INTO orders VALUES (1, 'Alice', 520.0, '2024-01-15');
+         INSERT INTO orders VALUES (2, 'Bob', 90.0, '2024-01-16');
+         INSERT INTO orders VALUES (3, 'Carol', 800.0, '2024-01-17');
+         INSERT INTO orders VALUES (4, 'Dan', 150.0, '2024-01-18');
+     """,
+
+     # This is the broken query the agent must fix
+     "broken_query": "SELEC * FORM orders WERE amount > 500",
+
+     # Plain English: what should the fixed query do?
+     "target_description": "Return all orders where amount is greater than 500",
+
+     # What the correct answer looks like — used by the grader to check
+     "expected_rows": [
+         {"id": 1, "customer": "Alice", "amount": 520.0, "order_date": "2024-01-15"},
+         {"id": 3, "customer": "Carol", "amount": 800.0, "order_date": "2024-01-17"},
+     ],
+
+     # For the easy task, plan quality doesn't matter
+     "check_plan": False,
+ }
tasks/task_hard.py ADDED
@@ -0,0 +1,51 @@
+ # tasks/task_hard.py
+ import random
+
+ def generate_schema(n_rows=5000, seed=42):
+     """Generates schema + INSERT statements for n_rows transactions."""
+     rng = random.Random(seed)
+     statuses = ['completed', 'pending', 'failed']
+     inserts = []
+     for i in range(1, n_rows + 1):
+         user_id = rng.randint(1, 100)
+         amount = round(rng.uniform(10, 1000), 2)
+         status = rng.choice(statuses)
+         # Five values to match the five columns (ts is a fixed placeholder)
+         inserts.append(
+             f"INSERT INTO transactions VALUES ({i}, {user_id}, {amount}, '2024-01-01', '{status}');"
+         )
+     return (
+         "CREATE TABLE transactions (id INTEGER, user_id INTEGER, amount REAL, ts TEXT, status TEXT);\n"
+         + "\n".join(inserts[:200])  # Keep it fast for demo (200 rows)
+     )
+
+ TASK = {
+     "task_id": "optimize_001",
+     "difficulty": "hard",
+     "max_steps": 10,
+
+     "schema_sql": generate_schema(200),  # Use 200 rows for speed in hackathon
+
+     # Slow: correlated subquery — runs the inner SELECT once per outer row
+     "broken_query": """
+         SELECT *
+         FROM transactions t1
+         WHERE amount > (
+             SELECT AVG(amount)
+             FROM transactions t2
+             WHERE t2.user_id = t1.user_id
+         )
+         AND t1.status = 'completed'
+     """,
+
+     "target_description": (
+         "Return all completed transactions where the amount exceeds that user's average. "
+         "Optimize it — avoid correlated subqueries. Use a CTE or subquery with GROUP BY."
+     ),
+
+     # For the hard task we grade differently — no fixed expected_rows
+     "expected_rows": None,
+
+     # We check that the query plan is efficient (no per-row correlated scans)
+     "check_plan": True,
+
+     # Keywords we look for in the agent's solution
+     "good_patterns": ["WITH", "GROUP BY", "AVG("],
+ }
tasks/task_medium.py ADDED
@@ -0,0 +1,38 @@
+ TASK = {
+     "task_id": "logic_fix_001",
+     "difficulty": "medium",
+     "max_steps": 8,
+
+     "schema_sql": """
+         CREATE TABLE employees (id INTEGER, name TEXT, dept_id INTEGER, salary REAL);
+         CREATE TABLE departments (id INTEGER, dept_name TEXT, budget REAL);
+
+         INSERT INTO departments VALUES (1, 'Engineering', 500000);
+         INSERT INTO departments VALUES (2, 'Sales', 300000);
+
+         INSERT INTO employees VALUES (1, 'Alice', 1, 95000);
+         INSERT INTO employees VALUES (2, 'Bob', 2, 60000);
+         INSERT INTO employees VALUES (3, 'Carol', 1, 85000);
+         INSERT INTO employees VALUES (4, 'Dan', 99, 55000);  -- dept 99 doesn't exist!
+     """,
+
+     # Bug: LEFT JOIN is the wrong join type here. Dan (no dept) only drops
+     # out because the WHERE on d.budget discards NULLs; INNER JOIN states
+     # the intent correctly and is what the task expects.
+     "broken_query": """
+         SELECT e.name, d.dept_name
+         FROM employees e
+         LEFT JOIN departments d ON e.dept_id = d.id
+         WHERE d.budget > 400000
+     """,
+
+     "target_description": (
+         "Return names of employees in departments with budget > 400000. "
+         "Do NOT include employees whose department doesn't exist."
+     ),
+
+     "expected_rows": [
+         {"name": "Alice", "dept_name": "Engineering"},
+         {"name": "Carol", "dept_name": "Engineering"},
+     ],
+
+     "check_plan": False,
+ }
test.py ADDED
@@ -0,0 +1,29 @@
+ # Smoke test for the SQL Debug environment over WebSocket.
+ from client import SQLDebugEnv
+ from models import SQLDebugAction
+
+ def test():
+     # Use WebSocket URL
+     env = SQLDebugEnv(base_url="ws://localhost:8000")
+
+     try:
+         for task_id in ["syntax_fix_001", "logic_fix_001", "optimize_001"]:
+             print(f"\n{'='*60}")
+             print(f"Testing: {task_id}")
+
+             # Connect and reset
+             result = env.reset(task_id=task_id)
+             obs = result.observation
+
+             print(f"✓ task_id: {obs.task_id}")
+             print(f"✓ description: {obs.target_description[:50]}...")
+             print(f"✓ query: {obs.current_query[:60]}...")
+
+             # Try one step
+             result = env.step(SQLDebugAction(query="SELECT 1"))
+             print(f"✓ step reward: {result.reward}")
+
+     finally:
+         env.close()
+
+ if __name__ == "__main__":
+     test()
uv.lock ADDED
The diff for this file is too large to render. See raw diff