Spaces:

jaivardhan2409
/

sql-query-optimizer

Sleeping

App Files Files Community

jaivardhan2409 committed on Apr 6

Commit

e4c32ce

verified ·

1 Parent(s): 5abdb9a

Upload folder using huggingface_hub

Browse files

Files changed (17) hide show

Dockerfile +8 -0
README.md +48 -6
__init__.py +1 -0
baseline.py +71 -0
client.py +56 -0
env/__init__.py +0 -0
env/environment.py +106 -0
env/models.py +20 -0
env/reward.py +27 -0
env/tasks.py +137 -0
models.py +20 -0
openenv.yaml +14 -0
pyproject.toml +22 -0
requirements.txt +6 -0
server/__init__.py +1 -0
server/app.py +72 -0
uv.lock +0 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,8 @@

+FROM python:3.11-slim
+WORKDIR /app
+# Dependencies are installed before the source is copied in.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+EXPOSE 7860
+ENV ENABLE_WEB_INTERFACE=true
+# The FastAPI instance is defined in server/app.py and server/__init__.py does
+# not re-export it, so the import string must name the submodule explicitly.
+CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,10 +1,52 @@
 ---
-title: Sql Query Optimizer
-emoji: 👀
-colorFrom: red
-colorTo: gray
 sdk: docker
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: SQL Query Optimizer
+emoji: 🦀
+colorFrom: blue
+colorTo: green
 sdk: docker
+app_port: 7860
+base_path: /web
 ---
+# SQL Query Optimizer OpenEnv
+## Environment Description & Motivation
+This domain models a genuine, high-value task performed by data engineers and DBAs every day: reviewing and optimizing slow SQL queries. Instead of a toy environment, this is a real-world task where the agent must rewrite SQL queries to be syntactically correct and performant.
+## Action Space
+- `rewritten_query` (str): The optimized SQL query.
+- `explanation` (str): A brief explanation of the changes made and why they improve the query.
+- `is_done` (bool): Set to true when you are finished, to submit the query for final scoring.
+## Observation Space
+- `task_id` (int): The ID of the task to perform.
+- `query` (str): The SQL query to review and optimize.
+- `schema_context` (str): Database schema context (CREATE statements).
+- `hint` (str): Optional natural-language hints.
+- `step_number` (int): Current step in the episode.
+- `max_steps` (int): Maximum allowed steps.
+## Tasks
+1. **fix-broken-join (Easy)**: Identify and repair a query with an issue such as a missing ON clause.
+2. **eliminate-n-plus-one (Medium)**: Remove correlated subqueries and replace them with properly structured JOINs.
+3. **full-optimization (Hard)**: Remove redundant DISTINCT clauses, avoid SELECT *, use index hints, and fix implicit type casts in a more complex query.
+## Setup & Testing
+```bash
+# Verify using openenv
+openenv validate
+# Local testing
+uvicorn server.app:app --host 0.0.0.0 --port 7860
+# Docker build
+docker build -t sql-optimizer-env .
+docker run -p 7860:7860 sql-optimizer-env
+```
+## Baseline Evaluation
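+The provided `baseline.py` script runs an OpenAI chat model (`gpt-3.5-turbo`) once against each of the three tasks and prints the grader score for every task.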
+Usage:
+```bash
+export OPENAI_API_KEY=sk-...
+python baseline.py
+```

__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Root package

baseline.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import os
+from openai import OpenAI
+from env.environment import SQLEnv
+from env.models import Action
+def run_task(env: SQLEnv, task_id: int) -> float:
+    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+    obs = env.reset(task_id)
+    messages = [
+        {"role": "system", "content": "You are an expert SQL DBA. You rewrite SQL queries to be correct, optimized, and performant."}
+    ]
+    prompt = f"""
+Task # {obs.task_id}
+Original Query: {obs.query}
+Database Schema Context: {obs.schema_context}
+Hint: {obs.hint}
+Please provide the optimized query. Output ONLY the raw SQL query, no markdown formatting, no explanation.
+"""
+    messages.append({"role": "user", "content": prompt.strip()})
+    try:
+        response = client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=messages,
+            temperature=0.0
+        )
+        rewritten_query = response.choices[0].message.content.strip()
+        if rewritten_query.startswith("```sql"):
+            rewritten_query = rewritten_query[6:]
+        if rewritten_query.endswith("```"):
+            rewritten_query = rewritten_query[:-3]
+        rewritten_query = rewritten_query.strip()
+    except Exception as e:
+        print(f"Error calling OpenAI API: {e}")
+        rewritten_query = obs.query
+    action = Action(
+        rewritten_query=rewritten_query,
+        explanation="Baseline inference using LLM",
+        is_done=True
+    )
+    _, reward, done, info = env.step(action)
+    return env.final_grader_score
+def run_all_tasks():
+    if not os.environ.get("OPENAI_API_KEY"):
+        raise ValueError("OPENAI_API_KEY environment variable is required.")
+    env = SQLEnv()
+    scores = {}
+    for task_id in [1, 2, 3]:
+        print(f"Running baseline for Task {task_id}...")
+        score = run_task(env, task_id)
+        scores[task_id] = score
+        print(f"Task {task_id} Grader Score: {score}")
+    return scores
+if __name__ == "__main__":
+    try:
+        scores = run_all_tasks()
+        print("\nBaseline Evaluation Results:")
+        for t, s in scores.items():
+            print(f"Task {t}: {s}/1.0")
+    except Exception as e:
+        print(f"Baseline Evaluation Failed: {e}")

client.py ADDED Viewed

	@@ -0,0 +1,56 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""SQL Query Optimizer Client."""
+from typing import Dict
+from openenv.core import EnvClient
+from openenv.core.client_types import StepResult
+from openenv.core.env_server.types import State
+from models import Action, Observation
+class SQLEnvClient(
+    EnvClient[Action, Observation, State]
+):
+    """
+    Client for the SQL Query Optimizer Environment.
+    This client maintains a persistent WebSocket connection to the environment server.
+    """
+    def _step_payload(self, action: Action) -> Dict:
+        """
+        Convert Action to JSON payload for step message.
+        """
+        return action.model_dump()
+    def _parse_result(self, payload: Dict) -> StepResult[Observation]:
+        """
+        Parse server response into StepResult[Observation].
+        """
+        obs_data = payload.get("observation", {})
+        observation = Observation(**obs_data)
+        # Get reward payload properly (whether it's a dict or primitive)
+        reward_data = payload.get("reward")
+        return StepResult(
+            observation=observation,
+            reward=reward_data,
+            done=payload.get("done", False),
+        )
+    def _parse_state(self, payload: Dict) -> State:
+        """
+        Parse server response into State object.
+        """
+        return State(
+            episode_id=payload.get("episode_id"),
+            step_count=payload.get("step_count", 0),
+        )

env/__init__.py ADDED Viewed

File without changes

env/environment.py ADDED Viewed

	@@ -0,0 +1,106 @@

+from typing import Tuple, Dict, Any, List
+from .models import Observation, Action, Reward
+from .tasks import TASKS, grade_action, get_task
+from .reward import compute_reward
+class SQLEnv:
+    def __init__(self):
+        self.current_task_id = None
+        self.task = None
+        self.step_number = 0
+        self.max_steps = 0
+        self.history = []
+        self.cumulative_score = 0.0
+        self.previous_grader_score = 0.0
+        self.final_grader_score = 0.0
+    def reset(self, task_id: int) -> Observation:
+        task = get_task(task_id)
+        if not task:
+            raise ValueError(f"Task {task_id} not found.")
+        self.current_task_id = task_id
+        self.task = task
+        self.step_number = 1
+        self.max_steps = task["max_steps"]
+        self.history = []
+        self.cumulative_score = 0.0
+        self.previous_grader_score = 0.0
+        self.final_grader_score = 0.0
+        obs = Observation(
+            task_id=self.current_task_id,
+            query=self.task["initial_query"],
+            schema_context=self.task["schema_context"],
+            hint=self.task["hint"],
+            step_number=self.step_number,
+            max_steps=self.max_steps
+        )
+        self.history.append({"step": 0, "type": "reset", "observation": obs.model_dump()})
+        return obs
+    def step(self, action: Action) -> Tuple[Observation, Reward, bool, Dict[str, Any]]:
+        if not self.task:
+            raise RuntimeError("Environment not initialized. Call reset() first.")
+        grader_score, breakdown, feedback = grade_action(self.current_task_id, action.rewritten_query)
+        action_valid = len(action.rewritten_query.strip()) > 0
+        done = action.is_done or self.step_number >= self.max_steps
+        step_reward = compute_reward(
+            grader_score=grader_score,
+            previous_score=self.previous_grader_score,
+            step_number=self.step_number,
+            max_steps=self.max_steps,
+            is_done=done,
+            action_valid=action_valid
+        )
+        self.cumulative_score += step_reward
+        self.previous_grader_score = grader_score
+        reward = Reward(
+            score=step_reward,
+            breakdown=breakdown,
+            feedback=feedback
+        )
+        obs = Observation(
+            task_id=self.current_task_id,
+            query=action.rewritten_query,
+            schema_context=self.task["schema_context"],
+            hint=self.task["hint"],
+            step_number=self.step_number + 1,
+            max_steps=self.max_steps
+        )
+        info = {
+            "cumulative_score": self.cumulative_score,
+            "grader_score": grader_score
+        }
+        if done:
+            self.final_grader_score = grader_score
+        self.history.append({
+            "step": self.step_number,
+            "type": "step",
+            "action": action.model_dump(),
+            "reward": reward.model_dump(),
+            "done": done,
+            "info": info
+        })
+        self.step_number += 1
+        return obs, reward, done, info
+    def state(self) -> Dict[str, Any]:
+        return {
+            "current_task_id": self.current_task_id,
+            "step_number": self.step_number,
+            "max_steps": self.max_steps,
+            "cumulative_score": self.cumulative_score,
+            "final_grader_score": self.final_grader_score,
+            "history": self.history
+        }

env/models.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from typing import Optional, Dict
+from pydantic import BaseModel, Field
+# What the agent sees each turn. SQLEnv.reset() fills `query` with the task's
+# initial query; SQLEnv.step() echoes the agent's last rewritten query back.
+class Observation(BaseModel):
+    task_id: int = Field(description="The ID of the task to perform.")
+    query: str = Field(description="The SQL query to review and optimize.")
+    schema_context: str = Field(description="The database schema context for the query, such as CREATE TABLE statements.")
+    hint: Optional[str] = Field(default=None, description="An optional natural-language hint or description of the problem.")
+    step_number: int = Field(description="The current step number in the episode (1-indexed).")
+    max_steps: int = Field(description="The maximum allowed steps for this task.")
+# One agent move. Only `rewritten_query` is passed to the graders; `explanation`
+# is stored in the episode history but does not affect the score.
+class Action(BaseModel):
+    rewritten_query: str = Field(description="The rewritten, optimized SQL query.")
+    explanation: str = Field(description="A brief explanation of the changes made and why they improve the query.")
+    is_done: bool = Field(description="Set to true if you are finished and want to submit the query for final scoring.")
+# Result of grading one step.
+# NOTE(review): SQLEnv.step() stores the per-step shaped reward in `score`, and
+# compute_reward() returns -0.1 for an empty query, so values outside the
+# 0.0-1.0 range given in the field description do occur.
+class Reward(BaseModel):
+    score: float = Field(description="The overall score for the episode (0.0 to 1.0).")
+    breakdown: Dict[str, float] = Field(default_factory=dict, description="A breakdown of the score by sub-criteria.")
+    feedback: str = Field(description="Specific feedback on the rewritten query or action taken.")

env/reward.py ADDED Viewed

	@@ -0,0 +1,27 @@

+from typing import Dict
+def compute_reward(grader_score: float, previous_score: float, step_number: int, max_steps: int, is_done: bool, action_valid: bool) -> float:
+    """
+    Computes a shaped reward based on the problem statement requirements:
+    - Partial credit per step: +0.0-0.5 for incremental improvement
+    - Completion bonus: +0.5 if grader score >= 0.8 when is_done=True
+    - Step penalty: -0.02 per unnecessary step (> task's min required steps)
+    - Invalid action penalty: -0.1 for empty/unparseable queries
+    - Total clamped to [0.0, 1.0]
+    """
+    reward = 0.0
+    if not action_valid:
+        return -0.1
+    improvement = max(0.0, grader_score - previous_score)
+    # Give partial credit up to 0.5 based on improvement
+    reward += improvement * 0.5
+    if is_done and grader_score >= 0.8:
+        reward += 0.5
+    if step_number > max_steps:
+        reward -= 0.02 * (step_number - max_steps)
+    return max(0.0, min(1.0, reward))

env/tasks.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import sqlglot
+from sqlglot import exp
+from typing import Dict, Any, Tuple
+# Task catalogue keyed by integer task ID. get_task() looks entries up here and
+# SQLEnv.reset() reads "initial_query", "schema_context", "hint" and "max_steps".
+TASKS = {
+    # Graded by grade_task_1.
+    1: {
+        "name": "fix-broken-join",
+        "difficulty": "easy",
+        "schema_context": "CREATE TABLE users (id INT, name VARCHAR); CREATE TABLE orders (id INT, user_id INT, amount DECIMAL);",
+        "hint": "The query is trying to join users and orders, but it is missing the ON clause, creating a cross join.",
+        "initial_query": "SELECT users.name, orders.amount FROM users JOIN orders;",
+        "max_steps": 3,
+    },
+    # Graded by grade_task_2.
+    # NOTE(review): the hint calls the subquery correlated, but the inner SELECT
+    # references only d, never e, so it is an uncorrelated IN subquery.
+    2: {
+        "name": "eliminate-n-plus-one",
+        "difficulty": "medium",
+        "schema_context": "CREATE TABLE employees (id INT, dept_id INT, name VARCHAR); CREATE TABLE departments (id INT, name VARCHAR);",
+        "hint": "The query uses a correlated subquery in the WHERE clause. Rewrite it using a JOIN to improve performance.",
+        "initial_query": "SELECT e.name FROM employees e WHERE e.dept_id IN (SELECT d.id FROM departments d WHERE d.name = 'Engineering');",
+        "max_steps": 4,
+    },
+    # Graded by grade_task_3: four independent checks worth 0.25 each.
+    3: {
+        "name": "full-optimization",
+        "difficulty": "hard",
+        "schema_context": "CREATE TABLE sales (id INT, product_id INT, sale_date DATE, amount DECIMAL); CREATE INDEX idx_sales_date ON sales(sale_date);",
+        "hint": "Optimize the query: remove redundant DISTINCT, avoid SELECT *, use index hint if applicable, and fix implicit type casts.",
+        "initial_query": "SELECT DISTINCT * FROM sales s WHERE CAST(s.sale_date AS VARCHAR) = '2023-01-01';",
+        "max_steps": 5,
+    }
+}
+def grade_task_1(rewritten_query: str) -> Tuple[float, Dict[str, float], str]:
+    try:
+        parsed = sqlglot.parse_one(rewritten_query, read="postgres")
+    except Exception as e:
+        return 0.0, {"parse_error": 1.0}, f"Query could not be parsed: {e}"
+    score = 0.0
+    feedback = []
+    breakdown = {}
+    joins = list(parsed.find_all(exp.Join))
+    if not joins:
+        return 0.0, {"missing_join": 1.0}, "No JOIN found in the query."
+    join = joins[0]
+    if join.args.get("on"):
+        score += 1.0
+        breakdown["has_on_clause"] = 1.0
+        feedback.append("Successfully added the ON clause.")
+    else:
+        breakdown["has_on_clause"] = 0.0
+        feedback.append("The JOIN is still missing an ON clause.")
+    return score, breakdown, " ".join(feedback)
+def grade_task_2(rewritten_query: str) -> Tuple[float, Dict[str, float], str]:
+    try:
+        parsed = sqlglot.parse_one(rewritten_query, read="postgres")
+    except Exception as e:
+        return 0.0, {"parse_error": 1.0}, f"Query could not be parsed: {e}"
+    score = 0.0
+    breakdown = {}
+    feedback = []
+    subqueries = list(parsed.find_all(exp.Subquery))
+    if not subqueries and not list(parsed.find_all(exp.In)):
+        score += 0.5
+        breakdown["removed_correlated_subquery"] = 0.5
+        feedback.append("Removed correlated subquery.")
+    else:
+        breakdown["removed_correlated_subquery"] = 0.0
+        feedback.append("Correlated subquery still present.")
+    joins = list(parsed.find_all(exp.Join))
+    if joins:
+        score += 0.5
+        breakdown["added_join"] = 0.5
+        feedback.append("Added JOIN successfully.")
+    else:
+        breakdown["added_join"] = 0.0
+        feedback.append("Missing JOIN.")
+    return score, breakdown, " ".join(feedback)
+def grade_task_3(rewritten_query: str) -> Tuple[float, Dict[str, float], str]:
+    try:
+        parsed = sqlglot.parse_one(rewritten_query, read="postgres")
+    except Exception as e:
+        return 0.0, {"parse_error": 1.0}, f"Query could not be parsed: {e}"
+    score = 0.0
+    breakdown = {"no_distinct": 0.0, "no_select_star": 0.0, "fixed_cast": 0.0, "has_index_hint": 0.0}
+    feedback = []
+    if not parsed.args.get("distinct"):
+        score += 0.25
+        breakdown["no_distinct"] = 0.25
+        feedback.append("Removed redundant DISTINCT.")
+    stars = list(parsed.find_all(exp.Star))
+    if not stars:
+        score += 0.25
+        breakdown["no_select_star"] = 0.25
+        feedback.append("Replaced SELECT * with explicit columns.")
+    casts = list(parsed.find_all(exp.Cast))
+    cast_on_date = False
+    for c in casts:
+        this = c.args.get("this")
+        if isinstance(this, exp.Column) and this.name.lower() == "sale_date":
+            cast_on_date = True
+    if not cast_on_date:
+        score += 0.25
+        breakdown["fixed_cast"] = 0.25
+        feedback.append("Fixed implicit type cast on sale_date.")
+    if "INDEX" in rewritten_query.upper():
+        score += 0.25
+        breakdown["has_index_hint"] = 0.25
+        feedback.append("Added index hint.")
+    return score, breakdown, " ".join(feedback)
+def grade_action(task_id: int, rewritten_query: str) -> Tuple[float, Dict[str, float], str]:
+    if task_id == 1:
+        return grade_task_1(rewritten_query)
+    elif task_id == 2:
+        return grade_task_2(rewritten_query)
+    elif task_id == 3:
+        return grade_task_3(rewritten_query)
+    return 0.0, {}, "Unknown task."
+def get_task(task_id: int) -> Dict[str, Any]:
+    return TASKS.get(task_id)

models.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from typing import Optional, Dict
+from pydantic import BaseModel, Field
+# Observation schema used by client.py to parse the "observation" part of a
+# server response. Same fields as the Observation model in env/models.py.
+class Observation(BaseModel):
+    task_id: int = Field(description="The ID of the task to perform.")
+    query: str = Field(description="The SQL query to review and optimize.")
+    schema_context: str = Field(description="The database schema context for the query, such as CREATE TABLE statements.")
+    hint: Optional[str] = Field(default=None, description="An optional natural-language hint or description of the problem.")
+    step_number: int = Field(description="The current step number in the episode (1-indexed).")
+    max_steps: int = Field(description="The maximum allowed steps for this task.")
+# Action schema that client.py serializes with model_dump() as the step
+# payload. Same fields as the Action model in env/models.py.
+class Action(BaseModel):
+    rewritten_query: str = Field(description="The rewritten, optimized SQL query.")
+    explanation: str = Field(description="A brief explanation of the changes made and why they improve the query.")
+    is_done: bool = Field(description="Set to true if you are finished and want to submit the query for final scoring.")
+# Reward schema; same fields as the Reward model in env/models.py.
+# NOTE(review): client.py imports only Action and Observation from this
+# module, so this class appears unused on the client side - confirm.
+class Reward(BaseModel):
+    score: float = Field(description="The overall score for the episode (0.0 to 1.0).")
+    breakdown: Dict[str, float] = Field(default_factory=dict, description="A breakdown of the score by sub-criteria.")
+    feedback: str = Field(description="Specific feedback on the rewritten query or action taken.")

openenv.yaml ADDED Viewed

	@@ -0,0 +1,14 @@

+name: sql-query-optimizer
+version: "1.0.0"
+description: "AI agent reviews and rewrites SQL queries for correctness and performance."
+tags: [openenv, sql, code-review, data-engineering]
+tasks:
+  - id: 1
+    name: fix-broken-join
+    difficulty: easy
+  - id: 2
+    name: eliminate-n-plus-one
+    difficulty: medium
+  - id: 3
+    name: full-optimization
+    difficulty: hard

pyproject.toml ADDED Viewed

	@@ -0,0 +1,22 @@

+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "sql-query-optimizer"
+version = "1.0.0"
+description = "AI agent reviews and rewrites SQL queries for correctness and performance."
+dependencies = [
+    "fastapi>=0.111.0",
+    "uvicorn>=0.30.1",
+    "pydantic>=2.7.4",
+    "openai>=1.35.3",
+    "sqlglot>=25.5.0",
+    "openenv-core>=0.2.0"
+]
+[project.scripts]
+server = "server.app:main"
+[tool.setuptools]
+packages = ["server", "env"]

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+fastapi>=0.111.0
+uvicorn>=0.30.1
+pydantic>=2.7.4
+openai>=1.35.3
+sqlglot>=25.5.0
+openenv-core>=0.1.0

server/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Server package

server/app.py ADDED Viewed

	@@ -0,0 +1,72 @@

+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import Dict, Any, List
+import asyncio
+from env.environment import SQLEnv
+from env.models import Observation, Action, Reward
+from env.tasks import TASKS
+# ASGI application object served by uvicorn.
+app = FastAPI(title="SQL Query Optimizer OpenEnv")
+# Single module-level environment: every request handled by this process
+# reads and mutates the same episode state.
+env = SQLEnv()
+# Request body of POST /reset: the ID of the task to start.
+class ResetRequest(BaseModel):
+    task_id: int
+@app.post("/reset", response_model=Observation)
+async def reset(req: ResetRequest):
+    try:
+        return env.reset(req.task_id)
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+@app.post("/step")
+async def step(action: Action):
+    try:
+        obs, reward, done, info = env.step(action)
+        return {
+            "observation": obs.model_dump(),
+            "reward": reward.model_dump(),
+            "done": done,
+            "info": info
+        }
+    except RuntimeError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+@app.get("/state")
+async def state():
+    return env.state()
+@app.get("/tasks")
+async def get_tasks():
+    action_schema = Action.model_json_schema()
+    task_list = [{"id": k, **v} for k, v in TASKS.items()]
+    return {
+        "tasks": task_list,
+        "action_schema": action_schema
+    }
+@app.get("/grader")
+async def grader():
+    if not env.task:
+        raise HTTPException(status_code=400, detail="Environment not initialized.")
+    return {"grader_score": env.final_grader_score}
+# Response body of POST /baseline: grader score per task ID.
+class BaselineResponse(BaseModel):
+    scores: Dict[int, float]
+@app.post("/baseline", response_model=BaselineResponse)
+async def run_baseline():
+    import baseline
+    try:
+        scores = baseline.run_all_tasks()
+        return BaselineResponse(scores=scores)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+def main(host: str = "0.0.0.0", port: int = 8000):
+    import uvicorn
+    uvicorn.run(app, host=host, port=port)
+if __name__ == '__main__':
+    main()

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff