Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- graders/__init__.py +3 -0
- graders/sql_grader.py +47 -0
- openenv.yaml +21 -14
graders/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from graders.sql_grader import SQLGrader
|
| 2 |
+
|
| 3 |
+
__all__ = ["SQLGrader"]
|
graders/sql_grader.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
graders/sql_grader.py — SQLGrader class for OpenEnv Phase 2 validation.
|
| 3 |
+
Called by the OpenEnv validator to score each task submission.
|
| 4 |
+
Score must be strictly between 0 and 1 (never 0.0 or 1.0).
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class SQLGrader:
    """Keyword-signal grader for all SQL Debug tasks.

    Each task id maps to a list of signal substrings in ``TASK_SIGNALS``.
    A submission's score is the fraction of its task's signals found in the
    submitted SQL (case-insensitive, whitespace-insensitive), mapped linearly
    onto [0.1, 0.9] so it is always strictly between 0 and 1.
    """

    # Per-task solution keywords - presence indicates a correct fix
    TASK_SIGNALS = {
        "task_1_easy": [","],
        "task_2_medium": ["GROUP BY"],
        "task_3_hard": ["PARTITION BY"],
        "task_4_expert": ["2024-12", "12-01"],
        "task_5_optimization": ["INNER JOIN", "JOIN"],
        "task_6_migration": ["INSERT INTO", "DROP"],
        "task_7_chaos": ["UNIQUE", "COALESCE"],
    }

    @staticmethod
    def _normalize(text: str) -> str:
        """Upper-case *text* and collapse every whitespace run to one space."""
        return " ".join(text.upper().split())

    def grade(self, task_id: str, fixed_sql: str, **kwargs) -> float:
        """Grade a SQL submission.

        Args:
            task_id: The task identifier (e.g. 'task_1_easy').
            fixed_sql: The agent's submitted SQL fix. ``None`` or an empty
                string is treated as an empty submission.
            **kwargs: Accepted and ignored, so callers may pass extra fields.

        Returns:
            A float strictly inside (0, 1): 0.5 for a task id with no
            registered signals, otherwise a value in [0.1, 0.9] proportional
            to the share of the task's signals present in the submission.
        """
        signals = self.TASK_SIGNALS.get(task_id, [])
        if not signals:
            return 0.5  # Unknown task - neutral score

        # SQL is whitespace-insensitive: "GROUP\n  BY" is the same clause as
        # "GROUP BY". Normalise both sides so multi-word signals still match
        # when the submission breaks them across lines, tabs or extra spaces.
        sql_text = self._normalize(fixed_sql or "")
        hits = sum(1 for s in signals if self._normalize(s) in sql_text)
        raw = hits / len(signals)

        # Map the hit ratio onto [0.1, 0.9] - never touches 0.0 or 1.0.
        score = 0.1 + raw * 0.8
        # The clamp is a safety net only; score is already within [0.1, 0.9].
        return round(max(0.01, min(0.99, score)), 4)
|
openenv.yaml
CHANGED
|
@@ -4,7 +4,7 @@ description: >
|
|
| 4 |
SQL Debug & Data Pipeline Repair — an OpenEnv environment where an AI agent
|
| 5 |
diagnoses and fixes broken SQL queries and ETL pipelines executed against a
|
| 6 |
live DuckDB instance. Seven tasks ranging from easy (syntax fix) to expert
|
| 7 |
-
(chaos engineering).
|
| 8 |
|
| 9 |
author: sql-debug-env
|
| 10 |
tags:
|
|
@@ -22,55 +22,62 @@ tasks:
|
|
| 22 |
max_steps: 5
|
| 23 |
description: >
|
| 24 |
Fix a SQL SELECT query with a missing comma between column names.
|
| 25 |
-
|
|
|
|
| 26 |
baseline_score: 0.5
|
| 27 |
|
| 28 |
- id: task_2_medium
|
| 29 |
difficulty: medium
|
| 30 |
max_steps: 5
|
| 31 |
description: >
|
| 32 |
-
Fix a GROUP BY aggregation query — add GROUP BY
|
| 33 |
-
|
|
|
|
| 34 |
baseline_score: 0.5
|
| 35 |
|
| 36 |
- id: task_3_hard
|
| 37 |
difficulty: hard
|
| 38 |
max_steps: 5
|
| 39 |
description: >
|
| 40 |
-
Fix a RANK() window function
|
| 41 |
-
|
|
|
|
| 42 |
baseline_score: 0.5
|
| 43 |
|
| 44 |
- id: task_4_expert
|
| 45 |
difficulty: expert
|
| 46 |
max_steps: 5
|
| 47 |
description: >
|
| 48 |
-
Fix an invalid date literal (month 13) inside a CTE
|
| 49 |
-
|
|
|
|
| 50 |
baseline_score: 0.5
|
| 51 |
|
| 52 |
- id: task_5_optimization
|
| 53 |
difficulty: expert
|
| 54 |
max_steps: 5
|
| 55 |
description: >
|
| 56 |
-
Rewrite a
|
| 57 |
-
|
|
|
|
| 58 |
baseline_score: 0.5
|
| 59 |
|
| 60 |
- id: task_6_migration
|
| 61 |
difficulty: expert
|
| 62 |
max_steps: 5
|
| 63 |
description: >
|
| 64 |
-
|
| 65 |
-
|
|
|
|
| 66 |
baseline_score: 0.5
|
| 67 |
|
| 68 |
- id: task_7_chaos
|
| 69 |
difficulty: expert
|
| 70 |
max_steps: 5
|
| 71 |
description: >
|
| 72 |
-
Fix a live ETL pipeline
|
| 73 |
-
|
|
|
|
| 74 |
baseline_score: 0.5
|
| 75 |
|
| 76 |
observation_schema:
|
|
|
|
| 4 |
SQL Debug & Data Pipeline Repair — an OpenEnv environment where an AI agent
|
| 5 |
diagnoses and fixes broken SQL queries and ETL pipelines executed against a
|
| 6 |
live DuckDB instance. Seven tasks ranging from easy (syntax fix) to expert
|
| 7 |
+
(chaos engineering).
|
| 8 |
|
| 9 |
author: sql-debug-env
|
| 10 |
tags:
|
|
|
|
| 22 |
max_steps: 5
|
| 23 |
description: >
|
| 24 |
Fix a SQL SELECT query with a missing comma between column names.
|
| 25 |
+
grader: SQLGrader
|
| 26 |
+
grading_metric: accuracy
|
| 27 |
baseline_score: 0.5
|
| 28 |
|
| 29 |
- id: task_2_medium
|
| 30 |
difficulty: medium
|
| 31 |
max_steps: 5
|
| 32 |
description: >
|
| 33 |
+
Fix a GROUP BY aggregation query — add GROUP BY clause.
|
| 34 |
+
grader: SQLGrader
|
| 35 |
+
grading_metric: accuracy
|
| 36 |
baseline_score: 0.5
|
| 37 |
|
| 38 |
- id: task_3_hard
|
| 39 |
difficulty: hard
|
| 40 |
max_steps: 5
|
| 41 |
description: >
|
| 42 |
+
Fix a RANK() window function missing PARTITION BY department.
|
| 43 |
+
grader: SQLGrader
|
| 44 |
+
grading_metric: accuracy
|
| 45 |
baseline_score: 0.5
|
| 46 |
|
| 47 |
- id: task_4_expert
|
| 48 |
difficulty: expert
|
| 49 |
max_steps: 5
|
| 50 |
description: >
|
| 51 |
+
Fix an invalid date literal (month 13) inside a CTE.
|
| 52 |
+
grader: SQLGrader
|
| 53 |
+
grading_metric: accuracy
|
| 54 |
baseline_score: 0.5
|
| 55 |
|
| 56 |
- id: task_5_optimization
|
| 57 |
difficulty: expert
|
| 58 |
max_steps: 5
|
| 59 |
description: >
|
| 60 |
+
Rewrite a CROSS JOIN query to use INNER JOIN.
|
| 61 |
+
grader: SQLGrader
|
| 62 |
+
grading_metric: accuracy
|
| 63 |
baseline_score: 0.5
|
| 64 |
|
| 65 |
- id: task_6_migration
|
| 66 |
difficulty: expert
|
| 67 |
max_steps: 5
|
| 68 |
description: >
|
| 69 |
+
Migrate denormalized table to 3NF schema safely.
|
| 70 |
+
grader: SQLGrader
|
| 71 |
+
grading_metric: accuracy
|
| 72 |
baseline_score: 0.5
|
| 73 |
|
| 74 |
- id: task_7_chaos
|
| 75 |
difficulty: expert
|
| 76 |
max_steps: 5
|
| 77 |
description: >
|
| 78 |
+
Fix a live ETL pipeline with duplicate entries and NULL emails.
|
| 79 |
+
grader: SQLGrader
|
| 80 |
+
grading_metric: accuracy
|
| 81 |
baseline_score: 0.5
|
| 82 |
|
| 83 |
observation_schema:
|