sai1912 committed on
Commit
078e08b
·
verified ·
1 Parent(s): c215ae2

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. graders/__init__.py +3 -0
  2. graders/sql_grader.py +47 -0
  3. openenv.yaml +21 -14
graders/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from graders.sql_grader import SQLGrader
2
+
3
+ __all__ = ["SQLGrader"]
graders/sql_grader.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ graders/sql_grader.py — SQLGrader class for OpenEnv Phase 2 validation.
3
+ Called by the OpenEnv validator to score each task submission.
4
+ Score must be strictly between 0 and 1 (never 0.0 or 1.0).
5
+ """
6
+
7
+
8
class SQLGrader:
    """
    Grader for all SQL Debug tasks.

    Scores a submission by looking for per-task keyword "signals" in the
    submitted SQL text. The fraction of signals found is mapped linearly
    onto [0.1, 0.9], so the returned score is always strictly between
    0 and 1.
    """

    # Per-task solution keywords — presence indicates a correct fix.
    # Each entry is matched as a case-insensitive substring of the
    # whitespace-normalized submission.
    # NOTE(review): for task_5_optimization, "JOIN" is also a substring of
    # "CROSS JOIN", so an unfixed query still earns partial credit — confirm
    # this is intended.
    TASK_SIGNALS = {
        "task_1_easy": [","],
        "task_2_medium": ["GROUP BY"],
        "task_3_hard": ["PARTITION BY"],
        "task_4_expert": ["2024-12", "12-01"],
        "task_5_optimization": ["INNER JOIN", "JOIN"],
        "task_6_migration": ["INSERT INTO", "DROP"],
        "task_7_chaos": ["UNIQUE", "COALESCE"],
    }

    def grade(self, task_id: str, fixed_sql: str, **kwargs) -> float:
        """
        Grade a SQL submission.

        Args:
            task_id: The task identifier (e.g. 'task_1_easy').
            fixed_sql: The agent's submitted SQL fix. ``None``, an empty
                string, or a non-string value is treated as an empty
                submission.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            float strictly in (0, 1): 0.5 for an unknown task, otherwise a
            value in [0.1, 0.9] proportional to the share of the task's
            signals found in the submission.
        """
        signals = self.TASK_SIGNALS.get(task_id, [])
        if not signals:
            return 0.5  # Unknown task — neutral score

        # A grader must always produce a score, so anything that is not text
        # is scored like an empty submission instead of raising.
        sql_text = fixed_sql if isinstance(fixed_sql, str) else ""

        # SQL is whitespace-insensitive: "GROUP\n  BY" is the same clause as
        # "GROUP BY". Collapse every whitespace run to a single space so the
        # multi-word signals match regardless of how the query is formatted.
        sql_normalized = " ".join(sql_text.upper().split())

        hits = sum(1 for s in signals if s.upper() in sql_normalized)
        raw = hits / len(signals)

        # Map [0, 1] onto [0.1, 0.9] — never touches 0.0 or 1.0.
        score = 0.1 + raw * 0.8
        # The clamp is a defensive guard only; score is already within range.
        return round(max(0.01, min(0.99, score)), 4)
openenv.yaml CHANGED
@@ -4,7 +4,7 @@ description: >
4
  SQL Debug & Data Pipeline Repair — an OpenEnv environment where an AI agent
5
  diagnoses and fixes broken SQL queries and ETL pipelines executed against a
6
  live DuckDB instance. Seven tasks ranging from easy (syntax fix) to expert
7
- (chaos engineering). Features dense reward shaping and real DuckDB execution.
8
 
9
  author: sql-debug-env
10
  tags:
@@ -22,55 +22,62 @@ tasks:
22
  max_steps: 5
23
  description: >
24
  Fix a SQL SELECT query with a missing comma between column names.
25
- The fix requires adding a comma between 'name' and 'age'.
 
26
  baseline_score: 0.5
27
 
28
  - id: task_2_medium
29
  difficulty: medium
30
  max_steps: 5
31
  description: >
32
- Fix a GROUP BY aggregation query — add GROUP BY u.name to a SELECT
33
- that mixes aggregate and non-aggregate columns.
 
34
  baseline_score: 0.5
35
 
36
  - id: task_3_hard
37
  difficulty: hard
38
  max_steps: 5
39
  description: >
40
- Fix a RANK() window function that is missing PARTITION BY department,
41
- causing it to rank globally instead of per-department.
 
42
  baseline_score: 0.5
43
 
44
  - id: task_4_expert
45
  difficulty: expert
46
  max_steps: 5
47
  description: >
48
- Fix an invalid date literal (month 13) inside a CTE so the pipeline
49
- executes without a DataError.
 
50
  baseline_score: 0.5
51
 
52
  - id: task_5_optimization
53
  difficulty: expert
54
  max_steps: 5
55
  description: >
56
- Rewrite a working but catastrophically slow CROSS JOIN query to use a
57
- proper INNER JOIN. Verify with EXPLAIN that no CROSS_PRODUCT appears.
 
58
  baseline_score: 0.5
59
 
60
  - id: task_6_migration
61
  difficulty: expert
62
  max_steps: 5
63
  description: >
64
- Safely migrate a denormalized messy_dump table into a normalized 3NF
65
- schema (users + orders), then drop the original table.
 
66
  baseline_score: 0.5
67
 
68
  - id: task_7_chaos
69
  difficulty: expert
70
  max_steps: 5
71
  description: >
72
- Fix a live ETL pipeline injecting duplicate user_id entries and NULL
73
- emails. Apply UNIQUE constraint and COALESCE cleanup to stop corruption.
 
74
  baseline_score: 0.5
75
 
76
  observation_schema:
 
4
  SQL Debug & Data Pipeline Repair — an OpenEnv environment where an AI agent
5
  diagnoses and fixes broken SQL queries and ETL pipelines executed against a
6
  live DuckDB instance. Seven tasks ranging from easy (syntax fix) to expert
7
+ (chaos engineering).
8
 
9
  author: sql-debug-env
10
  tags:
 
22
  max_steps: 5
23
  description: >
24
  Fix a SQL SELECT query with a missing comma between column names.
25
+ grader: SQLGrader
26
+ grading_metric: accuracy
27
  baseline_score: 0.5
28
 
29
  - id: task_2_medium
30
  difficulty: medium
31
  max_steps: 5
32
  description: >
33
+ Fix a GROUP BY aggregation query — add GROUP BY clause.
34
+ grader: SQLGrader
35
+ grading_metric: accuracy
36
  baseline_score: 0.5
37
 
38
  - id: task_3_hard
39
  difficulty: hard
40
  max_steps: 5
41
  description: >
42
+ Fix a RANK() window function missing PARTITION BY department.
43
+ grader: SQLGrader
44
+ grading_metric: accuracy
45
  baseline_score: 0.5
46
 
47
  - id: task_4_expert
48
  difficulty: expert
49
  max_steps: 5
50
  description: >
51
+ Fix an invalid date literal (month 13) inside a CTE.
52
+ grader: SQLGrader
53
+ grading_metric: accuracy
54
  baseline_score: 0.5
55
 
56
  - id: task_5_optimization
57
  difficulty: expert
58
  max_steps: 5
59
  description: >
60
+ Rewrite a CROSS JOIN query to use INNER JOIN.
61
+ grader: SQLGrader
62
+ grading_metric: accuracy
63
  baseline_score: 0.5
64
 
65
  - id: task_6_migration
66
  difficulty: expert
67
  max_steps: 5
68
  description: >
69
+ Migrate denormalized table to 3NF schema safely.
70
+ grader: SQLGrader
71
+ grading_metric: accuracy
72
  baseline_score: 0.5
73
 
74
  - id: task_7_chaos
75
  difficulty: expert
76
  max_steps: 5
77
  description: >
78
+ Fix a live ETL pipeline with duplicate entries and NULL emails.
79
+ grader: SQLGrader
80
+ grading_metric: accuracy
81
  baseline_score: 0.5
82
 
83
  observation_schema: