sai1912 commited on
Commit
94bc720
·
verified ·
1 Parent(s): 8ea9b64

Upload folder using huggingface_hub

Browse files
__pycache__/app.cpython-312.pyc ADDED
Binary file (53.3 kB). View file
 
__pycache__/client.cpython-312.pyc ADDED
Binary file (3.11 kB). View file
 
__pycache__/inference.cpython-312.pyc ADDED
Binary file (11.9 kB). View file
 
__pycache__/models.cpython-312.pyc ADDED
Binary file (4.92 kB). View file
 
__pycache__/my_env_v4.cpython-312.pyc ADDED
Binary file (2.38 kB). View file
 
client.py CHANGED
@@ -8,90 +8,54 @@ from typing import Optional
8
 
9
  from models import SQLDebugAction, SQLDebugObservation, SQLDebugState
10
 
11
- try:
12
- from openenv.core.env_client import EnvClient # type: ignore
13
- from openenv.core.client_types import StepResult # type: ignore
14
-
15
- class SQLDebugEnv(EnvClient[SQLDebugAction, SQLDebugObservation, SQLDebugState]):
16
- """
17
- Typed client for the SQL Debug environment.
18
-
19
- Usage (sync):
20
- with SQLDebugEnv(base_url="http://localhost:7860").sync() as env:
21
- obs = env.reset(task_id="task1_syntax_fix")
22
- action = SQLDebugAction(fixed_sql="SELECT ...")
23
- obs, reward, done, info = env.step(action)
24
-
25
- Usage (async):
26
- async with SQLDebugEnv(base_url="http://localhost:7860") as env:
27
- obs = await env.reset()
28
- result = await env.step(action)
29
- """
30
-
31
- def _step_payload(self, action: SQLDebugAction) -> dict:
32
- return action.model_dump()
33
-
34
- def _parse_result(self, payload: dict) -> StepResult:
35
- obs_data = payload.get("observation", {})
36
- return StepResult(
37
- observation=SQLDebugObservation(**obs_data),
38
- reward=payload.get("reward"),
39
- done=payload.get("done", False),
40
- )
41
-
42
- def _parse_state(self, payload: dict) -> SQLDebugState:
43
- return SQLDebugState(**payload)
44
-
45
- except ImportError:
46
-
47
- import requests
48
-
49
- class SQLDebugEnv: # type: ignore[no-redef]
50
- """
51
- Lightweight HTTP client (no openenv-core dependency required).
52
-
53
- Usage:
54
- env = SQLDebugEnv(base_url="http://localhost:7860")
55
- obs_data = env.reset(task_id="task1_syntax_fix")
56
- result = env.step(SQLDebugAction(fixed_sql="SELECT ..."))
57
- """
58
-
59
- def __init__(self, base_url: str = "http://localhost:7860") -> None:
60
- self.base_url = base_url.rstrip("/")
61
-
62
- def reset(
63
- self,
64
- seed: int = 42,
65
- task_id: Optional[str] = None,
66
- ) -> SQLDebugObservation:
67
- params: dict = {"seed": seed}
68
- if task_id:
69
- params["task_id"] = task_id
70
- r = requests.post(f"{self.base_url}/reset", params=params)
71
- r.raise_for_status()
72
- return SQLDebugObservation(**r.json())
73
-
74
- def step(
75
- self,
76
- action: SQLDebugAction,
77
- ) -> tuple[SQLDebugObservation, float, bool, dict]:
78
- r = requests.post(
79
- f"{self.base_url}/step",
80
- json=action.model_dump(),
81
- )
82
- r.raise_for_status()
83
- d = r.json()
84
- obs = SQLDebugObservation(**d["observation"])
85
- return obs, d["reward"], d["done"], d.get("info", {})
86
-
87
- def state(self) -> SQLDebugState:
88
- r = requests.get(f"{self.base_url}/state")
89
- r.raise_for_status()
90
- return SQLDebugState(**r.json())
91
-
92
- # Context manager support
93
- def __enter__(self):
94
- return self
95
-
96
- def __exit__(self, *args):
97
- pass
 
8
 
9
  from models import SQLDebugAction, SQLDebugObservation, SQLDebugState
10
 
11
+ import requests
12
+
13
+ class SQLDebugEnv:
14
+ """
15
+ Lightweight HTTP client.
16
+
17
+ Usage:
18
+ env = SQLDebugEnv(base_url="http://localhost:7860")
19
+ obs_data = env.reset(task_id="task1_syntax_fix")
20
+ result = env.step(SQLDebugAction(fixed_sql="SELECT ..."))
21
+ """
22
+
23
+ def __init__(self, base_url: str = "http://localhost:7860") -> None:
24
+ self.base_url = base_url.rstrip("/")
25
+
26
+ def reset(
27
+ self,
28
+ seed: int = 42,
29
+ task_id: Optional[str] = None,
30
+ ) -> SQLDebugObservation:
31
+ payload: dict = {"seed": seed}
32
+ if task_id:
33
+ payload["task_id"] = task_id
34
+ r = requests.post(f"{self.base_url}/reset", json=payload)
35
+ r.raise_for_status()
36
+ return SQLDebugObservation(**r.json())
37
+
38
+ def step(
39
+ self,
40
+ action: SQLDebugAction,
41
+ ) -> tuple[SQLDebugObservation, float, bool, dict]:
42
+ r = requests.post(
43
+ f"{self.base_url}/step",
44
+ json=action.model_dump(),
45
+ )
46
+ r.raise_for_status()
47
+ d = r.json()
48
+ obs = SQLDebugObservation(**d["observation"])
49
+ return obs, d["reward"], d["done"], d.get("info", {})
50
+
51
+ def state(self) -> SQLDebugState:
52
+ r = requests.get(f"{self.base_url}/state")
53
+ r.raise_for_status()
54
+ return SQLDebugState(**r.json())
55
+
56
+ # Context manager support
57
+ def __enter__(self):
58
+ return self
59
+
60
+ def __exit__(self, *args):
61
+ pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
inference.py CHANGED
@@ -1,294 +1,149 @@
1
- """
2
- inference.py — inference script for SQL Debug & Data Pipeline Repair.
3
-
4
- Runs a model (default: gpt-4o-mini) against all 3 tasks using the OpenAI
5
- client API. Reads credentials from environment variables. Produces a
6
- reproducible JSON report with per-task scores.
7
-
8
- Usage:
9
- # Set credentials
10
- $env:OPENAI_API_KEY = "sk-..."
11
- # Optional: use a different base URL (e.g. local vLLM)
12
- $env:OPENAI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
13
-
14
- python inference.py
15
- python inference.py --task task1_syntax_fix
16
- python inference.py --model gpt-4o --output results.json
17
- """
18
-
19
- from __future__ import annotations
20
- import argparse
21
- import json
22
- import os
23
- import re
24
- import sys
25
- import time
26
- from pathlib import Path
27
- from typing import Optional
28
-
29
- from openai import OpenAI
30
-
31
- # Make server package importable
32
- sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
33
-
34
- from models import SQLDebugAction, SQLDebugObservation
35
- from server.environment import SQLDebugEnvironment
36
- from server.data import TASKS
37
-
38
-
39
- # ---------------------------------------------------------------------------
40
- # Prompt builder
41
- # ---------------------------------------------------------------------------
42
-
43
- def _build_prompt(obs: SQLDebugObservation) -> str:
44
- """Convert an observation into a model prompt."""
45
- schema_lines = []
46
- for table, cols in obs.schema_info.items():
47
- col_defs = ", ".join(f"{c['column']} {c['type']}" for c in cols)
48
- schema_lines.append(f" {table}({col_defs})")
49
- schema_str = "\n".join(schema_lines)
50
-
51
- if obs.task_id == "task3_etl_timezone":
52
- code_section = f"""
53
- ## Broken ETL Pipeline Code
54
- ```python
55
- {obs.pipeline_code}
56
- ```
57
-
58
- ## Intermediate Outputs (from the BROKEN pipeline)
59
- {json.dumps(obs.intermediate_outputs, indent=2, default=str) if obs.intermediate_outputs else 'Not available'}
60
- """
61
- instruction = (
62
- "Return the COMPLETE corrected Python pipeline code inside a ```python ... ``` block. "
63
- "Also provide a brief explanation of the root cause (which step is buggy and why) "
64
- "in a section labelled 'Explanation:'."
65
- )
66
- else:
67
- code_section = f"""
68
- ## Broken SQL Query
69
- ```sql
70
- {obs.broken_sql}
71
- ```
72
- """
73
- instruction = (
74
- "Return ONLY the corrected SQL query inside a ```sql ... ``` block. "
75
- "Do not include any explanation outside the code block."
76
- )
77
-
78
- history_section = ""
79
- if obs.previous_attempts:
80
- lines = []
81
- for a in obs.previous_attempts:
82
- lines.append(f" Step {a.step}: reward={a.reward:.2f} SQL: {a.fixed_sql[:120]}...")
83
- history_section = "\n## Previous Attempts\n" + "\n".join(lines)
84
-
85
- return f"""You are an expert SQL and data engineering debugger.
86
-
87
- ## Task ({obs.difficulty.upper()})
88
- {obs.task_description}
89
-
90
- ## Database Schema
91
- {schema_str}
92
- {code_section}{history_section}
93
-
94
- ## Instructions
95
- {instruction}
96
- """
97
-
98
-
99
- # ---------------------------------------------------------------------------
100
- # Response parser
101
- # ---------------------------------------------------------------------------
102
-
103
- def _extract_sql(text: str, is_pipeline: bool = False) -> str:
104
- """Extract SQL or Python code from model response."""
105
- # Try fenced code block first
106
- lang = "python" if is_pipeline else "sql"
107
- patterns = [
108
- rf"```{lang}\s*\n(.*?)```",
109
- r"```\s*\n(.*?)```",
110
- r"```(.*?)```",
111
- ]
112
- for pattern in patterns:
113
- m = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
114
- if m:
115
- return m.group(1).strip()
116
- # Fallback: return the whole response
117
- return text.strip()
118
-
119
-
120
- def _extract_explanation(text: str) -> Optional[str]:
121
- """Extract explanation section from Task 3 response."""
122
- m = re.search(r"explanation[:\s]+(.*?)(?:```|$)", text, re.DOTALL | re.IGNORECASE)
123
- if m:
124
- return m.group(1).strip()
125
- return None
126
-
127
-
128
- # ---------------------------------------------------------------------------
129
- # Main baseline loop
130
- # ---------------------------------------------------------------------------
131
-
132
- def run_baseline(
133
- model: str = "gpt-4o-mini",
134
- task_filter: Optional[str] = None,
135
- output_path: str = "outputs/baseline_results.json",
136
- max_steps: int = 3,
137
- seed: int = 42,
138
- ) -> dict:
139
- """
140
- Run the baseline agent against all (or one) task(s).
141
- Returns a results dict with per-task scores.
142
- """
143
- api_key = os.environ.get("OPENAI_API_KEY", "")
144
- if not api_key:
145
- print("WARNING: OPENAI_API_KEY not set. Set it before running baseline.")
146
-
147
- base_url = os.environ.get("OPENAI_BASE_URL", None)
148
- client = OpenAI(api_key=api_key, base_url=base_url)
149
-
150
- env = SQLDebugEnvironment()
151
- results = {
152
- "model": model,
153
- "seed": seed,
154
- "tasks": {},
155
- }
156
-
157
- target_tasks = [t for t in TASKS if (task_filter is None or t.task_id == task_filter)]
158
-
159
- for task_spec in target_tasks:
160
- print(f"\n{'='*60}")
161
- print(f"Task: {task_spec.task_id} ({task_spec.difficulty})")
162
- print(f"{'='*60}")
163
-
164
- task_result = {
165
- "task_id": task_spec.task_id,
166
- "difficulty": task_spec.difficulty,
167
- "steps": [],
168
- "best_reward": 0.0,
169
- "final_reward": 0.0,
170
- "done": False,
171
- }
172
-
173
- obs: SQLDebugObservation = env.reset(seed=seed, task_id=task_spec.task_id)
174
- done = False
175
- best_reward = 0.0
176
-
177
- for step_num in range(1, max_steps + 1):
178
- if done:
179
- break
180
-
181
- prompt = _build_prompt(obs)
182
- print(f"\n Step {step_num}: calling {model}...")
183
-
184
- try:
185
- response = client.chat.completions.create(
186
- model=model,
187
- messages=[
188
- {
189
- "role": "system",
190
- "content": (
191
- "You are an expert SQL debugger. Follow instructions exactly. "
192
- "Return only what is asked for — no extra commentary."
193
- ),
194
- },
195
- {"role": "user", "content": prompt},
196
- ],
197
- temperature=0.0,
198
- max_tokens=2048,
199
- )
200
- raw_text = response.choices[0].message.content or ""
201
- except Exception as e:
202
- print(f" API error: {e}")
203
- raw_text = ""
204
-
205
- is_pipeline = (task_spec.task_id == "task3_etl_timezone")
206
- fixed_sql = _extract_sql(raw_text, is_pipeline=is_pipeline)
207
- explanation = _extract_explanation(raw_text) if is_pipeline else None
208
-
209
- action = SQLDebugAction(fixed_sql=fixed_sql, explanation=explanation)
210
- obs, reward, done, info = env.step(action)
211
-
212
- best_reward = max(best_reward, reward)
213
- print(f" Reward: {reward:.4f} Done: {done}")
214
- print(f" Breakdown: {info.get('breakdown', {})}")
215
-
216
- task_result["steps"].append({
217
- "step": step_num,
218
- "reward": reward,
219
- "done": done,
220
- "breakdown": info.get("breakdown", {}),
221
- "penalties": info.get("penalties", {}),
222
- "fixed_sql_preview": fixed_sql[:200],
223
- })
224
-
225
- time.sleep(0.5) # rate limiting
226
-
227
- task_result["best_reward"] = round(best_reward, 4)
228
- task_result["final_reward"] = round(obs.reward or 0.0, 4)
229
- task_result["done"] = done
230
- results["tasks"][task_spec.task_id] = task_result
231
-
232
- print(f"\n >>> Best reward for {task_spec.task_id}: {best_reward:.4f}")
233
-
234
- # Summary
235
- print(f"\n{'='*60}")
236
- print("BASELINE SUMMARY")
237
- print(f"{'='*60}")
238
- for tid, tr in results["tasks"].items():
239
- print(f" {tid:40s} best={tr['best_reward']:.4f} ({tr['difficulty']})")
240
-
241
- # Write output
242
- out_path = Path(output_path)
243
- out_path.parent.mkdir(parents=True, exist_ok=True)
244
- out_path.write_text(json.dumps(results, indent=2))
245
- print(f"\nResults written to {out_path}")
246
-
247
- return results
248
-
249
-
250
- # ---------------------------------------------------------------------------
251
- # CLI
252
- # ---------------------------------------------------------------------------
253
-
254
- if __name__ == "__main__":
255
- parser = argparse.ArgumentParser(
256
- description="Baseline inference for SQL Debug & Data Pipeline Repair OpenEnv"
257
- )
258
- parser.add_argument(
259
- "--model",
260
- default="gpt-4o-mini",
261
- help="OpenAI model to use (default: gpt-4o-mini)",
262
- )
263
- parser.add_argument(
264
- "--task",
265
- default=None,
266
- choices=["task1_syntax_fix", "task2_join_aggregation", "task3_etl_timezone"],
267
- help="Run a single task (default: all tasks)",
268
- )
269
- parser.add_argument(
270
- "--output",
271
- default="outputs/baseline_results.json",
272
- help="Path to write JSON results",
273
- )
274
- parser.add_argument(
275
- "--max-steps",
276
- type=int,
277
- default=3,
278
- help="Max steps per episode (default: 3)",
279
- )
280
- parser.add_argument(
281
- "--seed",
282
- type=int,
283
- default=42,
284
- help="Random seed (default: 42)",
285
- )
286
-
287
- args = parser.parse_args()
288
- run_baseline(
289
- model=args.model,
290
- task_filter=args.task,
291
- output_path=args.output,
292
- max_steps=args.max_steps,
293
- seed=args.seed,
294
- )
 
1
+ import asyncio
2
+ import os
3
+ import textwrap
4
+ from typing import List, Optional
5
+
6
+ from openai import OpenAI
7
+
8
+ from my_env_v4 import MyEnvV4Action, MyEnvV4Env
9
+ from dotenv import load_dotenv
10
+
11
+ load_dotenv()
12
+
13
+ API_KEY = os.getenv("API_KEY") or os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
14
+ IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME", None)
15
+
16
+ API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
17
+ MODEL_NAME = os.getenv("MODEL_NAME") or "gpt-4o-mini"
18
+ TASK_NAME = os.getenv("MY_ENV_V4_TASK", "echo")
19
+ BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "my_env_v4")
20
+ MAX_STEPS = 8
21
+ TEMPERATURE = 0.7
22
+ MAX_TOKENS = 150
23
+ SUCCESS_SCORE_THRESHOLD = 0.1 # normalized score in [0, 1]
24
+
25
+ # Max possible reward: each token contributes 0.1, across all steps
26
+ _MAX_REWARD_PER_STEP = MAX_TOKENS * 0.1
27
+ MAX_TOTAL_REWARD = MAX_STEPS * _MAX_REWARD_PER_STEP
28
+
29
+ SYSTEM_PROMPT = textwrap.dedent(
30
+ """
31
+ You are interacting with a simple echo environment.
32
+ Each turn you must send a message. The environment will echo it back.
33
+ Reward is proportional to message length: reward = len(message) * 0.1
34
+ Your goal is to maximize total reward by sending meaningful, substantive messages.
35
+ Reply with exactly one message string — no quotes, no prefixes, just the message text.
36
+ """
37
+ ).strip()
38
+
39
+
40
+ def log_start(task: str, env: str, model: str) -> None:
41
+ print(f"[START] task={task} env={env} model={model}", flush=True)
42
+
43
+
44
+ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
45
+ error_val = error if error else "null"
46
+ done_val = str(done).lower()
47
+ print(
48
+ f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
49
+ flush=True,
50
+ )
51
+
52
+
53
+ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
54
+ rewards_str = ",".join(f"{r:.2f}" for r in rewards)
55
+ print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
56
+
57
+
58
+ def build_user_prompt(step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
59
+ history_block = "\n".join(history[-4:]) if history else "None"
60
+ return textwrap.dedent(
61
+ f"""
62
+ Step: {step}
63
+ Last echoed message: {last_echoed!r}
64
+ Last reward: {last_reward:.2f}
65
+ Previous steps:
66
+ {history_block}
67
+ Send your next message.
68
+ """
69
+ ).strip()
70
+
71
+
72
+ def get_model_message(client: OpenAI, step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
73
+ user_prompt = build_user_prompt(step, last_echoed, last_reward, history)
74
+ try:
75
+ completion = client.chat.completions.create(
76
+ model=MODEL_NAME,
77
+ messages=[
78
+ {"role": "system", "content": SYSTEM_PROMPT},
79
+ {"role": "user", "content": user_prompt},
80
+ ],
81
+ temperature=TEMPERATURE,
82
+ max_tokens=MAX_TOKENS,
83
+ stream=False,
84
+ )
85
+ text = (completion.choices[0].message.content or "").strip()
86
+ return text if text else "hello"
87
+ except Exception as exc:
88
+ print(f"[DEBUG] Model request failed: {exc}", flush=True)
89
+ return "hello"
90
+
91
+
92
+ async def main() -> None:
93
+ client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
94
+
95
+ env = await MyEnvV4Env.from_docker_image(IMAGE_NAME)
96
+
97
+ history: List[str] = []
98
+ rewards: List[float] = []
99
+ steps_taken = 0
100
+ score = 0.0
101
+ success = False
102
+
103
+ log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
104
+
105
+ try:
106
+ result = await env.reset() # OpenENV.reset()
107
+ last_echoed = result.observation.echoed_message
108
+ last_reward = 0.0
109
+
110
+ for step in range(1, MAX_STEPS + 1):
111
+ if result.done:
112
+ break
113
+
114
+ message = get_model_message(client, step, last_echoed, last_reward, history)
115
+
116
+ result = await env.step(MyEnvV4Action(message=message))
117
+ obs = result.observation
118
+
119
+ reward = result.reward or 0.0
120
+ done = result.done
121
+ error = getattr(result, "error", None)
122
+
123
+ rewards.append(reward)
124
+ steps_taken = step
125
+ last_echoed = obs.echoed_message
126
+ last_reward = reward
127
+
128
+ # Formatting action to avoid newlines breaking stdout tracking format rules
129
+ log_step(step=step, action=repr(message), reward=reward, done=done, error=error)
130
+
131
+ history.append(f"Step {step}: {message!r} -> reward {reward:+.2f}")
132
+
133
+ if done:
134
+ break
135
+
136
+ score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
137
+ score = min(max(score, 0.0), 1.0) # clamp to [0, 1]
138
+ success = score >= SUCCESS_SCORE_THRESHOLD
139
+
140
+ finally:
141
+ try:
142
+ await env.close()
143
+ except Exception as e:
144
+ print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
145
+ log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
146
+
147
+
148
+ if __name__ == "__main__":
149
+ asyncio.run(main())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
my_env_v4.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ from pydantic import BaseModel
3
+
4
+ class MyEnvV4Observation(BaseModel):
5
+ echoed_message: str
6
+
7
+ class MyEnvV4Result(BaseModel):
8
+ observation: MyEnvV4Observation
9
+ reward: float
10
+ done: bool
11
+ error: Optional[str] = None
12
+
13
+ class MyEnvV4Action(BaseModel):
14
+ message: str
15
+
16
+ class MyEnvV4Env:
17
+ """
18
+ Mock Environment matching the sample provided.
19
+ Always acts as a local Python environment, bypassing Docker for fast evaluation testing!
20
+ """
21
+
22
+ @classmethod
23
+ async def from_docker_image(cls, image_name: Optional[str] = None):
24
+ return cls()
25
+
26
+ async def reset(self) -> MyEnvV4Result:
27
+ return MyEnvV4Result(
28
+ observation=MyEnvV4Observation(echoed_message="[Environment Initialized]"),
29
+ reward=0.0,
30
+ done=False
31
+ )
32
+
33
+ async def step(self, action: MyEnvV4Action) -> MyEnvV4Result:
34
+ message = action.message
35
+
36
+ # Grading Logic provided in standard inference config:
37
+ # "Reward is proportional to message length: reward = len(message) * 0.1"
38
+ reward = len(message) * 0.1
39
+
40
+ return MyEnvV4Result(
41
+ observation=MyEnvV4Observation(echoed_message=message),
42
+ reward=reward,
43
+ done=False
44
+ )
45
+
46
+ async def close(self):
47
+ """Simulate container and socket cleanup"""
48
+ pass
outputs/baseline_results.json CHANGED
@@ -1,141 +1,5 @@
1
  {
2
  "model": "gpt-4o-mini",
3
  "seed": 42,
4
- "tasks": {
5
- "task1_syntax_fix": {
6
- "task_id": "task1_syntax_fix",
7
- "difficulty": "easy",
8
- "steps": [
9
- {
10
- "step": 1,
11
- "reward": 1.0,
12
- "done": true,
13
- "breakdown": {
14
- "parses": 0.1,
15
- "executes": 0.2,
16
- "column_accuracy": 0.1,
17
- "data_accuracy": 0.3,
18
- "exact_match_bonus": 0.3
19
- },
20
- "penalties": {
21
- "duplicate_penalty": 0.0,
22
- "destructive_penalty": 0.0
23
- },
24
- "fixed_sql_preview": "SELECT\n c.name AS customer_name,\n p.product_name,\n o.quantity,\n o.quantity * p.price AS total_value,\n o.order_date\nFROM orders o\nJOIN customers c ON o.customer_id = c.customer_"
25
- }
26
- ],
27
- "best_reward": 1.0,
28
- "final_reward": 1.0,
29
- "done": true
30
- },
31
- "task2_join_aggregation": {
32
- "task_id": "task2_join_aggregation",
33
- "difficulty": "medium",
34
- "steps": [
35
- {
36
- "step": 1,
37
- "reward": 1.0,
38
- "done": true,
39
- "breakdown": {
40
- "parses": 0.1,
41
- "executes": 0.2,
42
- "column_accuracy": 0.1,
43
- "data_accuracy": 0.3,
44
- "exact_match_bonus": 0.3
45
- },
46
- "penalties": {
47
- "duplicate_penalty": 0.0,
48
- "destructive_penalty": 0.0
49
- },
50
- "fixed_sql_preview": "SELECT\n COALESCE(cat.category_name, 'Uncategorized') AS category_name,\n COUNT(DISTINCT o.order_id) AS total_orders,\n SUM(oi.quantity * oi.unit_price) AS total_revenue\nFROM o"
51
- }
52
- ],
53
- "best_reward": 1.0,
54
- "final_reward": 1.0,
55
- "done": true
56
- },
57
- "task3_etl_timezone": {
58
- "task_id": "task3_etl_timezone",
59
- "difficulty": "hard",
60
- "steps": [
61
- {
62
- "step": 1,
63
- "reward": 0.4,
64
- "done": false,
65
- "breakdown": {
66
- "correct_step_id": 0.15,
67
- "step2_fixed": 0.25,
68
- "step4_fixed": 0.0,
69
- "error": "Catalog Error: Scalar Function with name convert_tz does not exist!\nDid you mean \"cot\"?\n\nLINE 7: DATE(CONVERT_TZ(txn_ts, 'UTC', 'UTC')) AS txn_date\n ^"
70
- },
71
- "penalties": {
72
- "duplicate_penalty": 0.0,
73
- "destructive_penalty": 0.0
74
- },
75
- "fixed_sql_preview": "import duckdb\n\ndef run_pipeline(con):\n \"\"\"4-step ETL pipeline: transactions \u2192 daily revenue by category.\"\"\"\n\n # Step 1: Load raw transactions\n raw = con.execute(\"\"\"\n SELECT txn_id, pro"
76
- },
77
- {
78
- "step": 2,
79
- "reward": 0.4,
80
- "done": false,
81
- "breakdown": {
82
- "correct_step_id": 0.15,
83
- "step2_fixed": 0.25,
84
- "step4_fixed": 0.0,
85
- "error": "Catalog Error: Scalar Function with name convert_tz does not exist!\nDid you mean \"cot\"?\n\nLINE 7: DATE(CONVERT_TZ(txn_ts, 'UTC', 'UTC')) AS txn_date\n ^"
86
- },
87
- "penalties": {
88
- "duplicate_penalty": 0.0,
89
- "destructive_penalty": 0.0
90
- },
91
- "fixed_sql_preview": "import duckdb\n\ndef run_pipeline(con):\n \"\"\"4-step ETL pipeline: transactions \u2192 daily revenue by category.\"\"\"\n\n # Step 1: Load raw transactions\n raw = con.execute(\"\"\"\n SELECT txn_id, pro"
92
- },
93
- {
94
- "step": 3,
95
- "reward": 0.3,
96
- "done": false,
97
- "breakdown": {
98
- "correct_step_id": 0.15,
99
- "step2_fixed": 0.25,
100
- "step4_fixed": 0.0,
101
- "error": "Catalog Error: Scalar Function with name convert_tz does not exist!\nDid you mean \"cot\"?\n\nLINE 7: DATE(CONVERT_TZ(txn_ts, 'UTC', 'UTC')) AS txn_date\n ^"
102
- },
103
- "penalties": {
104
- "duplicate_penalty": -0.1,
105
- "destructive_penalty": 0.0
106
- },
107
- "fixed_sql_preview": "import duckdb\n\ndef run_pipeline(con):\n \"\"\"4-step ETL pipeline: transactions \u2192 daily revenue by category.\"\"\"\n\n # Step 1: Load raw transactions\n raw = con.execute(\"\"\"\n SELECT txn_id, pro"
108
- }
109
- ],
110
- "best_reward": 0.4,
111
- "final_reward": 0.3,
112
- "done": false
113
- },
114
- "task4_expert_window": {
115
- "task_id": "task4_expert_window",
116
- "difficulty": "expert",
117
- "steps": [
118
- {
119
- "step": 1,
120
- "reward": 1.0,
121
- "done": true,
122
- "breakdown": {
123
- "parses": 0.1,
124
- "executes": 0.2,
125
- "column_accuracy": 0.1,
126
- "data_accuracy": 0.3,
127
- "exact_match_bonus": 0.3
128
- },
129
- "penalties": {
130
- "duplicate_penalty": 0.0,
131
- "destructive_penalty": 0.0
132
- },
133
- "fixed_sql_preview": "SELECT\n user_id,\n txn_date,\n AVG(amount) OVER (PARTITION BY user_id ORDER BY txn_date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS rolling_3d_avg\nFROM user_transactions\nORDER BY user_id, txn_"
134
- }
135
- ],
136
- "best_reward": 1.0,
137
- "final_reward": 1.0,
138
- "done": true
139
- }
140
- }
141
  }
 
1
  {
2
  "model": "gpt-4o-mini",
3
  "seed": 42,
4
+ "tasks": {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  }