DEVessi committed on
Commit
516d2c6
·
verified ·
1 Parent(s): 32dd99b

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. inference.py +84 -78
  2. server/devops_sandbox_environment.py +55 -10
inference.py CHANGED
@@ -100,86 +100,92 @@ def main():
100
 
101
  client = OpenAI(api_key=HF_TOKEN or "dummy_key", base_url=API_BASE_URL)
102
 
 
 
103
  # Note: openenv evaluation specifically needs exactly 3 things: [START], [STEP] logs, [END]
104
- messages = [{"role": "system", "content": SYSTEM_PROMPT}]
105
-
106
- with DevopsSandboxEnv(base_url=ENV_URL).sync() as env:
107
- result = env.reset()
108
- obs = result.observation
109
-
110
- print(f"[START] task={TASK_NAME} env={BENCHMARK} model={MODEL_NAME}", flush=True)
111
-
112
- messages.append({
113
- "role": "user",
114
- "content": (
115
- f"Here is the initial state of the broken app:\n\n"
116
- f"```\n{obs.stdout}\n```\n\n"
117
- f"Current directory: {obs.current_dir}\n"
118
- f"Score: {obs.grader_score}/1.0\n\n"
119
- f"What bash command should I run first?"
120
- ),
121
- })
122
-
123
- rewards = []
124
- is_done = False
125
- steps_taken = 0
126
- final_score = 0.0
127
-
128
- for turn in range(1, MAX_TURNS + 1):
129
- try:
130
- response = client.chat.completions.create(
131
- model=MODEL_NAME,
132
- messages=messages,
133
- temperature=0.2,
134
- max_tokens=256,
135
- )
136
- llm_text = response.choices[0].message.content or ""
137
- except Exception as e:
138
- err_msg = str(e).replace('"', "'")
139
- # Need to emit an empty step on failure? Usually not, just end.
140
- break
141
-
142
- command = extract_command(llm_text)
143
- if not command:
144
- command = "ls -la /app"
145
-
146
- error_msg = "null"
147
- try:
148
- result = env.step(BashAction(command=command))
149
  obs = result.observation
150
- except Exception as e:
151
- obs = env.state # Mock failed obs
152
- error_msg = str(e).replace('\n', ' ')
153
-
154
- steps_taken += 1
155
- reward_val = obs.reward if hasattr(obs, 'reward') else getattr(obs, 'grader_score', 0.0)
156
- rewards.append(f"{reward_val:.2f}")
157
- is_done = result.done if hasattr(result, 'done') else getattr(obs, 'done', False)
158
- done_str = "true" if is_done else "false"
159
-
160
- action_str = command.replace('\n', ' ; ')
161
- print(f"[STEP] step={steps_taken} action={action_str} reward={reward_val:.2f} done={done_str} error={error_msg}", flush=True)
162
-
163
- messages.append({"role": "assistant", "content": llm_text})
164
- messages.append({
165
- "role": "user",
166
- "content": (
167
- f"Command output:\n"
168
- f"stdout:\n```\n{getattr(obs, 'stdout', '')}\n```\n"
169
- f"stderr:\n```\n{getattr(obs, 'stderr', '')}\n```\n"
170
- f"Current score: {getattr(obs, 'grader_score', 0.0)}/1.0\n"
171
- f"Grader feedback: {getattr(obs, 'grader_feedback', '')}\n\n"
172
- f"What command should I run next?"
173
- ),
174
- })
175
-
176
- final_score = getattr(obs, 'grader_score', 0.0)
177
- if getattr(obs, 'grader_score', 0.0) >= 1.0 or getattr(obs, 'done', False) or result.done:
178
- break
179
-
180
- success_str = "true" if final_score >= 1.0 else "false"
181
- rewards_str = ",".join(rewards) if rewards else "0.00"
182
- print(f"[END] success={success_str} steps={steps_taken} score={final_score:.2f} rewards={rewards_str}", flush=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
  if __name__ == "__main__":
185
  main()
 
100
 
101
  client = OpenAI(api_key=HF_TOKEN or "dummy_key", base_url=API_BASE_URL)
102
 
103
+ TASKS = ["easy", "medium", "hard"]
104
+
105
  # Note: openenv evaluation specifically needs exactly 3 things: [START], [STEP] logs, [END]
106
+ for task_name in TASKS:
107
+ messages = [{"role": "system", "content": SYSTEM_PROMPT}]
108
+
109
+ try:
110
+ with DevopsSandboxEnv(base_url=ENV_URL).sync() as env:
111
+ result = env.reset(task_name=task_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  obs = result.observation
113
+
114
+ print(f"[START] task={task_name} env={BENCHMARK} model={MODEL_NAME}", flush=True)
115
+
116
+ messages.append({
117
+ "role": "user",
118
+ "content": (
119
+ f"Here is the initial state of the broken app:\n\n"
120
+ f"```\n{obs.stdout}\n```\n\n"
121
+ f"Current directory: {obs.current_dir}\n"
122
+ f"Score: {obs.grader_score}/1.0\n\n"
123
+ f"What bash command should I run first?"
124
+ ),
125
+ })
126
+
127
+ rewards = []
128
+ is_done = False
129
+ steps_taken = 0
130
+ final_score = 0.0
131
+
132
+ for turn in range(1, MAX_TURNS + 1):
133
+ try:
134
+ response = client.chat.completions.create(
135
+ model=MODEL_NAME,
136
+ messages=messages,
137
+ temperature=0.2,
138
+ max_tokens=256,
139
+ )
140
+ llm_text = response.choices[0].message.content or ""
141
+ except Exception as e:
142
+ err_msg = str(e).replace('"', "'")
143
+ break
144
+
145
+ command = extract_command(llm_text)
146
+ if not command:
147
+ command = "ls -la /app"
148
+
149
+ error_msg = "null"
150
+ try:
151
+ result = env.step(BashAction(command=command))
152
+ obs = result.observation
153
+ except Exception as e:
154
+ obs = env.state # Mock failed obs
155
+ error_msg = str(e).replace('\n', ' ')
156
+
157
+ steps_taken += 1
158
+ reward_val = obs.reward if hasattr(obs, 'reward') else getattr(obs, 'grader_score', 0.0)
159
+ rewards.append(f"{reward_val:.2f}")
160
+ is_done = result.done if hasattr(result, 'done') else getattr(obs, 'done', False)
161
+ done_str = "true" if is_done else "false"
162
+
163
+ action_str = command.replace('\n', ' ; ')
164
+ print(f"[STEP] step={steps_taken} action={action_str} reward={reward_val:.2f} done={done_str} error={error_msg}", flush=True)
165
+
166
+ messages.append({"role": "assistant", "content": llm_text})
167
+ messages.append({
168
+ "role": "user",
169
+ "content": (
170
+ f"Command output:\n"
171
+ f"stdout:\n```\n{getattr(obs, 'stdout', '')}\n```\n"
172
+ f"stderr:\n```\n{getattr(obs, 'stderr', '')}\n```\n"
173
+ f"Current score: {getattr(obs, 'grader_score', 0.0)}/1.0\n"
174
+ f"Grader feedback: {getattr(obs, 'grader_feedback', '')}\n\n"
175
+ f"What command should I run next?"
176
+ ),
177
+ })
178
+
179
+ final_score = getattr(obs, 'grader_score', 0.0)
180
+ if getattr(obs, 'grader_score', 0.0) >= 1.0 or getattr(obs, 'done', False) or (hasattr(result, 'done') and result.done):
181
+ break
182
+
183
+ success_str = "true" if final_score >= 1.0 else "false"
184
+ rewards_str = ",".join(rewards) if rewards else "0.00"
185
+ print(f"[END] success={success_str} steps={steps_taken} score={final_score:.2f} rewards={rewards_str}", flush=True)
186
+ except Exception as e:
187
+ # Make sure to emit END log even on catastrophic wrapper failures so Hackathon doesn't crash inference.py
188
+ print(f"[END] success=false steps=0 score=0.00 rewards=0.00", flush=True)
189
 
190
  if __name__ == "__main__":
191
  main()
server/devops_sandbox_environment.py CHANGED
@@ -51,6 +51,7 @@ class DevOpsSandbox(Environment):
51
  self._state = State(episode_id=str(uuid4()), step_count=0)
52
  self._current_dir: str = "/app"
53
  self._last_score: float = 0.0
 
54
 
55
  # When running on Windows locally, `/app` and `/app_backup` don't exist naturally,
56
  # so we will use absolute paths mapped to our repo if they aren't at root.
@@ -81,6 +82,7 @@ class DevOpsSandbox(Environment):
81
  self._state = State(episode_id=eid, step_count=0)
82
  self._last_score = 0.0
83
  self._current_dir = self._app_dir
 
84
 
85
  self._reset_filesystem()
86
  self._inject_grader_script()
@@ -88,7 +90,38 @@ class DevOpsSandbox(Environment):
88
  # Gather initial observation
89
  init_stdout = self._exec_cmd(f"ls -la {self._app_dir} && echo '---' && cat {os.path.join(self._app_dir, 'config.json')}")
90
 
91
- task_prompt = (
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  "=== SELF-HEALING DEVOPS SANDBOX ===\n"
93
  f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
94
  "YOUR MISSION: Diagnose and fix ALL bugs so that:\n"
@@ -110,7 +143,7 @@ class DevOpsSandbox(Environment):
110
  stdout=task_prompt,
111
  stderr="",
112
  current_dir=self._current_dir,
113
- task_id="devops_sandbox",
114
  grader_score=0.0,
115
  grader_feedback="Episode started. Fix the bugs!",
116
  done=False,
@@ -132,11 +165,11 @@ class DevOpsSandbox(Environment):
132
  stdout="",
133
  stderr="Empty command. Please provide a bash command.",
134
  current_dir=self._current_dir,
135
- task_id="devops_sandbox",
136
  grader_score=self._last_score,
137
  grader_feedback="No command executed.",
138
  done=False,
139
- reward=self._last_score,
140
  )
141
 
142
  # Handle 'cd' commands manually since subprocess run is transient
@@ -159,6 +192,7 @@ class DevOpsSandbox(Environment):
159
 
160
  # Run the grader anyway, even if just a cd
161
  score, feedback = self._grade()
 
162
  self._last_score = score
163
  episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
164
 
@@ -166,11 +200,11 @@ class DevOpsSandbox(Environment):
166
  stdout=stdout,
167
  stderr=stderr,
168
  current_dir=self._current_dir,
169
- task_id="devops_sandbox",
170
  grader_score=score,
171
  grader_feedback=feedback,
172
  done=episode_done,
173
- reward=score,
174
  )
175
 
176
  # Execute normal command
@@ -181,6 +215,7 @@ class DevOpsSandbox(Environment):
181
  stdout, stderr = "", f"Command execution error: {e}"
182
 
183
  score, feedback = self._grade()
 
184
  self._last_score = score
185
  episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
186
 
@@ -188,11 +223,11 @@ class DevOpsSandbox(Environment):
188
  stdout=stdout,
189
  stderr=stderr,
190
  current_dir=self._current_dir,
191
- task_id="devops_sandbox",
192
  grader_score=score,
193
  grader_feedback=feedback,
194
  done=episode_done,
195
- reward=score,
196
  )
197
 
198
  @property
@@ -390,5 +425,15 @@ class DevOpsSandbox(Environment):
390
  logger.exception("Grader error")
391
  feedback_parts.append(f"Grader error (score preserved): {exc}")
392
 
393
- score = round(min(max(score, 0.0), 1.0), 2)
394
- return (score, " | ".join(feedback_parts))
 
 
 
 
 
 
 
 
 
 
 
51
  self._state = State(episode_id=str(uuid4()), step_count=0)
52
  self._current_dir: str = "/app"
53
  self._last_score: float = 0.0
54
+ self._current_task: str = "hard"
55
 
56
  # When running on Windows locally, `/app` and `/app_backup` don't exist naturally,
57
  # so we will use absolute paths mapped to our repo if they aren't at root.
 
82
  self._state = State(episode_id=eid, step_count=0)
83
  self._last_score = 0.0
84
  self._current_dir = self._app_dir
85
+ self._current_task = kwargs.get("task_name", "hard")
86
 
87
  self._reset_filesystem()
88
  self._inject_grader_script()
 
90
  # Gather initial observation
91
  init_stdout = self._exec_cmd(f"ls -la {self._app_dir} && echo '---' && cat {os.path.join(self._app_dir, 'config.json')}")
92
 
93
+ if self._current_task == "easy":
94
+ task_prompt = (
95
+ "=== SELF-HEALING DEVOPS SANDBOX ===\n"
96
+ f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
97
+ "YOUR MISSION [EASY]: Diagnose and fix the port bug so that:\n"
98
+ " 1. The app starts without errors on port 3000\n"
99
+ " 2. GET /health returns HTTP 200\n\n"
100
+ "HINTS:\n"
101
+ " - Check config.json for wrong settings\n\n"
102
+ "Use bash commands to explore, edit files, and test.\n"
103
+ "When you think you've fixed everything, run: npm start\n\n"
104
+ "--- INITIAL DIRECTORY LISTING ---\n"
105
+ f"{init_stdout}\n"
106
+ )
107
+ elif self._current_task == "medium":
108
+ task_prompt = (
109
+ "=== SELF-HEALING DEVOPS SANDBOX ===\n"
110
+ f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
111
+ "YOUR MISSION [MEDIUM]: Diagnose and fix TWO bugs so that:\n"
112
+ " 1. The app starts without errors on port 3000\n"
113
+ " 2. GET /health returns HTTP 200\n"
114
+ " 3. GET /api/users returns HTTP 200 with valid JSON\n\n"
115
+ "HINTS:\n"
116
+ " - Check config.json for wrong settings\n"
117
+ " - Look for syntax errors in routes/users.js\n\n"
118
+ "Use bash commands to explore, edit files, and test.\n"
119
+ "When you think you've fixed everything, run: npm start\n\n"
120
+ "--- INITIAL DIRECTORY LISTING ---\n"
121
+ f"{init_stdout}\n"
122
+ )
123
+ else:
124
+ task_prompt = (
125
  "=== SELF-HEALING DEVOPS SANDBOX ===\n"
126
  f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
127
  "YOUR MISSION: Diagnose and fix ALL bugs so that:\n"
 
143
  stdout=task_prompt,
144
  stderr="",
145
  current_dir=self._current_dir,
146
+ task_id=self._current_task,
147
  grader_score=0.0,
148
  grader_feedback="Episode started. Fix the bugs!",
149
  done=False,
 
165
  stdout="",
166
  stderr="Empty command. Please provide a bash command.",
167
  current_dir=self._current_dir,
168
+ task_id=self._current_task,
169
  grader_score=self._last_score,
170
  grader_feedback="No command executed.",
171
  done=False,
172
+ reward=0.0,
173
  )
174
 
175
  # Handle 'cd' commands manually since subprocess run is transient
 
192
 
193
  # Run the grader anyway, even if just a cd
194
  score, feedback = self._grade()
195
+ reward = max(0.0, score - self._last_score)
196
  self._last_score = score
197
  episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
198
 
 
200
  stdout=stdout,
201
  stderr=stderr,
202
  current_dir=self._current_dir,
203
+ task_id=self._current_task,
204
  grader_score=score,
205
  grader_feedback=feedback,
206
  done=episode_done,
207
+ reward=reward,
208
  )
209
 
210
  # Execute normal command
 
215
  stdout, stderr = "", f"Command execution error: {e}"
216
 
217
  score, feedback = self._grade()
218
+ reward = max(0.0, score - self._last_score)
219
  self._last_score = score
220
  episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
221
 
 
223
  stdout=stdout,
224
  stderr=stderr,
225
  current_dir=self._current_dir,
226
+ task_id=self._current_task,
227
  grader_score=score,
228
  grader_feedback=feedback,
229
  done=episode_done,
230
+ reward=reward,
231
  )
232
 
233
  @property
 
425
  logger.exception("Grader error")
426
  feedback_parts.append(f"Grader error (score preserved): {exc}")
427
 
428
+ # Scale score based on task difficulty
429
+ if self._current_task == "easy":
430
+ raw_target = 0.45
431
+ elif self._current_task == "medium":
432
+ raw_target = 0.60
433
+ else:
434
+ raw_target = 1.0
435
+
436
+ final_score = min(1.0, score / raw_target)
437
+ final_score = round(min(max(final_score, 0.0), 1.0), 2)
438
+
439
+ return (final_score, " | ".join(feedback_parts))