Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files
- inference.py +84 -78
- server/devops_sandbox_environment.py +55 -10
inference.py
CHANGED
|
@@ -100,86 +100,92 @@ def main():
|
|
| 100 |
|
| 101 |
client = OpenAI(api_key=HF_TOKEN or "dummy_key", base_url=API_BASE_URL)
|
| 102 |
|
|
|
|
|
|
|
| 103 |
# Note: openenv evaluation specifically needs exactly 3 things: [START], [STEP] logs, [END]
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
print(f"[START] task={TASK_NAME} env={BENCHMARK} model={MODEL_NAME}", flush=True)
|
| 111 |
-
|
| 112 |
-
messages.append({
|
| 113 |
-
"role": "user",
|
| 114 |
-
"content": (
|
| 115 |
-
f"Here is the initial state of the broken app:\n\n"
|
| 116 |
-
f"```\n{obs.stdout}\n```\n\n"
|
| 117 |
-
f"Current directory: {obs.current_dir}\n"
|
| 118 |
-
f"Score: {obs.grader_score}/1.0\n\n"
|
| 119 |
-
f"What bash command should I run first?"
|
| 120 |
-
),
|
| 121 |
-
})
|
| 122 |
-
|
| 123 |
-
rewards = []
|
| 124 |
-
is_done = False
|
| 125 |
-
steps_taken = 0
|
| 126 |
-
final_score = 0.0
|
| 127 |
-
|
| 128 |
-
for turn in range(1, MAX_TURNS + 1):
|
| 129 |
-
try:
|
| 130 |
-
response = client.chat.completions.create(
|
| 131 |
-
model=MODEL_NAME,
|
| 132 |
-
messages=messages,
|
| 133 |
-
temperature=0.2,
|
| 134 |
-
max_tokens=256,
|
| 135 |
-
)
|
| 136 |
-
llm_text = response.choices[0].message.content or ""
|
| 137 |
-
except Exception as e:
|
| 138 |
-
err_msg = str(e).replace('"', "'")
|
| 139 |
-
# Need to emit an empty step on failure? Usually not, just end.
|
| 140 |
-
break
|
| 141 |
-
|
| 142 |
-
command = extract_command(llm_text)
|
| 143 |
-
if not command:
|
| 144 |
-
command = "ls -la /app"
|
| 145 |
-
|
| 146 |
-
error_msg = "null"
|
| 147 |
-
try:
|
| 148 |
-
result = env.step(BashAction(command=command))
|
| 149 |
obs = result.observation
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
if __name__ == "__main__":
|
| 185 |
main()
|
|
|
|
| 100 |
|
| 101 |
client = OpenAI(api_key=HF_TOKEN or "dummy_key", base_url=API_BASE_URL)
|
| 102 |
|
| 103 |
+
TASKS = ["easy", "medium", "hard"]
|
| 104 |
+
|
| 105 |
# Note: openenv evaluation specifically needs exactly 3 things: [START], [STEP] logs, [END]
|
| 106 |
+
for task_name in TASKS:
|
| 107 |
+
messages = [{"role": "system", "content": SYSTEM_PROMPT}]
|
| 108 |
+
|
| 109 |
+
try:
|
| 110 |
+
with DevopsSandboxEnv(base_url=ENV_URL).sync() as env:
|
| 111 |
+
result = env.reset(task_name=task_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
obs = result.observation
|
| 113 |
+
|
| 114 |
+
print(f"[START] task={task_name} env={BENCHMARK} model={MODEL_NAME}", flush=True)
|
| 115 |
+
|
| 116 |
+
messages.append({
|
| 117 |
+
"role": "user",
|
| 118 |
+
"content": (
|
| 119 |
+
f"Here is the initial state of the broken app:\n\n"
|
| 120 |
+
f"```\n{obs.stdout}\n```\n\n"
|
| 121 |
+
f"Current directory: {obs.current_dir}\n"
|
| 122 |
+
f"Score: {obs.grader_score}/1.0\n\n"
|
| 123 |
+
f"What bash command should I run first?"
|
| 124 |
+
),
|
| 125 |
+
})
|
| 126 |
+
|
| 127 |
+
rewards = []
|
| 128 |
+
is_done = False
|
| 129 |
+
steps_taken = 0
|
| 130 |
+
final_score = 0.0
|
| 131 |
+
|
| 132 |
+
for turn in range(1, MAX_TURNS + 1):
|
| 133 |
+
try:
|
| 134 |
+
response = client.chat.completions.create(
|
| 135 |
+
model=MODEL_NAME,
|
| 136 |
+
messages=messages,
|
| 137 |
+
temperature=0.2,
|
| 138 |
+
max_tokens=256,
|
| 139 |
+
)
|
| 140 |
+
llm_text = response.choices[0].message.content or ""
|
| 141 |
+
except Exception as e:
|
| 142 |
+
err_msg = str(e).replace('"', "'")
|
| 143 |
+
break
|
| 144 |
+
|
| 145 |
+
command = extract_command(llm_text)
|
| 146 |
+
if not command:
|
| 147 |
+
command = "ls -la /app"
|
| 148 |
+
|
| 149 |
+
error_msg = "null"
|
| 150 |
+
try:
|
| 151 |
+
result = env.step(BashAction(command=command))
|
| 152 |
+
obs = result.observation
|
| 153 |
+
except Exception as e:
|
| 154 |
+
obs = env.state # Mock failed obs
|
| 155 |
+
error_msg = str(e).replace('\n', ' ')
|
| 156 |
+
|
| 157 |
+
steps_taken += 1
|
| 158 |
+
reward_val = obs.reward if hasattr(obs, 'reward') else getattr(obs, 'grader_score', 0.0)
|
| 159 |
+
rewards.append(f"{reward_val:.2f}")
|
| 160 |
+
is_done = result.done if hasattr(result, 'done') else getattr(obs, 'done', False)
|
| 161 |
+
done_str = "true" if is_done else "false"
|
| 162 |
+
|
| 163 |
+
action_str = command.replace('\n', ' ; ')
|
| 164 |
+
print(f"[STEP] step={steps_taken} action={action_str} reward={reward_val:.2f} done={done_str} error={error_msg}", flush=True)
|
| 165 |
+
|
| 166 |
+
messages.append({"role": "assistant", "content": llm_text})
|
| 167 |
+
messages.append({
|
| 168 |
+
"role": "user",
|
| 169 |
+
"content": (
|
| 170 |
+
f"Command output:\n"
|
| 171 |
+
f"stdout:\n```\n{getattr(obs, 'stdout', '')}\n```\n"
|
| 172 |
+
f"stderr:\n```\n{getattr(obs, 'stderr', '')}\n```\n"
|
| 173 |
+
f"Current score: {getattr(obs, 'grader_score', 0.0)}/1.0\n"
|
| 174 |
+
f"Grader feedback: {getattr(obs, 'grader_feedback', '')}\n\n"
|
| 175 |
+
f"What command should I run next?"
|
| 176 |
+
),
|
| 177 |
+
})
|
| 178 |
+
|
| 179 |
+
final_score = getattr(obs, 'grader_score', 0.0)
|
| 180 |
+
if getattr(obs, 'grader_score', 0.0) >= 1.0 or getattr(obs, 'done', False) or (hasattr(result, 'done') and result.done):
|
| 181 |
+
break
|
| 182 |
+
|
| 183 |
+
success_str = "true" if final_score >= 1.0 else "false"
|
| 184 |
+
rewards_str = ",".join(rewards) if rewards else "0.00"
|
| 185 |
+
print(f"[END] success={success_str} steps={steps_taken} score={final_score:.2f} rewards={rewards_str}", flush=True)
|
| 186 |
+
except Exception as e:
|
| 187 |
+
# Make sure to emit END log even on catastrophic wrapper failures so Hackathon doesn't crash inference.py
|
| 188 |
+
print(f"[END] success=false steps=0 score=0.00 rewards=0.00", flush=True)
|
| 189 |
|
| 190 |
if __name__ == "__main__":
|
| 191 |
main()
|
server/devops_sandbox_environment.py
CHANGED
|
@@ -51,6 +51,7 @@ class DevOpsSandbox(Environment):
|
|
| 51 |
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 52 |
self._current_dir: str = "/app"
|
| 53 |
self._last_score: float = 0.0
|
|
|
|
| 54 |
|
| 55 |
# When running on Windows locally, `/app` and `/app_backup` don't exist naturally,
|
| 56 |
# so we will use absolute paths mapped to our repo if they aren't at root.
|
|
@@ -81,6 +82,7 @@ class DevOpsSandbox(Environment):
|
|
| 81 |
self._state = State(episode_id=eid, step_count=0)
|
| 82 |
self._last_score = 0.0
|
| 83 |
self._current_dir = self._app_dir
|
|
|
|
| 84 |
|
| 85 |
self._reset_filesystem()
|
| 86 |
self._inject_grader_script()
|
|
@@ -88,7 +90,38 @@ class DevOpsSandbox(Environment):
|
|
| 88 |
# Gather initial observation
|
| 89 |
init_stdout = self._exec_cmd(f"ls -la {self._app_dir} && echo '---' && cat {os.path.join(self._app_dir, 'config.json')}")
|
| 90 |
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
"=== SELF-HEALING DEVOPS SANDBOX ===\n"
|
| 93 |
f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
|
| 94 |
"YOUR MISSION: Diagnose and fix ALL bugs so that:\n"
|
|
@@ -110,7 +143,7 @@ class DevOpsSandbox(Environment):
|
|
| 110 |
stdout=task_prompt,
|
| 111 |
stderr="",
|
| 112 |
current_dir=self._current_dir,
|
| 113 |
-
task_id=
|
| 114 |
grader_score=0.0,
|
| 115 |
grader_feedback="Episode started. Fix the bugs!",
|
| 116 |
done=False,
|
|
@@ -132,11 +165,11 @@ class DevOpsSandbox(Environment):
|
|
| 132 |
stdout="",
|
| 133 |
stderr="Empty command. Please provide a bash command.",
|
| 134 |
current_dir=self._current_dir,
|
| 135 |
-
task_id=
|
| 136 |
grader_score=self._last_score,
|
| 137 |
grader_feedback="No command executed.",
|
| 138 |
done=False,
|
| 139 |
-
reward=
|
| 140 |
)
|
| 141 |
|
| 142 |
# Handle 'cd' commands manually since subprocess run is transient
|
|
@@ -159,6 +192,7 @@ class DevOpsSandbox(Environment):
|
|
| 159 |
|
| 160 |
# Run the grader anyway, even if just a cd
|
| 161 |
score, feedback = self._grade()
|
|
|
|
| 162 |
self._last_score = score
|
| 163 |
episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
|
| 164 |
|
|
@@ -166,11 +200,11 @@ class DevOpsSandbox(Environment):
|
|
| 166 |
stdout=stdout,
|
| 167 |
stderr=stderr,
|
| 168 |
current_dir=self._current_dir,
|
| 169 |
-
task_id=
|
| 170 |
grader_score=score,
|
| 171 |
grader_feedback=feedback,
|
| 172 |
done=episode_done,
|
| 173 |
-
reward=
|
| 174 |
)
|
| 175 |
|
| 176 |
# Execute normal command
|
|
@@ -181,6 +215,7 @@ class DevOpsSandbox(Environment):
|
|
| 181 |
stdout, stderr = "", f"Command execution error: {e}"
|
| 182 |
|
| 183 |
score, feedback = self._grade()
|
|
|
|
| 184 |
self._last_score = score
|
| 185 |
episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
|
| 186 |
|
|
@@ -188,11 +223,11 @@ class DevOpsSandbox(Environment):
|
|
| 188 |
stdout=stdout,
|
| 189 |
stderr=stderr,
|
| 190 |
current_dir=self._current_dir,
|
| 191 |
-
task_id=
|
| 192 |
grader_score=score,
|
| 193 |
grader_feedback=feedback,
|
| 194 |
done=episode_done,
|
| 195 |
-
reward=
|
| 196 |
)
|
| 197 |
|
| 198 |
@property
|
|
@@ -390,5 +425,15 @@ class DevOpsSandbox(Environment):
|
|
| 390 |
logger.exception("Grader error")
|
| 391 |
feedback_parts.append(f"Grader error (score preserved): {exc}")
|
| 392 |
|
| 393 |
-
|
| 394 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 52 |
self._current_dir: str = "/app"
|
| 53 |
self._last_score: float = 0.0
|
| 54 |
+
self._current_task: str = "hard"
|
| 55 |
|
| 56 |
# When running on Windows locally, `/app` and `/app_backup` don't exist naturally,
|
| 57 |
# so we will use absolute paths mapped to our repo if they aren't at root.
|
|
|
|
| 82 |
self._state = State(episode_id=eid, step_count=0)
|
| 83 |
self._last_score = 0.0
|
| 84 |
self._current_dir = self._app_dir
|
| 85 |
+
self._current_task = kwargs.get("task_name", "hard")
|
| 86 |
|
| 87 |
self._reset_filesystem()
|
| 88 |
self._inject_grader_script()
|
|
|
|
| 90 |
# Gather initial observation
|
| 91 |
init_stdout = self._exec_cmd(f"ls -la {self._app_dir} && echo '---' && cat {os.path.join(self._app_dir, 'config.json')}")
|
| 92 |
|
| 93 |
+
if self._current_task == "easy":
|
| 94 |
+
task_prompt = (
|
| 95 |
+
"=== SELF-HEALING DEVOPS SANDBOX ===\n"
|
| 96 |
+
f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
|
| 97 |
+
"YOUR MISSION [EASY]: Diagnose and fix the port bug so that:\n"
|
| 98 |
+
" 1. The app starts without errors on port 3000\n"
|
| 99 |
+
" 2. GET /health returns HTTP 200\n\n"
|
| 100 |
+
"HINTS:\n"
|
| 101 |
+
" - Check config.json for wrong settings\n\n"
|
| 102 |
+
"Use bash commands to explore, edit files, and test.\n"
|
| 103 |
+
"When you think you've fixed everything, run: npm start\n\n"
|
| 104 |
+
"--- INITIAL DIRECTORY LISTING ---\n"
|
| 105 |
+
f"{init_stdout}\n"
|
| 106 |
+
)
|
| 107 |
+
elif self._current_task == "medium":
|
| 108 |
+
task_prompt = (
|
| 109 |
+
"=== SELF-HEALING DEVOPS SANDBOX ===\n"
|
| 110 |
+
f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
|
| 111 |
+
"YOUR MISSION [MEDIUM]: Diagnose and fix TWO bugs so that:\n"
|
| 112 |
+
" 1. The app starts without errors on port 3000\n"
|
| 113 |
+
" 2. GET /health returns HTTP 200\n"
|
| 114 |
+
" 3. GET /api/users returns HTTP 200 with valid JSON\n\n"
|
| 115 |
+
"HINTS:\n"
|
| 116 |
+
" - Check config.json for wrong settings\n"
|
| 117 |
+
" - Look for syntax errors in routes/users.js\n\n"
|
| 118 |
+
"Use bash commands to explore, edit files, and test.\n"
|
| 119 |
+
"When you think you've fixed everything, run: npm start\n\n"
|
| 120 |
+
"--- INITIAL DIRECTORY LISTING ---\n"
|
| 121 |
+
f"{init_stdout}\n"
|
| 122 |
+
)
|
| 123 |
+
else:
|
| 124 |
+
task_prompt = (
|
| 125 |
"=== SELF-HEALING DEVOPS SANDBOX ===\n"
|
| 126 |
f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
|
| 127 |
"YOUR MISSION: Diagnose and fix ALL bugs so that:\n"
|
|
|
|
| 143 |
stdout=task_prompt,
|
| 144 |
stderr="",
|
| 145 |
current_dir=self._current_dir,
|
| 146 |
+
task_id=self._current_task,
|
| 147 |
grader_score=0.0,
|
| 148 |
grader_feedback="Episode started. Fix the bugs!",
|
| 149 |
done=False,
|
|
|
|
| 165 |
stdout="",
|
| 166 |
stderr="Empty command. Please provide a bash command.",
|
| 167 |
current_dir=self._current_dir,
|
| 168 |
+
task_id=self._current_task,
|
| 169 |
grader_score=self._last_score,
|
| 170 |
grader_feedback="No command executed.",
|
| 171 |
done=False,
|
| 172 |
+
reward=0.0,
|
| 173 |
)
|
| 174 |
|
| 175 |
# Handle 'cd' commands manually since subprocess run is transient
|
|
|
|
| 192 |
|
| 193 |
# Run the grader anyway, even if just a cd
|
| 194 |
score, feedback = self._grade()
|
| 195 |
+
reward = max(0.0, score - self._last_score)
|
| 196 |
self._last_score = score
|
| 197 |
episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
|
| 198 |
|
|
|
|
| 200 |
stdout=stdout,
|
| 201 |
stderr=stderr,
|
| 202 |
current_dir=self._current_dir,
|
| 203 |
+
task_id=self._current_task,
|
| 204 |
grader_score=score,
|
| 205 |
grader_feedback=feedback,
|
| 206 |
done=episode_done,
|
| 207 |
+
reward=reward,
|
| 208 |
)
|
| 209 |
|
| 210 |
# Execute normal command
|
|
|
|
| 215 |
stdout, stderr = "", f"Command execution error: {e}"
|
| 216 |
|
| 217 |
score, feedback = self._grade()
|
| 218 |
+
reward = max(0.0, score - self._last_score)
|
| 219 |
self._last_score = score
|
| 220 |
episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
|
| 221 |
|
|
|
|
| 223 |
stdout=stdout,
|
| 224 |
stderr=stderr,
|
| 225 |
current_dir=self._current_dir,
|
| 226 |
+
task_id=self._current_task,
|
| 227 |
grader_score=score,
|
| 228 |
grader_feedback=feedback,
|
| 229 |
done=episode_done,
|
| 230 |
+
reward=reward,
|
| 231 |
)
|
| 232 |
|
| 233 |
@property
|
|
|
|
| 425 |
logger.exception("Grader error")
|
| 426 |
feedback_parts.append(f"Grader error (score preserved): {exc}")
|
| 427 |
|
| 428 |
+
# Scale score based on task difficulty
|
| 429 |
+
if self._current_task == "easy":
|
| 430 |
+
raw_target = 0.45
|
| 431 |
+
elif self._current_task == "medium":
|
| 432 |
+
raw_target = 0.60
|
| 433 |
+
else:
|
| 434 |
+
raw_target = 1.0
|
| 435 |
+
|
| 436 |
+
final_score = min(1.0, score / raw_target)
|
| 437 |
+
final_score = round(min(max(final_score, 0.0), 1.0), 2)
|
| 438 |
+
|
| 439 |
+
return (final_score, " | ".join(feedback_parts))
|