Spaces:

DEVessi
/

devops_sandbox

Running

App Files Community

DEVessi committed 3 days ago

Commit

516d2c6

verified ·

1 Parent(s): 32dd99b

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

inference.py +84 -78
server/devops_sandbox_environment.py +55 -10

inference.py CHANGED Viewed

@@ -100,86 +100,92 @@ def main():
     client = OpenAI(api_key=HF_TOKEN or "dummy_key", base_url=API_BASE_URL)
     # Note: openenv evaluation specifically needs exactly 3 things: [START], [STEP] logs, [END]
-    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
-    with DevopsSandboxEnv(base_url=ENV_URL).sync() as env:
-        result = env.reset()
-        obs = result.observation
-        print(f"[START] task={TASK_NAME} env={BENCHMARK} model={MODEL_NAME}", flush=True)
-        messages.append({
-            "role": "user",
-            "content": (
-                f"Here is the initial state of the broken app:\n\n"
-                f"```\n{obs.stdout}\n```\n\n"
-                f"Current directory: {obs.current_dir}\n"
-                f"Score: {obs.grader_score}/1.0\n\n"
-                f"What bash command should I run first?"
-            ),
-        })
-        rewards = []
-        is_done = False
-        steps_taken = 0
-        final_score = 0.0
-        for turn in range(1, MAX_TURNS + 1):
-            try:
-                response = client.chat.completions.create(
-                    model=MODEL_NAME,
-                    messages=messages,
-                    temperature=0.2,
-                    max_tokens=256,
-                )
-                llm_text = response.choices[0].message.content or ""
-            except Exception as e:
-                err_msg = str(e).replace('"', "'")
-                # Need to emit an empty step on failure? Usually not, just end.
-                break
-            command = extract_command(llm_text)
-            if not command:
-                command = "ls -la /app"
-            error_msg = "null"
-            try:
-                result = env.step(BashAction(command=command))
                 obs = result.observation
-            except Exception as e:
-                obs = env.state  # Mock failed obs
-                error_msg = str(e).replace('\n', ' ')
-            steps_taken += 1
-            reward_val = obs.reward if hasattr(obs, 'reward') else getattr(obs, 'grader_score', 0.0)
-            rewards.append(f"{reward_val:.2f}")
-            is_done = result.done if hasattr(result, 'done') else getattr(obs, 'done', False)
-            done_str = "true" if is_done else "false"
-            action_str = command.replace('\n', ' ; ')
-            print(f"[STEP] step={steps_taken} action={action_str} reward={reward_val:.2f} done={done_str} error={error_msg}", flush=True)
-            messages.append({"role": "assistant", "content": llm_text})
-            messages.append({
-                "role": "user",
-                "content": (
-                    f"Command output:\n"
-                    f"stdout:\n```\n{getattr(obs, 'stdout', '')}\n```\n"
-                    f"stderr:\n```\n{getattr(obs, 'stderr', '')}\n```\n"
-                    f"Current score: {getattr(obs, 'grader_score', 0.0)}/1.0\n"
-                    f"Grader feedback: {getattr(obs, 'grader_feedback', '')}\n\n"
-                    f"What command should I run next?"
-                ),
-            })
-            final_score = getattr(obs, 'grader_score', 0.0)
-            if getattr(obs, 'grader_score', 0.0) >= 1.0 or getattr(obs, 'done', False) or result.done:
-                break
-        success_str = "true" if final_score >= 1.0 else "false"
-        rewards_str = ",".join(rewards) if rewards else "0.00"
-        print(f"[END] success={success_str} steps={steps_taken} score={final_score:.2f} rewards={rewards_str}", flush=True)
 if __name__ == "__main__":
     main()

     client = OpenAI(api_key=HF_TOKEN or "dummy_key", base_url=API_BASE_URL)
+    TASKS = ["easy", "medium", "hard"]
     # Note: openenv evaluation specifically needs exactly 3 things: [START], [STEP] logs, [END]
+    for task_name in TASKS:
+        messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+        try:
+            with DevopsSandboxEnv(base_url=ENV_URL).sync() as env:
+                result = env.reset(task_name=task_name)
                 obs = result.observation
+                print(f"[START] task={task_name} env={BENCHMARK} model={MODEL_NAME}", flush=True)
+                messages.append({
+                    "role": "user",
+                    "content": (
+                        f"Here is the initial state of the broken app:\n\n"
+                        f"```\n{obs.stdout}\n```\n\n"
+                        f"Current directory: {obs.current_dir}\n"
+                        f"Score: {obs.grader_score}/1.0\n\n"
+                        f"What bash command should I run first?"
+                    ),
+                })
+                rewards = []
+                is_done = False
+                steps_taken = 0
+                final_score = 0.0
+                for turn in range(1, MAX_TURNS + 1):
+                    try:
+                        response = client.chat.completions.create(
+                            model=MODEL_NAME,
+                            messages=messages,
+                            temperature=0.2,
+                            max_tokens=256,
+                        )
+                        llm_text = response.choices[0].message.content or ""
+                    except Exception as e:
+                        err_msg = str(e).replace('"', "'")
+                        break
+                    command = extract_command(llm_text)
+                    if not command:
+                        command = "ls -la /app"
+                    error_msg = "null"
+                    try:
+                        result = env.step(BashAction(command=command))
+                        obs = result.observation
+                    except Exception as e:
+                        obs = env.state  # Mock failed obs
+                        error_msg = str(e).replace('\n', ' ')
+                    steps_taken += 1
+                    reward_val = obs.reward if hasattr(obs, 'reward') else getattr(obs, 'grader_score', 0.0)
+                    rewards.append(f"{reward_val:.2f}")
+                    is_done = result.done if hasattr(result, 'done') else getattr(obs, 'done', False)
+                    done_str = "true" if is_done else "false"
+                    action_str = command.replace('\n', ' ; ')
+                    print(f"[STEP] step={steps_taken} action={action_str} reward={reward_val:.2f} done={done_str} error={error_msg}", flush=True)
+                    messages.append({"role": "assistant", "content": llm_text})
+                    messages.append({
+                        "role": "user",
+                        "content": (
+                            f"Command output:\n"
+                            f"stdout:\n```\n{getattr(obs, 'stdout', '')}\n```\n"
+                            f"stderr:\n```\n{getattr(obs, 'stderr', '')}\n```\n"
+                            f"Current score: {getattr(obs, 'grader_score', 0.0)}/1.0\n"
+                            f"Grader feedback: {getattr(obs, 'grader_feedback', '')}\n\n"
+                            f"What command should I run next?"
+                        ),
+                    })
+                    final_score = getattr(obs, 'grader_score', 0.0)
+                    if getattr(obs, 'grader_score', 0.0) >= 1.0 or getattr(obs, 'done', False) or (hasattr(result, 'done') and result.done):
+                        break
+                success_str = "true" if final_score >= 1.0 else "false"
+                rewards_str = ",".join(rewards) if rewards else "0.00"
+                print(f"[END] success={success_str} steps={steps_taken} score={final_score:.2f} rewards={rewards_str}", flush=True)
+        except Exception as e:
+             # Make sure to emit END log even on catastrophic wrapper failures so Hackathon doesn't crash inference.py
+             print(f"[END] success=false steps=0 score=0.00 rewards=0.00", flush=True)
 if __name__ == "__main__":
     main()

server/devops_sandbox_environment.py CHANGED Viewed

@@ -51,6 +51,7 @@ class DevOpsSandbox(Environment):
         self._state = State(episode_id=str(uuid4()), step_count=0)
         self._current_dir: str = "/app"
         self._last_score: float = 0.0
         # When running on Windows locally, `/app` and `/app_backup` don't exist naturally,
         # so we will use absolute paths mapped to our repo if they aren't at root.
@@ -81,6 +82,7 @@ class DevOpsSandbox(Environment):
         self._state = State(episode_id=eid, step_count=0)
         self._last_score = 0.0
         self._current_dir = self._app_dir
         self._reset_filesystem()
         self._inject_grader_script()
@@ -88,7 +90,38 @@ class DevOpsSandbox(Environment):
         # Gather initial observation
         init_stdout = self._exec_cmd(f"ls -la {self._app_dir} && echo '---' && cat {os.path.join(self._app_dir, 'config.json')}")
-        task_prompt = (
             "=== SELF-HEALING DEVOPS SANDBOX ===\n"
             f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
             "YOUR MISSION: Diagnose and fix ALL bugs so that:\n"
@@ -110,7 +143,7 @@ class DevOpsSandbox(Environment):
             stdout=task_prompt,
             stderr="",
             current_dir=self._current_dir,
-            task_id="devops_sandbox",
             grader_score=0.0,
             grader_feedback="Episode started. Fix the bugs!",
             done=False,
@@ -132,11 +165,11 @@ class DevOpsSandbox(Environment):
                 stdout="",
                 stderr="Empty command. Please provide a bash command.",
                 current_dir=self._current_dir,
-                task_id="devops_sandbox",
                 grader_score=self._last_score,
                 grader_feedback="No command executed.",
                 done=False,
-                reward=self._last_score,
             )
         # Handle 'cd' commands manually since subprocess run is transient
@@ -159,6 +192,7 @@ class DevOpsSandbox(Environment):
             # Run the grader anyway, even if just a cd
             score, feedback = self._grade()
             self._last_score = score
             episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
@@ -166,11 +200,11 @@ class DevOpsSandbox(Environment):
                 stdout=stdout,
                 stderr=stderr,
                 current_dir=self._current_dir,
-                task_id="devops_sandbox",
                 grader_score=score,
                 grader_feedback=feedback,
                 done=episode_done,
-                reward=score,
             )
         # Execute normal command
@@ -181,6 +215,7 @@ class DevOpsSandbox(Environment):
             stdout, stderr = "", f"Command execution error: {e}"
         score, feedback = self._grade()
         self._last_score = score
         episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
@@ -188,11 +223,11 @@ class DevOpsSandbox(Environment):
             stdout=stdout,
             stderr=stderr,
             current_dir=self._current_dir,
-            task_id="devops_sandbox",
             grader_score=score,
             grader_feedback=feedback,
             done=episode_done,
-            reward=score,
         )
     @property
@@ -390,5 +425,15 @@ class DevOpsSandbox(Environment):
             logger.exception("Grader error")
             feedback_parts.append(f"Grader error (score preserved): {exc}")
-        score = round(min(max(score, 0.0), 1.0), 2)
-        return (score, " | ".join(feedback_parts))

         self._state = State(episode_id=str(uuid4()), step_count=0)
         self._current_dir: str = "/app"
         self._last_score: float = 0.0
+        self._current_task: str = "hard"
         # When running on Windows locally, `/app` and `/app_backup` don't exist naturally,
         # so we will use absolute paths mapped to our repo if they aren't at root.
         self._state = State(episode_id=eid, step_count=0)
         self._last_score = 0.0
         self._current_dir = self._app_dir
+        self._current_task = kwargs.get("task_name", "hard")
         self._reset_filesystem()
         self._inject_grader_script()
         # Gather initial observation
         init_stdout = self._exec_cmd(f"ls -la {self._app_dir} && echo '---' && cat {os.path.join(self._app_dir, 'config.json')}")
+        if self._current_task == "easy":
+            task_prompt = (
+                "=== SELF-HEALING DEVOPS SANDBOX ===\n"
+                f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
+                "YOUR MISSION [EASY]: Diagnose and fix the port bug so that:\n"
+                "  1. The app starts without errors on port 3000\n"
+                "  2. GET /health returns HTTP 200\n\n"
+                "HINTS:\n"
+                "  - Check config.json for wrong settings\n\n"
+                "Use bash commands to explore, edit files, and test.\n"
+                "When you think you've fixed everything, run: npm start\n\n"
+                "--- INITIAL DIRECTORY LISTING ---\n"
+                f"{init_stdout}\n"
+            )
+        elif self._current_task == "medium":
+            task_prompt = (
+                "=== SELF-HEALING DEVOPS SANDBOX ===\n"
+                f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
+                "YOUR MISSION [MEDIUM]: Diagnose and fix TWO bugs so that:\n"
+                "  1. The app starts without errors on port 3000\n"
+                "  2. GET /health returns HTTP 200\n"
+                "  3. GET /api/users returns HTTP 200 with valid JSON\n\n"
+                "HINTS:\n"
+                "  - Check config.json for wrong settings\n"
+                "  - Look for syntax errors in routes/users.js\n\n"
+                "Use bash commands to explore, edit files, and test.\n"
+                "When you think you've fixed everything, run: npm start\n\n"
+                "--- INITIAL DIRECTORY LISTING ---\n"
+                f"{init_stdout}\n"
+            )
+        else:
+            task_prompt = (
             "=== SELF-HEALING DEVOPS SANDBOX ===\n"
             f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
             "YOUR MISSION: Diagnose and fix ALL bugs so that:\n"
             stdout=task_prompt,
             stderr="",
             current_dir=self._current_dir,
+            task_id=self._current_task,
             grader_score=0.0,
             grader_feedback="Episode started. Fix the bugs!",
             done=False,
                 stdout="",
                 stderr="Empty command. Please provide a bash command.",
                 current_dir=self._current_dir,
+                task_id=self._current_task,
                 grader_score=self._last_score,
                 grader_feedback="No command executed.",
                 done=False,
+                reward=0.0,
             )
         # Handle 'cd' commands manually since subprocess run is transient
             # Run the grader anyway, even if just a cd
             score, feedback = self._grade()
+            reward = max(0.0, score - self._last_score)
             self._last_score = score
             episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
                 stdout=stdout,
                 stderr=stderr,
                 current_dir=self._current_dir,
+                task_id=self._current_task,
                 grader_score=score,
                 grader_feedback=feedback,
                 done=episode_done,
+                reward=reward,
             )
         # Execute normal command
             stdout, stderr = "", f"Command execution error: {e}"
         score, feedback = self._grade()
+        reward = max(0.0, score - self._last_score)
         self._last_score = score
         episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
             stdout=stdout,
             stderr=stderr,
             current_dir=self._current_dir,
+            task_id=self._current_task,
             grader_score=score,
             grader_feedback=feedback,
             done=episode_done,
+            reward=reward,
         )
     @property
             logger.exception("Grader error")
             feedback_parts.append(f"Grader error (score preserved): {exc}")
+        # Scale score based on task difficulty
+        if self._current_task == "easy":
+            raw_target = 0.45
+        elif self._current_task == "medium":
+            raw_target = 0.60
+        else:
+            raw_target = 1.0
+        final_score = min(1.0, score / raw_target)
+        final_score = round(min(max(final_score, 0.0), 1.0), 2)
+        return (final_score, " | ".join(feedback_parts))