Commit ·
4de7d31
1
Parent(s): c8f3b98
improved grading
Browse files- README.md +18 -5
- baseline_runner.py +3 -3
- sample_inf_script.py +0 -255
- sample_val_script.txt +0 -185
- server/__init__.py +1 -1
- server/app.py +3 -3
- server/environment.py +1 -1
- server/graders/__init__.py +61 -19
- smoke_test.py +2 -2
- tests/test_baseline.py +3 -3
- tests/test_determinism.py +5 -5
- tests/test_environment_flow.py +3 -3
- tutorial_references/02-deployment.md +0 -427
README.md
CHANGED
|
@@ -206,15 +206,15 @@ Each step, the agent chooses exactly one action:
|
|
| 206 |
|
| 207 |
## Grading System — How Scores Work
|
| 208 |
|
| 209 |
-
Scoring is **deterministic** (same actions always produce the same score)
|
| 210 |
|
| 211 |
### The Formula
|
| 212 |
|
| 213 |
```
|
| 214 |
-
FINAL SCORE = Base + Partial Fixes + Complete Bonus + Efficiency - Hint Penalty - Failed Edit Penalty
|
| 215 |
```
|
| 216 |
|
| 217 |
-
Clamped to `(0.01, 0.99)`.
|
| 218 |
|
| 219 |
### Component Breakdown
|
| 220 |
|
|
@@ -223,10 +223,23 @@ Clamped to `(0.01, 0.99)`.
|
|
| 223 |
| Base score | 5% | Participation credit |
|
| 224 |
| Partial fixes | 35% | Proportional to `issues_fixed / issues_total` |
|
| 225 |
| Complete bonus | 25% | All issues fixed |
|
| 226 |
-
|
|
| 227 |
-
|
|
|
|
|
| 228 |
| Failed edit penalty | -2% each | Per edit with no valid file path |
|
| 229 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
---
|
| 231 |
|
| 232 |
## API Endpoints
|
|
|
|
| 206 |
|
| 207 |
## Grading System — How Scores Work
|
| 208 |
|
| 209 |
+
Scoring is **deterministic** (same actions always produce the same score), **dynamic** (different strategies get different scores), and **difficulty-aware** (harder tasks are graded more generously).
|
| 210 |
|
| 211 |
### The Formula
|
| 212 |
|
| 213 |
```
|
| 214 |
+
FINAL SCORE = Base + Partial Fixes + Complete Bonus + Difficulty Bonus + Efficiency - Hint Penalty - Failed Edit Penalty
|
| 215 |
```
|
| 216 |
|
| 217 |
+
Clamped to `[0.0, 1.0]`.
|
| 218 |
|
| 219 |
### Component Breakdown
|
| 220 |
|
|
|
|
| 223 |
| Base score | 5% | Participation credit |
|
| 224 |
| Partial fixes | 35% | Proportional to `issues_fixed / issues_total` |
|
| 225 |
| Complete bonus | 25% | All issues fixed |
|
| 226 |
+
| Difficulty bonus | 0-3% | Extra reward for fully solving hard/expert tasks |
|
| 227 |
+
| Efficiency | 25% | Decays with extra steps — slower decay for harder tasks |
|
| 228 |
+
| Hint penalty | -3% to -4% each | Per `request_hint` action (cheaper for hard/expert) |
|
| 229 |
| Failed edit penalty | -2% each | Per edit with no valid file path |
|
| 230 |
|
| 231 |
+
### Difficulty Modifiers
|
| 232 |
+
|
| 233 |
+
The grader adjusts three parameters based on task difficulty:
|
| 234 |
+
|
| 235 |
+
| Difficulty | Max Score | Efficiency Decay | Hint Cost |
|
| 236 |
+
|------------|-----------|------------------|-----------|
|
| 237 |
+
| Easy | 0.90 | 0.03/step (strict) | 4% each |
|
| 238 |
+
| Medium | 0.90 | 0.027/step | 4% each |
|
| 239 |
+
| Hard/Expert | 0.93 | 0.021/step (forgiving) | 3% each |
|
| 240 |
+
|
| 241 |
+
This means: solving a 4-bug expert pipeline in 6 steps scores higher than solving a 1-bug easy task in 3 steps, reflecting the genuine difficulty difference.
|
| 242 |
+
|
| 243 |
---
|
| 244 |
|
| 245 |
## API Endpoints
|
baseline_runner.py
CHANGED
|
@@ -6,13 +6,13 @@ Applies expected_fixes directly to verify the environment + grader work e2e.
|
|
| 6 |
|
| 7 |
from typing import List, Optional
|
| 8 |
|
| 9 |
-
from server.environment import
|
| 10 |
from server.graders import run_grader
|
| 11 |
from server.models import Action, ActionType, FileEdit, GraderResult
|
| 12 |
from server.tasks.task_registry import TASK_REGISTRY
|
| 13 |
|
| 14 |
|
| 15 |
-
def _heuristic_episode(env:
|
| 16 |
"""Run one episode using a heuristic that applies expected fixes."""
|
| 17 |
obs = env.reset(task_id=task_id, scenario_id=scenario_id)
|
| 18 |
|
|
@@ -141,7 +141,7 @@ def run_baseline_episodes(task_id: Optional[str] = None, num_episodes: int = 1)
|
|
| 141 |
for scenario in scenarios:
|
| 142 |
if episodes_run >= num_episodes:
|
| 143 |
break
|
| 144 |
-
env =
|
| 145 |
result = _heuristic_episode(env, tid, scenario["id"])
|
| 146 |
results.append(result)
|
| 147 |
episodes_run += 1
|
|
|
|
| 6 |
|
| 7 |
from typing import List, Optional
|
| 8 |
|
| 9 |
+
from server.environment import CloudNativeDebugEnvironment
|
| 10 |
from server.graders import run_grader
|
| 11 |
from server.models import Action, ActionType, FileEdit, GraderResult
|
| 12 |
from server.tasks.task_registry import TASK_REGISTRY
|
| 13 |
|
| 14 |
|
| 15 |
+
def _heuristic_episode(env: CloudNativeDebugEnvironment, task_id: str, scenario_id: Optional[str] = None) -> GraderResult:
|
| 16 |
"""Run one episode using a heuristic that applies expected fixes."""
|
| 17 |
obs = env.reset(task_id=task_id, scenario_id=scenario_id)
|
| 18 |
|
|
|
|
| 141 |
for scenario in scenarios:
|
| 142 |
if episodes_run >= num_episodes:
|
| 143 |
break
|
| 144 |
+
env = CloudNativeDebugEnvironment()
|
| 145 |
result = _heuristic_episode(env, tid, scenario["id"])
|
| 146 |
results.append(result)
|
| 147 |
episodes_run += 1
|
sample_inf_script.py
DELETED
|
@@ -1,255 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Inference Script Example
|
| 3 |
-
===================================
|
| 4 |
-
MANDATORY
|
| 5 |
-
- Before submitting, ensure the following variables are defined in your environment configuration:
|
| 6 |
-
API_BASE_URL The API endpoint for the LLM.
|
| 7 |
-
MODEL_NAME The model identifier to use for inference.
|
| 8 |
-
HF_TOKEN Your Hugging Face / API key.
|
| 9 |
-
|
| 10 |
-
- The inference script must be named `inference.py` and placed in the root directory of the project
|
| 11 |
-
- Participants must use OpenAI Client for all LLM calls using above variables
|
| 12 |
-
"""
|
| 13 |
-
|
| 14 |
-
import os
|
| 15 |
-
import re
|
| 16 |
-
import base64
|
| 17 |
-
import textwrap
|
| 18 |
-
from io import BytesIO
|
| 19 |
-
from typing import List, Optional, Dict
|
| 20 |
-
|
| 21 |
-
from openai import OpenAI
|
| 22 |
-
import numpy as np
|
| 23 |
-
from PIL import Image
|
| 24 |
-
|
| 25 |
-
from browsergym_env import BrowserGymAction, BrowserGymEnv
|
| 26 |
-
|
| 27 |
-
API_BASE_URL = os.getenv("API_BASE_URL") // "https://router.huggingface.co/v1"
|
| 28 |
-
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 29 |
-
MODEL_NAME = os.getenv("MODEL_NAME")
|
| 30 |
-
MAX_STEPS = 8
|
| 31 |
-
MAX_DOM_CHARS = 3500
|
| 32 |
-
TEMPERATURE = 0.2
|
| 33 |
-
MAX_TOKENS = 200
|
| 34 |
-
FALLBACK_ACTION = "noop()"
|
| 35 |
-
|
| 36 |
-
DEBUG = True
|
| 37 |
-
ACTION_PREFIX_RE = re.compile(
|
| 38 |
-
r"^(action|next action)\s*[:\-]\s*",
|
| 39 |
-
re.IGNORECASE,
|
| 40 |
-
)
|
| 41 |
-
ACTION_PATTERN = re.compile(r"[A-Za-z_]+\s*\(.*\)", re.DOTALL)
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
SYSTEM_PROMPT = textwrap.dedent(
|
| 45 |
-
"""
|
| 46 |
-
You control a web browser through BrowserGym.
|
| 47 |
-
Reply with exactly one action string.
|
| 48 |
-
The action must be a valid BrowserGym command such as:
|
| 49 |
-
- noop()
|
| 50 |
-
- click('<BID>')
|
| 51 |
-
- type('selector', 'text to enter')
|
| 52 |
-
- fill('selector', 'text to enter')
|
| 53 |
-
- send_keys('Enter')
|
| 54 |
-
- scroll('down')
|
| 55 |
-
Use single quotes around string arguments.
|
| 56 |
-
When clicking, use the BrowserGym element IDs (BIDs) listed in the user message.
|
| 57 |
-
If you are unsure, respond with noop().
|
| 58 |
-
Do not include explanations or additional text.
|
| 59 |
-
"""
|
| 60 |
-
).strip()
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
def build_history_lines(history: List[str]) -> str:
|
| 64 |
-
if not history:
|
| 65 |
-
return "None"
|
| 66 |
-
return "\n".join(history[-4:])
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
def extract_screenshot_uri(observation) -> Optional[str]:
|
| 70 |
-
if observation.screenshot is None:
|
| 71 |
-
return None
|
| 72 |
-
screen_array = np.array(observation.screenshot, dtype=np.uint8)
|
| 73 |
-
image = Image.fromarray(screen_array)
|
| 74 |
-
buffer = BytesIO()
|
| 75 |
-
image.save(buffer, format="PNG")
|
| 76 |
-
buffer.seek(0)
|
| 77 |
-
data_uri = base64.b64encode(buffer.read()).decode("utf-8")
|
| 78 |
-
return f"data:image/png;base64,{data_uri}"
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
def extract_clickable_elements(observation) -> List[Dict[str, str]]:
|
| 82 |
-
"""Collect BrowserGym element IDs that can be clicked."""
|
| 83 |
-
|
| 84 |
-
metadata = getattr(observation, "metadata", {}) or {}
|
| 85 |
-
obs_dict = metadata.get("browsergym_obs", {}) or {}
|
| 86 |
-
extra_props = obs_dict.get("extra_element_properties", {}) or {}
|
| 87 |
-
|
| 88 |
-
clickables: List[Dict[str, str]] = []
|
| 89 |
-
for bid, props in extra_props.items():
|
| 90 |
-
if not props.get("clickable"):
|
| 91 |
-
continue
|
| 92 |
-
|
| 93 |
-
bbox = props.get("bbox") or []
|
| 94 |
-
bbox_str = ", ".join(bbox) if bbox else "?"
|
| 95 |
-
clickables.append(
|
| 96 |
-
{
|
| 97 |
-
"bid": str(bid),
|
| 98 |
-
"bbox": bbox_str,
|
| 99 |
-
}
|
| 100 |
-
)
|
| 101 |
-
|
| 102 |
-
# Keep a stable ordering for readability
|
| 103 |
-
clickables.sort(key=lambda item: item["bid"])
|
| 104 |
-
return clickables
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
def build_user_prompt(step: int, observation, history: List[str]) -> str:
|
| 108 |
-
goal = observation.goal or "(not provided)"
|
| 109 |
-
url = observation.url or "(unknown)"
|
| 110 |
-
error_note = "Yes" if observation.last_action_error else "No"
|
| 111 |
-
|
| 112 |
-
clickables = extract_clickable_elements(observation)
|
| 113 |
-
if clickables:
|
| 114 |
-
actions_hint = "\n".join(
|
| 115 |
-
f" - {item['bid']} (bbox: {item['bbox']})" for item in clickables
|
| 116 |
-
)
|
| 117 |
-
else:
|
| 118 |
-
actions_hint = " (none detected)"
|
| 119 |
-
|
| 120 |
-
prompt = textwrap.dedent(
|
| 121 |
-
f"""
|
| 122 |
-
Step: {step}
|
| 123 |
-
Goal: {goal}
|
| 124 |
-
Current URL: {url}
|
| 125 |
-
Previous steps:
|
| 126 |
-
{build_history_lines(history)}
|
| 127 |
-
Last action error: {error_note}
|
| 128 |
-
Available clickable element IDs: {actions_hint}
|
| 129 |
-
Reply with exactly one BrowserGym action string.
|
| 130 |
-
"""
|
| 131 |
-
).strip()
|
| 132 |
-
return prompt
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
def parse_model_action(response_text: str) -> str:
|
| 136 |
-
if not response_text:
|
| 137 |
-
return FALLBACK_ACTION
|
| 138 |
-
|
| 139 |
-
# Prefer the first line that looks like an action string
|
| 140 |
-
lines = response_text.splitlines()
|
| 141 |
-
for raw_line in lines:
|
| 142 |
-
line = raw_line.strip()
|
| 143 |
-
if not line:
|
| 144 |
-
continue
|
| 145 |
-
line = ACTION_PREFIX_RE.sub("", line)
|
| 146 |
-
match = ACTION_PATTERN.search(line)
|
| 147 |
-
if match:
|
| 148 |
-
action = match.group(0).strip()
|
| 149 |
-
# Collapse internal whitespace
|
| 150 |
-
action = re.sub(r"\s+", " ", action)
|
| 151 |
-
# If the model tried to click by natural-language description while we
|
| 152 |
-
# only exposed numeric BrowserGym IDs, fallback to the single detected ID.
|
| 153 |
-
return action
|
| 154 |
-
|
| 155 |
-
# Fall back to searching the whole response
|
| 156 |
-
match = ACTION_PATTERN.search(response_text)
|
| 157 |
-
if match:
|
| 158 |
-
action = match.group(0).strip()
|
| 159 |
-
action = re.sub(r"\s+", " ", action)
|
| 160 |
-
return action
|
| 161 |
-
|
| 162 |
-
return FALLBACK_ACTION
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
def main() -> None:
|
| 166 |
-
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
|
| 167 |
-
|
| 168 |
-
env = BrowserGymEnv.from_docker_image(
|
| 169 |
-
image="browsergym-env:latest",
|
| 170 |
-
env_vars={
|
| 171 |
-
"BROWSERGYM_BENCHMARK": "miniwob",
|
| 172 |
-
"BROWSERGYM_TASK_NAME": "click-test",
|
| 173 |
-
},
|
| 174 |
-
)
|
| 175 |
-
|
| 176 |
-
history: List[str] = []
|
| 177 |
-
|
| 178 |
-
try:
|
| 179 |
-
result = env.reset()
|
| 180 |
-
observation = result.observation
|
| 181 |
-
print(f"Episode goal: {observation.goal}")
|
| 182 |
-
|
| 183 |
-
for step in range(1, MAX_STEPS + 1):
|
| 184 |
-
if result.done:
|
| 185 |
-
print("Environment signalled done. Stopping early.")
|
| 186 |
-
break
|
| 187 |
-
|
| 188 |
-
user_prompt = build_user_prompt(step, observation, history)
|
| 189 |
-
user_content = [{"type": "text", "text": user_prompt}]
|
| 190 |
-
screenshot_uri = extract_screenshot_uri(observation)
|
| 191 |
-
if screenshot_uri:
|
| 192 |
-
user_content.append(
|
| 193 |
-
{
|
| 194 |
-
"type": "image_url",
|
| 195 |
-
"image_url": {"url": screenshot_uri},
|
| 196 |
-
}
|
| 197 |
-
)
|
| 198 |
-
|
| 199 |
-
messages = [
|
| 200 |
-
{
|
| 201 |
-
"role": "system",
|
| 202 |
-
"content": [{"type": "text", "text": SYSTEM_PROMPT}],
|
| 203 |
-
},
|
| 204 |
-
{
|
| 205 |
-
"role": "user",
|
| 206 |
-
"content": user_content,
|
| 207 |
-
},
|
| 208 |
-
]
|
| 209 |
-
|
| 210 |
-
try:
|
| 211 |
-
completion = client.chat.completions.create(
|
| 212 |
-
model=MODEL_NAME,
|
| 213 |
-
messages=messages,
|
| 214 |
-
temperature=TEMPERATURE,
|
| 215 |
-
max_tokens=MAX_TOKENS,
|
| 216 |
-
stream=False,
|
| 217 |
-
)
|
| 218 |
-
response_text = completion.choices[0].message.content or ""
|
| 219 |
-
# pylint: disable=broad-except
|
| 220 |
-
except Exception as exc: # noqa: BLE001
|
| 221 |
-
failure_msg = f"Model request failed ({exc}). Using fallback action."
|
| 222 |
-
print(failure_msg)
|
| 223 |
-
response_text = FALLBACK_ACTION
|
| 224 |
-
|
| 225 |
-
action_str = parse_model_action(response_text)
|
| 226 |
-
print(f"Step {step}: model suggested -> {action_str}")
|
| 227 |
-
|
| 228 |
-
result = env.step(BrowserGymAction(action_str=action_str))
|
| 229 |
-
observation = result.observation
|
| 230 |
-
|
| 231 |
-
reward = result.reward or 0.0
|
| 232 |
-
error_flag = " ERROR" if observation.last_action_error else ""
|
| 233 |
-
history_line = (
|
| 234 |
-
f"Step {step}: {action_str} -> reward {reward:+.2f}{error_flag}"
|
| 235 |
-
)
|
| 236 |
-
history.append(history_line)
|
| 237 |
-
print(
|
| 238 |
-
" Reward: "
|
| 239 |
-
f"{reward:+.2f} | Done: {result.done} | Last action error: "
|
| 240 |
-
f"{observation.last_action_error}"
|
| 241 |
-
)
|
| 242 |
-
|
| 243 |
-
if result.done:
|
| 244 |
-
print("Episode complete.")
|
| 245 |
-
break
|
| 246 |
-
|
| 247 |
-
else:
|
| 248 |
-
print(f"Reached max steps ({MAX_STEPS}).")
|
| 249 |
-
|
| 250 |
-
finally:
|
| 251 |
-
env.close()
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
if __name__ == "__main__":
|
| 255 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sample_val_script.txt
DELETED
|
@@ -1,185 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env bash
|
| 2 |
-
#
|
| 3 |
-
# validate-submission.sh — OpenEnv Submission Validator
|
| 4 |
-
#
|
| 5 |
-
# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
|
| 6 |
-
#
|
| 7 |
-
# Prerequisites:
|
| 8 |
-
# - Docker: https://docs.docker.com/get-docker/
|
| 9 |
-
# - openenv-core: pip install openenv-core
|
| 10 |
-
# - curl (usually pre-installed)
|
| 11 |
-
#
|
| 12 |
-
# Run:
|
| 13 |
-
# curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
|
| 14 |
-
#
|
| 15 |
-
# Or download and run locally:
|
| 16 |
-
# chmod +x validate-submission.sh
|
| 17 |
-
# ./validate-submission.sh <ping_url> [repo_dir]
|
| 18 |
-
#
|
| 19 |
-
# Arguments:
|
| 20 |
-
# ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)
|
| 21 |
-
# repo_dir Path to your repo (default: current directory)
|
| 22 |
-
#
|
| 23 |
-
# Examples:
|
| 24 |
-
# ./validate-submission.sh https://my-team.hf.space
|
| 25 |
-
# ./validate-submission.sh https://my-team.hf.space ./my-repo
|
| 26 |
-
#
|
| 27 |
-
|
| 28 |
-
set -uo pipefail
|
| 29 |
-
|
| 30 |
-
DOCKER_BUILD_TIMEOUT=600
|
| 31 |
-
if [ -t 1 ]; then
|
| 32 |
-
RED='\033[0;31m'
|
| 33 |
-
GREEN='\033[0;32m'
|
| 34 |
-
YELLOW='\033[1;33m'
|
| 35 |
-
BOLD='\033[1m'
|
| 36 |
-
NC='\033[0m'
|
| 37 |
-
else
|
| 38 |
-
RED='' GREEN='' YELLOW='' BOLD='' NC=''
|
| 39 |
-
fi
|
| 40 |
-
|
| 41 |
-
run_with_timeout() {
|
| 42 |
-
local secs="$1"; shift
|
| 43 |
-
if command -v timeout &>/dev/null; then
|
| 44 |
-
timeout "$secs" "$@"
|
| 45 |
-
elif command -v gtimeout &>/dev/null; then
|
| 46 |
-
gtimeout "$secs" "$@"
|
| 47 |
-
else
|
| 48 |
-
"$@" &
|
| 49 |
-
local pid=$!
|
| 50 |
-
( sleep "$secs" && kill "$pid" 2>/dev/null ) &
|
| 51 |
-
local watcher=$!
|
| 52 |
-
wait "$pid" 2>/dev/null
|
| 53 |
-
local rc=$?
|
| 54 |
-
kill "$watcher" 2>/dev/null
|
| 55 |
-
wait "$watcher" 2>/dev/null
|
| 56 |
-
return $rc
|
| 57 |
-
fi
|
| 58 |
-
}
|
| 59 |
-
|
| 60 |
-
portable_mktemp() {
|
| 61 |
-
local prefix="${1:-validate}"
|
| 62 |
-
mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
|
| 63 |
-
}
|
| 64 |
-
|
| 65 |
-
CLEANUP_FILES=()
|
| 66 |
-
cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
|
| 67 |
-
trap cleanup EXIT
|
| 68 |
-
|
| 69 |
-
PING_URL="${1:-}"
|
| 70 |
-
REPO_DIR="${2:-.}"
|
| 71 |
-
|
| 72 |
-
if [ -z "$PING_URL" ]; then
|
| 73 |
-
printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
|
| 74 |
-
printf "\n"
|
| 75 |
-
printf " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
|
| 76 |
-
printf " repo_dir Path to your repo (default: current directory)\n"
|
| 77 |
-
exit 1
|
| 78 |
-
fi
|
| 79 |
-
|
| 80 |
-
if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
|
| 81 |
-
printf "Error: directory '%s' not found\n" "${2:-.}"
|
| 82 |
-
exit 1
|
| 83 |
-
fi
|
| 84 |
-
PING_URL="${PING_URL%/}"
|
| 85 |
-
export PING_URL
|
| 86 |
-
PASS=0
|
| 87 |
-
|
| 88 |
-
log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
|
| 89 |
-
pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
|
| 90 |
-
fail() { log "${RED}FAILED${NC} -- $1"; }
|
| 91 |
-
hint() { printf " ${YELLOW}Hint:${NC} %b\n" "$1"; }
|
| 92 |
-
stop_at() {
|
| 93 |
-
printf "\n"
|
| 94 |
-
printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
|
| 95 |
-
exit 1
|
| 96 |
-
}
|
| 97 |
-
|
| 98 |
-
printf "\n"
|
| 99 |
-
printf "${BOLD}========================================${NC}\n"
|
| 100 |
-
printf "${BOLD} OpenEnv Submission Validator${NC}\n"
|
| 101 |
-
printf "${BOLD}========================================${NC}\n"
|
| 102 |
-
log "Repo: $REPO_DIR"
|
| 103 |
-
log "Ping URL: $PING_URL"
|
| 104 |
-
printf "\n"
|
| 105 |
-
|
| 106 |
-
log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
|
| 107 |
-
|
| 108 |
-
CURL_OUTPUT=$(portable_mktemp "validate-curl")
|
| 109 |
-
CLEANUP_FILES+=("$CURL_OUTPUT")
|
| 110 |
-
HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
|
| 111 |
-
-H "Content-Type: application/json" -d '{}' \
|
| 112 |
-
"$PING_URL/reset" --max-time 30 2>"$CURL_OUTPUT" || printf "000")
|
| 113 |
-
|
| 114 |
-
if [ "$HTTP_CODE" = "200" ]; then
|
| 115 |
-
pass "HF Space is live and responds to /reset"
|
| 116 |
-
elif [ "$HTTP_CODE" = "000" ]; then
|
| 117 |
-
fail "HF Space not reachable (connection failed or timed out)"
|
| 118 |
-
hint "Check your network connection and that the Space is running."
|
| 119 |
-
hint "Try: curl -s -o /dev/null -w '%%{http_code}' -X POST $PING_URL/reset"
|
| 120 |
-
stop_at "Step 1"
|
| 121 |
-
else
|
| 122 |
-
fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
|
| 123 |
-
hint "Make sure your Space is running and the URL is correct."
|
| 124 |
-
hint "Try opening $PING_URL in your browser first."
|
| 125 |
-
stop_at "Step 1"
|
| 126 |
-
fi
|
| 127 |
-
|
| 128 |
-
log "${BOLD}Step 2/3: Running docker build${NC} ..."
|
| 129 |
-
|
| 130 |
-
if ! command -v docker &>/dev/null; then
|
| 131 |
-
fail "docker command not found"
|
| 132 |
-
hint "Install Docker: https://docs.docker.com/get-docker/"
|
| 133 |
-
stop_at "Step 2"
|
| 134 |
-
fi
|
| 135 |
-
|
| 136 |
-
if [ -f "$REPO_DIR/Dockerfile" ]; then
|
| 137 |
-
DOCKER_CONTEXT="$REPO_DIR"
|
| 138 |
-
elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
|
| 139 |
-
DOCKER_CONTEXT="$REPO_DIR/server"
|
| 140 |
-
else
|
| 141 |
-
fail "No Dockerfile found in repo root or server/ directory"
|
| 142 |
-
stop_at "Step 2"
|
| 143 |
-
fi
|
| 144 |
-
|
| 145 |
-
log " Found Dockerfile in $DOCKER_CONTEXT"
|
| 146 |
-
|
| 147 |
-
BUILD_OK=false
|
| 148 |
-
BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
|
| 149 |
-
|
| 150 |
-
if [ "$BUILD_OK" = true ]; then
|
| 151 |
-
pass "Docker build succeeded"
|
| 152 |
-
else
|
| 153 |
-
fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
|
| 154 |
-
printf "%s\n" "$BUILD_OUTPUT" | tail -20
|
| 155 |
-
stop_at "Step 2"
|
| 156 |
-
fi
|
| 157 |
-
|
| 158 |
-
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
|
| 159 |
-
|
| 160 |
-
if ! command -v openenv &>/dev/null; then
|
| 161 |
-
fail "openenv command not found"
|
| 162 |
-
hint "Install it: pip install openenv-core"
|
| 163 |
-
stop_at "Step 3"
|
| 164 |
-
fi
|
| 165 |
-
|
| 166 |
-
VALIDATE_OK=false
|
| 167 |
-
VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
|
| 168 |
-
|
| 169 |
-
if [ "$VALIDATE_OK" = true ]; then
|
| 170 |
-
pass "openenv validate passed"
|
| 171 |
-
[ -n "$VALIDATE_OUTPUT" ] && log " $VALIDATE_OUTPUT"
|
| 172 |
-
else
|
| 173 |
-
fail "openenv validate failed"
|
| 174 |
-
printf "%s\n" "$VALIDATE_OUTPUT"
|
| 175 |
-
stop_at "Step 3"
|
| 176 |
-
fi
|
| 177 |
-
|
| 178 |
-
printf "\n"
|
| 179 |
-
printf "${BOLD}========================================${NC}\n"
|
| 180 |
-
printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
|
| 181 |
-
printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
|
| 182 |
-
printf "${BOLD}========================================${NC}\n"
|
| 183 |
-
printf "\n"
|
| 184 |
-
|
| 185 |
-
exit 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
"""
|
|
|
|
| 1 |
+
"""Cloud-native DevOps debug environment server package."""
|
server/app.py
CHANGED
|
@@ -9,7 +9,7 @@ from fastapi.middleware.cors import CORSMiddleware
|
|
| 9 |
from fastapi.responses import HTMLResponse
|
| 10 |
from fastapi.staticfiles import StaticFiles
|
| 11 |
|
| 12 |
-
from server.environment import
|
| 13 |
from server.graders import run_grader
|
| 14 |
from server.models import (
|
| 15 |
Action,
|
|
@@ -47,7 +47,7 @@ app.add_middleware(
|
|
| 47 |
# Serve static assets (CSS, JS, images if needed later)
|
| 48 |
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
|
| 49 |
|
| 50 |
-
env: Optional[
|
| 51 |
|
| 52 |
|
| 53 |
@app.get("/", response_class=HTMLResponse)
|
|
@@ -135,7 +135,7 @@ async def reset(request: Optional[ResetRequest] = None):
|
|
| 135 |
global env
|
| 136 |
|
| 137 |
request = request or ResetRequest()
|
| 138 |
-
env =
|
| 139 |
try:
|
| 140 |
observation = env.reset(
|
| 141 |
task_id=request.task_id,
|
|
|
|
| 9 |
from fastapi.responses import HTMLResponse
|
| 10 |
from fastapi.staticfiles import StaticFiles
|
| 11 |
|
| 12 |
+
from server.environment import CloudNativeDebugEnvironment
|
| 13 |
from server.graders import run_grader
|
| 14 |
from server.models import (
|
| 15 |
Action,
|
|
|
|
| 47 |
# Serve static assets (CSS, JS, images if needed later)
|
| 48 |
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
|
| 49 |
|
| 50 |
+
env: Optional[CloudNativeDebugEnvironment] = None
|
| 51 |
|
| 52 |
|
| 53 |
@app.get("/", response_class=HTMLResponse)
|
|
|
|
| 135 |
global env
|
| 136 |
|
| 137 |
request = request or ResetRequest()
|
| 138 |
+
env = CloudNativeDebugEnvironment()
|
| 139 |
try:
|
| 140 |
observation = env.reset(
|
| 141 |
task_id=request.task_id,
|
server/environment.py
CHANGED
|
@@ -20,7 +20,7 @@ from server.simulators.workflow_simulator import WorkflowSimulator
|
|
| 20 |
from server.tasks.task_registry import TASK_REGISTRY, get_task
|
| 21 |
|
| 22 |
|
| 23 |
-
class
|
| 24 |
MAX_STEPS = 10
|
| 25 |
MAX_HINTS = 3
|
| 26 |
|
|
|
|
| 20 |
from server.tasks.task_registry import TASK_REGISTRY, get_task
|
| 21 |
|
| 22 |
|
| 23 |
+
class CloudNativeDebugEnvironment:
|
| 24 |
MAX_STEPS = 10
|
| 25 |
MAX_HINTS = 3
|
| 26 |
|
server/graders/__init__.py
CHANGED
|
@@ -1,22 +1,23 @@
|
|
| 1 |
"""Deterministic grader for trajectory scoring.
|
| 2 |
|
| 3 |
-
Scoring weights:
|
| 4 |
base score 5% (participation — guarantees score > 0)
|
| 5 |
partial fixes 35% (proportional to fix ratio)
|
| 6 |
-
complete bonus 25% (all issues fixed)
|
| 7 |
-
efficiency 25% (decays with extra steps)
|
| 8 |
-
hint penalty -4% each
|
| 9 |
failed edit -2% each
|
|
|
|
| 10 |
|
| 11 |
-
Score is
|
| 12 |
"""
|
| 13 |
|
| 14 |
from typing import Any, Dict, List
|
| 15 |
|
| 16 |
-
from server.models import GraderResult
|
| 17 |
from server.tasks.task_registry import TASK_REGISTRY
|
| 18 |
|
| 19 |
-
#
|
| 20 |
BASE_SCORE = 0.05
|
| 21 |
PARTIAL_FIX_WEIGHT = 0.35
|
| 22 |
COMPLETE_BONUS = 0.25
|
|
@@ -25,9 +26,19 @@ EFFICIENCY_DECAY = 0.03 # per extra step beyond optimal
|
|
| 25 |
HINT_PENALTY = 0.04
|
| 26 |
FAILED_ACTION_PENALTY = 0.02
|
| 27 |
|
| 28 |
-
#
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
EDIT_ACTION_TYPES = frozenset({
|
| 33 |
"edit_file", "replace_line", "add_line",
|
|
@@ -36,14 +47,27 @@ EDIT_ACTION_TYPES = frozenset({
|
|
| 36 |
|
| 37 |
|
| 38 |
def _clamp(value: float) -> float:
|
| 39 |
-
"""Clamp score to
|
| 40 |
return max(SCORE_FLOOR, min(SCORE_CEIL, round(value, 4)))
|
| 41 |
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
|
| 44 |
if task_id not in TASK_REGISTRY:
|
| 45 |
raise ValueError(f"Unknown task: {task_id}")
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
if not trajectory:
|
| 48 |
return GraderResult(
|
| 49 |
task_id=task_id,
|
|
@@ -53,6 +77,7 @@ def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
|
|
| 53 |
"partial_fixes": 0.0,
|
| 54 |
"complete_solution": 0.0,
|
| 55 |
"efficiency": 0.0,
|
|
|
|
| 56 |
"hint_penalty": 0.0,
|
| 57 |
"failed_action_penalty": 0.0,
|
| 58 |
},
|
|
@@ -72,25 +97,32 @@ def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
|
|
| 72 |
issues_total = max(1, int(final_step.get("info", {}).get("issues_total", 1)))
|
| 73 |
fix_ratio = issues_fixed / issues_total
|
| 74 |
|
| 75 |
-
# Component 1: Partial fix credit (proportional)
|
| 76 |
partial_score = PARTIAL_FIX_WEIGHT * fix_ratio
|
| 77 |
|
| 78 |
-
# Component 2: Full-solution bonus
|
| 79 |
complete_bonus = COMPLETE_BONUS if issues_fixed == issues_total else 0.0
|
| 80 |
|
| 81 |
-
# Component 3:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
if issues_fixed == 0:
|
| 83 |
efficiency_score = 0.0
|
| 84 |
elif steps_taken <= issues_total:
|
| 85 |
efficiency_score = EFFICIENCY_MAX
|
| 86 |
else:
|
| 87 |
extra = steps_taken - issues_total
|
| 88 |
-
|
|
|
|
| 89 |
|
| 90 |
-
# Component
|
| 91 |
-
|
|
|
|
| 92 |
|
| 93 |
-
# Component
|
| 94 |
failed_edits = 0
|
| 95 |
for step in trajectory:
|
| 96 |
action = step.get("action", {})
|
|
@@ -100,9 +132,18 @@ def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
|
|
| 100 |
failed_edits += 1
|
| 101 |
failed_pen = FAILED_ACTION_PENALTY * failed_edits
|
| 102 |
|
| 103 |
-
raw =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
score = _clamp(raw)
|
| 105 |
|
|
|
|
| 106 |
if score >= 0.85:
|
| 107 |
feedback = "Excellent — all issues fixed efficiently."
|
| 108 |
elif score >= 0.65:
|
|
@@ -121,6 +162,7 @@ def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
|
|
| 121 |
"base": BASE_SCORE,
|
| 122 |
"partial_fixes": round(partial_score, 4),
|
| 123 |
"complete_solution": round(complete_bonus, 4),
|
|
|
|
| 124 |
"efficiency": round(efficiency_score, 4),
|
| 125 |
"hint_penalty": round(-hint_pen, 4),
|
| 126 |
"failed_action_penalty": round(-failed_pen, 4),
|
|
|
|
| 1 |
"""Deterministic grader for trajectory scoring.
|
| 2 |
|
| 3 |
+
Scoring weights (difficulty-aware):
|
| 4 |
base score        5%  (participation credit; penalties can still clamp the total to 0.0)
|
| 5 |
partial fixes 35% (proportional to fix ratio)
|
| 6 |
+
complete bonus 25% (all issues fixed — scales with difficulty)
|
| 7 |
+
efficiency 25% (decays with extra steps — slower decay for harder tasks)
|
| 8 |
+
hint penalty -4% each (reduced to -3% for hard/expert)
|
| 9 |
failed edit -2% each
|
| 10 |
+
difficulty       +3%  bonus for hard/expert tasks when fully solved
|
| 11 |
|
| 12 |
+
Score is clamped to [0.0, 1.0].
|
| 13 |
"""
|
| 14 |
|
| 15 |
from typing import Any, Dict, List
|
| 16 |
|
| 17 |
+
from server.models import GraderResult, TaskDifficulty
|
| 18 |
from server.tasks.task_registry import TASK_REGISTRY
|
| 19 |
|
| 20 |
+
# ── Base weights ──────────────────────────────────────────────
|
| 21 |
BASE_SCORE = 0.05
|
| 22 |
PARTIAL_FIX_WEIGHT = 0.35
|
| 23 |
COMPLETE_BONUS = 0.25
|
|
|
|
| 26 |
HINT_PENALTY = 0.04
|
| 27 |
FAILED_ACTION_PENALTY = 0.02
|
| 28 |
|
| 29 |
+
# ── Difficulty modifiers ──────────────────────────────────────
|
| 30 |
+
# Maps difficulty → (complete_bonus_extra, efficiency_decay_mult, hint_penalty_mult)
|
| 31 |
+
# complete_bonus_extra: added to COMPLETE_BONUS when all issues fixed
|
| 32 |
+
# efficiency_decay_mult: multiplier on decay (lower = more forgiving)
|
| 33 |
+
# hint_penalty_mult: multiplier on hint cost (lower = cheaper hints)
|
| 34 |
+
DIFFICULTY_MODIFIERS = {
|
| 35 |
+
TaskDifficulty.EASY: (0.00, 1.0, 1.0),
|
| 36 |
+
TaskDifficulty.MEDIUM: (0.00, 0.9, 1.0),
|
| 37 |
+
TaskDifficulty.HARD: (0.03, 0.7, 0.75),
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
SCORE_FLOOR = 0.0
|
| 41 |
+
SCORE_CEIL = 1.0
|
| 42 |
|
| 43 |
EDIT_ACTION_TYPES = frozenset({
|
| 44 |
"edit_file", "replace_line", "add_line",
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
def _clamp(value: float) -> float:
|
| 50 |
+
"""Clamp score to [0, 1]."""
|
| 51 |
return max(SCORE_FLOOR, min(SCORE_CEIL, round(value, 4)))
|
| 52 |
|
| 53 |
|
| 54 |
+
def _get_difficulty(task_id: str) -> TaskDifficulty:
|
| 55 |
+
"""Look up a task's difficulty from the registry."""
|
| 56 |
+
task_cls = TASK_REGISTRY.get(task_id)
|
| 57 |
+
if task_cls is None:
|
| 58 |
+
return TaskDifficulty.MEDIUM
|
| 59 |
+
return task_cls.DIFFICULTY
|
| 60 |
+
|
| 61 |
+
|
| 62 |
def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
|
| 63 |
if task_id not in TASK_REGISTRY:
|
| 64 |
raise ValueError(f"Unknown task: {task_id}")
|
| 65 |
|
| 66 |
+
difficulty = _get_difficulty(task_id)
|
| 67 |
+
bonus_extra, decay_mult, hint_mult = DIFFICULTY_MODIFIERS.get(
|
| 68 |
+
difficulty, (0.00, 1.0, 1.0)
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
if not trajectory:
|
| 72 |
return GraderResult(
|
| 73 |
task_id=task_id,
|
|
|
|
| 77 |
"partial_fixes": 0.0,
|
| 78 |
"complete_solution": 0.0,
|
| 79 |
"efficiency": 0.0,
|
| 80 |
+
"difficulty_bonus": 0.0,
|
| 81 |
"hint_penalty": 0.0,
|
| 82 |
"failed_action_penalty": 0.0,
|
| 83 |
},
|
|
|
|
| 97 |
issues_total = max(1, int(final_step.get("info", {}).get("issues_total", 1)))
|
| 98 |
fix_ratio = issues_fixed / issues_total
|
| 99 |
|
| 100 |
+
# ── Component 1: Partial fix credit (proportional) ────────
|
| 101 |
partial_score = PARTIAL_FIX_WEIGHT * fix_ratio
|
| 102 |
|
| 103 |
+
# ── Component 2: Full-solution bonus ──────────────────────
|
| 104 |
complete_bonus = COMPLETE_BONUS if issues_fixed == issues_total else 0.0
|
| 105 |
|
| 106 |
+
# ── Component 3: Difficulty bonus ─────────────────────────
|
| 107 |
+
# Extra reward for fully solving harder tasks
|
| 108 |
+
diff_bonus = bonus_extra if issues_fixed == issues_total else 0.0
|
| 109 |
+
|
| 110 |
+
# ── Component 4: Efficiency bonus ─────────────────────────
|
| 111 |
+
# Harder tasks get slower decay (more forgiving on step count)
|
| 112 |
if issues_fixed == 0:
|
| 113 |
efficiency_score = 0.0
|
| 114 |
elif steps_taken <= issues_total:
|
| 115 |
efficiency_score = EFFICIENCY_MAX
|
| 116 |
else:
|
| 117 |
extra = steps_taken - issues_total
|
| 118 |
+
effective_decay = EFFICIENCY_DECAY * decay_mult
|
| 119 |
+
efficiency_score = max(0.0, EFFICIENCY_MAX - effective_decay * extra)
|
| 120 |
|
| 121 |
+
# ── Component 5: Hint penalty ─────────────────────────────
|
| 122 |
+
# Harder tasks get reduced hint penalty (hints are more reasonable)
|
| 123 |
+
hint_pen = HINT_PENALTY * hint_mult * hints_used
|
| 124 |
|
| 125 |
+
# ── Component 6: Failed action penalty ────────────────────
|
| 126 |
failed_edits = 0
|
| 127 |
for step in trajectory:
|
| 128 |
action = step.get("action", {})
|
|
|
|
| 132 |
failed_edits += 1
|
| 133 |
failed_pen = FAILED_ACTION_PENALTY * failed_edits
|
| 134 |
|
| 135 |
+
raw = (
|
| 136 |
+
BASE_SCORE
|
| 137 |
+
+ partial_score
|
| 138 |
+
+ complete_bonus
|
| 139 |
+
+ diff_bonus
|
| 140 |
+
+ efficiency_score
|
| 141 |
+
- hint_pen
|
| 142 |
+
- failed_pen
|
| 143 |
+
)
|
| 144 |
score = _clamp(raw)
|
| 145 |
|
| 146 |
+
# ── Feedback ──────────────────────────────────────────────
|
| 147 |
if score >= 0.85:
|
| 148 |
feedback = "Excellent — all issues fixed efficiently."
|
| 149 |
elif score >= 0.65:
|
|
|
|
| 162 |
"base": BASE_SCORE,
|
| 163 |
"partial_fixes": round(partial_score, 4),
|
| 164 |
"complete_solution": round(complete_bonus, 4),
|
| 165 |
+
"difficulty_bonus": round(diff_bonus, 4),
|
| 166 |
"efficiency": round(efficiency_score, 4),
|
| 167 |
"hint_penalty": round(-hint_pen, 4),
|
| 168 |
"failed_action_penalty": round(-failed_pen, 4),
|
smoke_test.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""Comprehensive smoke test for the
|
| 2 |
|
| 3 |
Usage:
|
| 4 |
.\\.venv\\Scripts\\python.exe smoke_test.py
|
|
@@ -225,7 +225,7 @@ def run_smoke(client: EndpointClient) -> int:
|
|
| 225 |
|
| 226 |
|
| 227 |
def main() -> int:
|
| 228 |
-
parser = argparse.ArgumentParser(description="Smoke test
|
| 229 |
parser.add_argument("--mode", choices=["inprocess", "live"], default="inprocess")
|
| 230 |
parser.add_argument("--base-url", default="http://127.0.0.1:8000")
|
| 231 |
args = parser.parse_args()
|
|
|
|
| 1 |
+
"""Comprehensive smoke test for the Cloud-Native DevOps Debug FastAPI server.
|
| 2 |
|
| 3 |
Usage:
|
| 4 |
.\\.venv\\Scripts\\python.exe smoke_test.py
|
|
|
|
| 225 |
|
| 226 |
|
| 227 |
def main() -> int:
|
| 228 |
+
parser = argparse.ArgumentParser(description="Smoke test Cloud-Native DevOps Debug FastAPI server")
|
| 229 |
parser.add_argument("--mode", choices=["inprocess", "live"], default="inprocess")
|
| 230 |
parser.add_argument("--base-url", default="http://127.0.0.1:8000")
|
| 231 |
args = parser.parse_args()
|
tests/test_baseline.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
"""Tests for baseline_runner and inference helpers."""
|
| 2 |
|
| 3 |
from baseline_runner import run_baseline_episodes, _heuristic_episode
|
| 4 |
-
from server.environment import
|
| 5 |
from server.tasks.task_registry import TASK_REGISTRY
|
| 6 |
|
| 7 |
|
|
@@ -15,7 +15,7 @@ def test_heuristic_baseline_scores_above_zero_on_most_scenarios():
|
|
| 15 |
nonzero = 0
|
| 16 |
for task_id, task_cls in TASK_REGISTRY.items():
|
| 17 |
for scenario in task_cls.SCENARIOS:
|
| 18 |
-
env =
|
| 19 |
result = _heuristic_episode(env, task_id, scenario["id"])
|
| 20 |
total += 1
|
| 21 |
if result.score > 0.0:
|
|
@@ -45,7 +45,7 @@ def test_heuristic_fixes_easy_tasks_well():
|
|
| 45 |
task_cls = TASK_REGISTRY[task_id]
|
| 46 |
scores = []
|
| 47 |
for scenario in task_cls.SCENARIOS:
|
| 48 |
-
env =
|
| 49 |
result = _heuristic_episode(env, task_id, scenario["id"])
|
| 50 |
scores.append(result.score)
|
| 51 |
avg = sum(scores) / len(scores)
|
|
|
|
| 1 |
"""Tests for baseline_runner and inference helpers."""
|
| 2 |
|
| 3 |
from baseline_runner import run_baseline_episodes, _heuristic_episode
|
| 4 |
+
from server.environment import CloudNativeDebugEnvironment
|
| 5 |
from server.tasks.task_registry import TASK_REGISTRY
|
| 6 |
|
| 7 |
|
|
|
|
| 15 |
nonzero = 0
|
| 16 |
for task_id, task_cls in TASK_REGISTRY.items():
|
| 17 |
for scenario in task_cls.SCENARIOS:
|
| 18 |
+
env = CloudNativeDebugEnvironment()
|
| 19 |
result = _heuristic_episode(env, task_id, scenario["id"])
|
| 20 |
total += 1
|
| 21 |
if result.score > 0.0:
|
|
|
|
| 45 |
task_cls = TASK_REGISTRY[task_id]
|
| 46 |
scores = []
|
| 47 |
for scenario in task_cls.SCENARIOS:
|
| 48 |
+
env = CloudNativeDebugEnvironment()
|
| 49 |
result = _heuristic_episode(env, task_id, scenario["id"])
|
| 50 |
scores.append(result.score)
|
| 51 |
avg = sum(scores) / len(scores)
|
tests/test_determinism.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
"""Determinism and score-range tests for the grader and environment."""
|
| 2 |
|
| 3 |
-
from server.environment import
|
| 4 |
from server.graders import run_grader
|
| 5 |
from server.models import Action, ActionType, FileEdit
|
| 6 |
from server.tasks.task_registry import TASK_REGISTRY
|
|
@@ -11,8 +11,8 @@ from server.tasks.task_registry import TASK_REGISTRY
|
|
| 11 |
|
| 12 |
def test_reset_deterministic_with_seed():
|
| 13 |
"""Same seed → same task, scenario, files, error."""
|
| 14 |
-
env1 =
|
| 15 |
-
env2 =
|
| 16 |
|
| 17 |
obs1 = env1.reset(seed=42)
|
| 18 |
obs2 = env2.reset(seed=42)
|
|
@@ -71,7 +71,7 @@ def test_full_episode_determinism():
|
|
| 71 |
"""Full episode replay produces identical trajectory and score."""
|
| 72 |
scores = []
|
| 73 |
for _ in range(5):
|
| 74 |
-
env =
|
| 75 |
env.reset(task_id="dockerfile_syntax", scenario_id="typo_filename")
|
| 76 |
action = Action(
|
| 77 |
action_type=ActionType.EDIT_FILE,
|
|
@@ -240,7 +240,7 @@ def test_all_scenarios_have_required_fields():
|
|
| 240 |
|
| 241 |
def test_end_to_end_grading_all_tasks():
|
| 242 |
"""Every task/scenario can be reset, fixed, and graded with score > 0."""
|
| 243 |
-
env =
|
| 244 |
for task_id, task_cls in TASK_REGISTRY.items():
|
| 245 |
task = task_cls()
|
| 246 |
for scenario in task.SCENARIOS:
|
|
|
|
| 1 |
"""Determinism and score-range tests for the grader and environment."""
|
| 2 |
|
| 3 |
+
from server.environment import CloudNativeDebugEnvironment
|
| 4 |
from server.graders import run_grader
|
| 5 |
from server.models import Action, ActionType, FileEdit
|
| 6 |
from server.tasks.task_registry import TASK_REGISTRY
|
|
|
|
| 11 |
|
| 12 |
def test_reset_deterministic_with_seed():
|
| 13 |
"""Same seed → same task, scenario, files, error."""
|
| 14 |
+
env1 = CloudNativeDebugEnvironment()
|
| 15 |
+
env2 = CloudNativeDebugEnvironment()
|
| 16 |
|
| 17 |
obs1 = env1.reset(seed=42)
|
| 18 |
obs2 = env2.reset(seed=42)
|
|
|
|
| 71 |
"""Full episode replay produces identical trajectory and score."""
|
| 72 |
scores = []
|
| 73 |
for _ in range(5):
|
| 74 |
+
env = CloudNativeDebugEnvironment()
|
| 75 |
env.reset(task_id="dockerfile_syntax", scenario_id="typo_filename")
|
| 76 |
action = Action(
|
| 77 |
action_type=ActionType.EDIT_FILE,
|
|
|
|
| 240 |
|
| 241 |
def test_end_to_end_grading_all_tasks():
|
| 242 |
"""Every task/scenario can be reset, fixed, and graded with score > 0."""
|
| 243 |
+
env = CloudNativeDebugEnvironment()
|
| 244 |
for task_id, task_cls in TASK_REGISTRY.items():
|
| 245 |
task = task_cls()
|
| 246 |
for scenario in task.SCENARIOS:
|
tests/test_environment_flow.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
-
from server.environment import
|
| 2 |
from server.models import Action, ActionType, FileEdit
|
| 3 |
|
| 4 |
|
| 5 |
def test_episode_flow_fix_and_autocomplete():
|
| 6 |
-
env =
|
| 7 |
obs = env.reset(task_id="dockerfile_syntax", scenario_id="typo_filename", seed=7)
|
| 8 |
assert obs.task_id == "dockerfile_syntax"
|
| 9 |
assert obs.total_issues >= 1
|
|
@@ -28,7 +28,7 @@ def test_episode_flow_fix_and_autocomplete():
|
|
| 28 |
|
| 29 |
|
| 30 |
def test_submit_runs_combined_simulation():
|
| 31 |
-
env =
|
| 32 |
env.reset(task_id="workflow_secrets_permissions", scenario_id="missing_env_secrets", seed=42)
|
| 33 |
obs, reward, done, info = env.step(Action(action_type=ActionType.SUBMIT, reasoning="validate"))
|
| 34 |
assert done is True
|
|
|
|
| 1 |
+
from server.environment import CloudNativeDebugEnvironment
|
| 2 |
from server.models import Action, ActionType, FileEdit
|
| 3 |
|
| 4 |
|
| 5 |
def test_episode_flow_fix_and_autocomplete():
|
| 6 |
+
env = CloudNativeDebugEnvironment()
|
| 7 |
obs = env.reset(task_id="dockerfile_syntax", scenario_id="typo_filename", seed=7)
|
| 8 |
assert obs.task_id == "dockerfile_syntax"
|
| 9 |
assert obs.total_issues >= 1
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
def test_submit_runs_combined_simulation():
|
| 31 |
+
env = CloudNativeDebugEnvironment()
|
| 32 |
env.reset(task_id="workflow_secrets_permissions", scenario_id="missing_env_secrets", seed=42)
|
| 33 |
obs, reward, done, info = env.step(Action(action_type=ActionType.SUBMIT, reasoning="validate"))
|
| 34 |
assert done is True
|
tutorial_references/02-deployment.md
DELETED
|
@@ -1,427 +0,0 @@
|
|
| 1 |
-
# 2. Deploying an OpenEnv environment
|
| 2 |
-
|
| 3 |
-
This section covers deploying OpenEnv environments locally, on clusters, and on Hugging Face Spaces.
|
| 4 |
-
|
| 5 |
-
**Contents:**
|
| 6 |
-
- [Local Development with Uvicorn](#local-development-with-uvicorn)
|
| 7 |
-
- [Docker Deployment](#docker-deployment)
|
| 8 |
-
- [Hugging Face Spaces](#hugging-face-spaces)
|
| 9 |
-
- [Best Practices](#best-practices)
|
| 10 |
-
|
| 11 |
-
## HF Spaces are the infrastructure for OpenEnv environments
|
| 12 |
-
|
| 13 |
-
Every HF Space provides three things that OpenEnv environments need:
|
| 14 |
-
|
| 15 |
-
| Component | What it provides | How to access | Used as |
|
| 16 |
-
|-----------|------------------|---------------|-----------|
|
| 17 |
-
| **Server** | Running environment endpoint | `https://<username>-<space-name>.hf.space` | Agent and Public API |
|
| 18 |
-
| **Repository** | Installable Python package | `pip install git+https://huggingface.co/spaces/<username>/<space-name>` | Code and client |
|
| 19 |
-
| **Registry** | Docker container image | `docker pull registry.hf.space/<username>-<space-name>:latest` | Deployment |
|
| 20 |
-
|
| 21 |
-
This means a single Space deployment gives you all the components you need to use an environment in training.
|
| 22 |
-
|
| 23 |
-
### 1. Server: A running environment endpoint
|
| 24 |
-
|
| 25 |
-
When you deploy to HF Spaces, your environment runs as a server. The client connects via **WebSocket** (`/ws`) for a persistent session:
|
| 26 |
-
|
| 27 |
-
```python
|
| 28 |
-
from echo_env import EchoEnv, EchoAction
|
| 29 |
-
|
| 30 |
-
# Connect directly to the running Space (WebSocket under the hood)
|
| 31 |
-
# Async (recommended):
|
| 32 |
-
async with EchoEnv(base_url="https://openenv-echo-env.hf.space") as client:
|
| 33 |
-
result = await client.reset()
|
| 34 |
-
result = await client.step(EchoAction(message="Hello"))
|
| 35 |
-
|
| 36 |
-
# Sync (using .sync() wrapper):
|
| 37 |
-
with EchoEnv(base_url="https://openenv-echo-env.hf.space").sync() as client:
|
| 38 |
-
result = client.reset()
|
| 39 |
-
result = client.step(EchoAction(message="Hello"))
|
| 40 |
-
```
|
| 41 |
-
|
| 42 |
-
**Endpoints available:**
|
| 43 |
-
|
| 44 |
-
| Endpoint | Protocol | Description |
|
| 45 |
-
|----------|----------|-------------|
|
| 46 |
-
| `/ws` | **WebSocket** | Persistent session (used by client) |
|
| 47 |
-
| `/health` | HTTP GET | Health check |
|
| 48 |
-
| `/reset` | HTTP POST | Reset environment (stateless) |
|
| 49 |
-
| `/step` | HTTP POST | Execute action (stateless) |
|
| 50 |
-
| `/state` | HTTP GET | Get current state |
|
| 51 |
-
| `/docs` | HTTP GET | OpenAPI documentation |
|
| 52 |
-
| `/web` | HTTP GET | Interactive web UI |
|
| 53 |
-
|
| 54 |
-
> **Note:** The Python client uses the `/ws` WebSocket endpoint by default. HTTP endpoints are available for debugging or stateless use cases.
|
| 55 |
-
|
| 56 |
-
**Example: Check if a Space is running**
|
| 57 |
-
|
| 58 |
-
```bash
|
| 59 |
-
curl https://openenv-echo-env.hf.space/health
|
| 60 |
-
# {"status": "healthy"}
|
| 61 |
-
```
|
| 62 |
-
|
| 63 |
-
### 2. Repository: Installable Python package
|
| 64 |
-
|
| 65 |
-
Every Space is a Git repository. OpenEnv environments include a `pyproject.toml`, making them pip-installable directly from the Space URL.
|
| 66 |
-
|
| 67 |
-
```bash
|
| 68 |
-
# Install client package from Space
|
| 69 |
-
pip install git+https://huggingface.co/spaces/openenv/echo-env
|
| 70 |
-
```
|
| 71 |
-
|
| 72 |
-
This installs:
|
| 73 |
-
- **Client class** (`EchoEnv`) — Handles HTTP/WebSocket communication
|
| 74 |
-
- **Models** (`EchoAction`, `EchoObservation`) — Typed action and observation classes
|
| 75 |
-
- **Utilities** — Any helper functions the environment provides
|
| 76 |
-
|
| 77 |
-
**After installation:**
|
| 78 |
-
|
| 79 |
-
```python
|
| 80 |
-
from envs.echo_env import EchoEnv, EchoAction, EchoObservation
|
| 81 |
-
|
| 82 |
-
# Now you have typed classes for the environment
|
| 83 |
-
action = EchoAction(message="Hello")
|
| 84 |
-
```
|
| 85 |
-
|
| 86 |
-
### 3. Registry: Docker container image
|
| 87 |
-
|
| 88 |
-
Every Docker-based Space has a container registry. You can pull and run the environment locally.
|
| 89 |
-
|
| 90 |
-
```bash
|
| 91 |
-
# Pull the image
|
| 92 |
-
docker pull registry.hf.space/openenv-echo-env:latest
|
| 93 |
-
|
| 94 |
-
# Run locally on port 8001
|
| 95 |
-
docker run -d -p 8001:8000 registry.hf.space/openenv-echo-env:latest
|
| 96 |
-
```
|
| 97 |
-
|
| 98 |
-
**Find the registry URL for any Space:**
|
| 99 |
-
|
| 100 |
-
1. Go to the Space page (e.g., [openenv/echo-env](https://huggingface.co/spaces/openenv/echo-env))
|
| 101 |
-
2. Click **⋮** (three dots) → **"Run locally"**
|
| 102 |
-
3. Copy the `docker run` command
|
| 103 |
-
|
| 104 |
-
### Choosing an access method
|
| 105 |
-
|
| 106 |
-
| Method | Use when | Pros | Cons |
|
| 107 |
-
|--------|----------|------|------|
|
| 108 |
-
| **Server** | Quick testing, low volume | Zero setup | Network latency, rate limits |
|
| 109 |
-
| **Repository** | Need typed classes | Type safety, IDE support | Still need a server |
|
| 110 |
-
| **Docker** | Local dev, high throughput | Full control, no network | Requires Docker |
|
| 111 |
-
|
| 112 |
-
**Typical workflow:**
|
| 113 |
-
|
| 114 |
-
```python
|
| 115 |
-
import asyncio
|
| 116 |
-
from echo_env import EchoEnv, EchoAction
|
| 117 |
-
|
| 118 |
-
async def main():
|
| 119 |
-
# Development: connect to remote Space
|
| 120 |
-
async with EchoEnv(base_url="https://openenv-echo-env.hf.space") as client:
|
| 121 |
-
result = await client.reset()
|
| 122 |
-
|
| 123 |
-
# Production: run locally for speed
|
| 124 |
-
# docker run -d -p 8001:8000 registry.hf.space/openenv-echo-env:latest
|
| 125 |
-
async with EchoEnv(base_url="http://localhost:8001") as client:
|
| 126 |
-
result = await client.reset()
|
| 127 |
-
|
| 128 |
-
# Or let the client manage Docker for you
|
| 129 |
-
client = await EchoEnv.from_env("openenv/echo-env") # Auto-pulls and runs
|
| 130 |
-
async with client:
|
| 131 |
-
result = await client.reset()
|
| 132 |
-
|
| 133 |
-
asyncio.run(main())
|
| 134 |
-
|
| 135 |
-
# For sync usage, use the .sync() wrapper:
|
| 136 |
-
with EchoEnv(base_url="http://localhost:8001").sync() as client:
|
| 137 |
-
result = client.reset()
|
| 138 |
-
```
|
| 139 |
-
|
| 140 |
-
> **Reference:** [HF Spaces Documentation](https://huggingface.co/docs/hub/spaces) | [Environment Hub Collection](https://huggingface.co/collections/openenv/environment-hub)
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
## Local Development with Uvicorn
|
| 144 |
-
|
| 145 |
-
The fastest way to iterate on environment logic is running directly with Uvicorn.
|
| 146 |
-
|
| 147 |
-
## Clone and run the environment locally
|
| 148 |
-
|
| 149 |
-
```bash
|
| 150 |
-
# Clone from HF Space
|
| 151 |
-
git clone https://huggingface.co/spaces/burtenshaw/openenv-benchmark
|
| 152 |
-
cd openenv-benchmark
|
| 153 |
-
|
| 154 |
-
# Install in editable mode
|
| 155 |
-
uv sync
|
| 156 |
-
|
| 157 |
-
# Start server
|
| 158 |
-
uv run server
|
| 159 |
-
|
| 160 |
-
# Run isolated from remote Space
|
| 161 |
-
uv run --isolated --project https://huggingface.co/spaces/burtenshaw/openenv-benchmark server
|
| 162 |
-
```
|
| 163 |
-
|
| 164 |
-
## Uvicorn directly in python
|
| 165 |
-
|
| 166 |
-
```bash
|
| 167 |
-
# Full control over uvicorn options
|
| 168 |
-
uvicorn benchmark.server.app:app --host "$HOST" --port "$PORT" --workers "$WORKERS"
|
| 169 |
-
|
| 170 |
-
# With reload for development
|
| 171 |
-
uvicorn benchmark.server.app:app --host 0.0.0.0 --port 8000 --reload
|
| 172 |
-
|
| 173 |
-
# Multi-worker mode for better concurrency:
|
| 174 |
-
uvicorn benchmark.server.app:app --host 0.0.0.0 --port 8000 --workers 4
|
| 175 |
-
```
|
| 176 |
-
|
| 177 |
-
| Flag | Purpose |
|
| 178 |
-
|------|---------|
|
| 179 |
-
| `--reload` | Auto-restart on code changes |
|
| 180 |
-
| `--workers N` | Run N worker processes |
|
| 181 |
-
| `--log-level debug` | Verbose logging |
|
| 182 |
-
|
| 183 |
-
## Docker Deployment
|
| 184 |
-
|
| 185 |
-
Docker provides isolation and reproducibility for production use.
|
| 186 |
-
|
| 187 |
-
### Run the environment locally from the space
|
| 188 |
-
|
| 189 |
-
```bash
|
| 190 |
-
# Run the environment locally from the space
|
| 191 |
-
docker run -d -p 8000:8000 registry.hf.space/openenv-echo-env:latest
|
| 192 |
-
```
|
| 193 |
-
|
| 194 |
-
### Build Image
|
| 195 |
-
|
| 196 |
-
```bash
|
| 197 |
-
# Clone from HF Space
|
| 198 |
-
git clone https://huggingface.co/spaces/burtenshaw/openenv-benchmark
|
| 199 |
-
cd openenv-benchmark
|
| 200 |
-
|
| 201 |
-
# Using OpenEnv CLI (recommended)
|
| 202 |
-
openenv build -t openenv-benchmark:latest
|
| 203 |
-
|
| 204 |
-
# Or with Docker directly
|
| 205 |
-
docker build -t openenv-benchmark:latest -f server/Dockerfile .
|
| 206 |
-
```
|
| 207 |
-
|
| 208 |
-
### Run Container
|
| 209 |
-
|
| 210 |
-
```bash
|
| 211 |
-
# Basic run
|
| 212 |
-
docker run -d -p 8000:8000 my-env:latest
|
| 213 |
-
|
| 214 |
-
# With environment variables
|
| 215 |
-
docker run -d -p 8000:8000 \
|
| 216 |
-
-e WORKERS=4 \
|
| 217 |
-
-e MAX_CONCURRENT_ENVS=100 \
|
| 218 |
-
my-env:latest
|
| 219 |
-
|
| 220 |
-
# Named container for easy management
|
| 221 |
-
docker run -d --name my-env -p 8000:8000 my-env:latest
|
| 222 |
-
```
|
| 223 |
-
|
| 224 |
-
### Connect from Python
|
| 225 |
-
|
| 226 |
-
```python
|
| 227 |
-
import asyncio
|
| 228 |
-
from echo_env import EchoEnv, EchoAction
|
| 229 |
-
|
| 230 |
-
async def main():
|
| 231 |
-
# Async usage (recommended)
|
| 232 |
-
async with EchoEnv(base_url="http://localhost:8000") as client:
|
| 233 |
-
result = await client.reset()
|
| 234 |
-
result = await client.step(EchoAction(message="Hello"))
|
| 235 |
-
print(result.observation)
|
| 236 |
-
|
| 237 |
-
# From Docker image
|
| 238 |
-
client = await EchoEnv.from_docker_image("<local_docker_image>")
|
| 239 |
-
async with client:
|
| 240 |
-
result = await client.reset()
|
| 241 |
-
print(result.observation)
|
| 242 |
-
|
| 243 |
-
asyncio.run(main())
|
| 244 |
-
|
| 245 |
-
# Sync usage (using .sync() wrapper)
|
| 246 |
-
with EchoEnv(base_url="http://localhost:8000").sync() as client:
|
| 247 |
-
result = client.reset()
|
| 248 |
-
result = client.step(EchoAction(message="Hello"))
|
| 249 |
-
print(result.observation)
|
| 250 |
-
```
|
| 251 |
-
|
| 252 |
-
### Container Lifecycle
|
| 253 |
-
|
| 254 |
-
| Method | Container | WebSocket | On `close()` |
|
| 255 |
-
|--------|-----------|-----------|--------------|
|
| 256 |
-
| `from_hub(repo_id)` | Starts | Connects | Stops container |
|
| 257 |
-
| `from_hub(repo_id, use_docker=False)` | None (UV) | Connects | Stops UV server |
|
| 258 |
-
| `from_docker_image(image)` | Starts | Connects | Stops container |
|
| 259 |
-
| `MyEnv(base_url=...)` | None | Connects | Disconnects only |
|
| 260 |
-
|
| 261 |
-
### Find Docker Commands for Any Space
|
| 262 |
-
|
| 263 |
-
1. Open the Space on HuggingFace Hub
|
| 264 |
-
2. Click **⋮ (three dots)** menu
|
| 265 |
-
3. Select **"Run locally"**
|
| 266 |
-
4. Copy the provided `docker run` command
|
| 267 |
-
|
| 268 |
-
## Deploy with CLI
|
| 269 |
-
|
| 270 |
-
```bash
|
| 271 |
-
cd my_env
|
| 272 |
-
|
| 273 |
-
# Deploy to your namespace
|
| 274 |
-
openenv push
|
| 275 |
-
|
| 276 |
-
# Deploy to specific repo
|
| 277 |
-
openenv push --repo-id username/my-env
|
| 278 |
-
|
| 279 |
-
# Deploy as private
|
| 280 |
-
openenv push --repo-id username/my-env --private
|
| 281 |
-
```
|
| 282 |
-
|
| 283 |
-
### Space Configuration
|
| 284 |
-
|
| 285 |
-
The `openenv.yaml` manifest controls Space settings:
|
| 286 |
-
|
| 287 |
-
```yaml
|
| 288 |
-
# openenv.yaml
|
| 289 |
-
name: my_env
|
| 290 |
-
version: "1.0.0"
|
| 291 |
-
description: My custom environment
|
| 292 |
-
```
|
| 293 |
-
|
| 294 |
-
Hardware Options:
|
| 295 |
-
|
| 296 |
-
| Tier | vCPU | RAM | Cost |
|
| 297 |
-
|------|------|-----|------|
|
| 298 |
-
| CPU Basic (Free) | 2 | 16GB | Free |
|
| 299 |
-
| CPU Upgrade | 8 | 32GB | $0.03/hr |
|
| 300 |
-
|
| 301 |
-
OpenEnv environments support configuration via environment variables.
|
| 302 |
-
|
| 303 |
-
| Variable | Default | Description |
|
| 304 |
-
|----------|---------|-------------|
|
| 305 |
-
| `WORKERS` | 4 | Uvicorn worker processes |
|
| 306 |
-
| `PORT` | 8000 | Server port |
|
| 307 |
-
| `HOST` | 0.0.0.0 | Bind address |
|
| 308 |
-
| `MAX_CONCURRENT_ENVS` | 100 | Max WebSocket sessions |
|
| 309 |
-
| `ENABLE_WEB_INTERFACE` | Auto | Enable web UI |
|
| 310 |
-
|
| 311 |
-
### Environment-Specific Variables
|
| 312 |
-
|
| 313 |
-
Some environments have custom variables:
|
| 314 |
-
|
| 315 |
-
**TextArena:**
|
| 316 |
-
```bash
|
| 317 |
-
TEXTARENA_ENV_ID=Wordle-v0
|
| 318 |
-
TEXTARENA_NUM_PLAYERS=1
|
| 319 |
-
TEXTARENA_MAX_TURNS=6
|
| 320 |
-
```
|
| 321 |
-
|
| 322 |
-
**Coding Environment:**
|
| 323 |
-
```bash
|
| 324 |
-
SANDBOX_TIMEOUT=30
|
| 325 |
-
MAX_OUTPUT_LENGTH=10000
|
| 326 |
-
```
|
| 327 |
-
|
| 328 |
-
# DEMO: Deploying to Hugging Face Spaces
|
| 329 |
-
|
| 330 |
-
This demo walks through the full workflow: create an environment, test locally, deploy to HF Spaces, and use it.
|
| 331 |
-
|
| 332 |
-
## Step 1: Initialize a new environment
|
| 333 |
-
|
| 334 |
-
```bash
|
| 335 |
-
openenv init my_env
|
| 336 |
-
cd my_env
|
| 337 |
-
```
|
| 338 |
-
|
| 339 |
-
This creates the standard OpenEnv structure:
|
| 340 |
-
|
| 341 |
-
```
|
| 342 |
-
my_env/
|
| 343 |
-
├── server/
|
| 344 |
-
│ ├── app.py # FastAPI server
|
| 345 |
-
│ ├── environment.py # Your environment logic
|
| 346 |
-
│ └── Dockerfile
|
| 347 |
-
├── models.py # Action/Observation types
|
| 348 |
-
├── client.py # HTTP client
|
| 349 |
-
├── openenv.yaml # Manifest
|
| 350 |
-
└── pyproject.toml
|
| 351 |
-
```
|
| 352 |
-
|
| 353 |
-
## Step 2: Run locally
|
| 354 |
-
|
| 355 |
-
```bash
|
| 356 |
-
# Start the server
|
| 357 |
-
uv run server
|
| 358 |
-
|
| 359 |
-
# Or with uvicorn directly
|
| 360 |
-
uvicorn server.app:app --host 0.0.0.0 --port 8000 --reload
|
| 361 |
-
```
|
| 362 |
-
|
| 363 |
-
Test the health endpoint:
|
| 364 |
-
|
| 365 |
-
```bash
|
| 366 |
-
curl http://localhost:8000/health
|
| 367 |
-
# {"status": "healthy"}
|
| 368 |
-
```
|
| 369 |
-
|
| 370 |
-
## Step 3: Deploy to HF Spaces
|
| 371 |
-
|
| 372 |
-
```bash
|
| 373 |
-
openenv push --repo-id username/my-env
|
| 374 |
-
```
|
| 375 |
-
|
| 376 |
-
Your environment is now live at:
|
| 377 |
-
- Web UI: https://username-my-env.hf.space/web
|
| 378 |
-
- API Docs: https://username-my-env.hf.space/docs
|
| 379 |
-
- Health: https://username-my-env.hf.space/health
|
| 380 |
-
|
| 381 |
-
```bash
|
| 382 |
-
curl https://openenv-echo-env.hf.space/health
|
| 383 |
-
# {"status": "healthy"}
|
| 384 |
-
```
|
| 385 |
-
|
| 386 |
-
## Step 4: Install the environment
|
| 387 |
-
|
| 388 |
-
```bash
|
| 389 |
-
uv pip install git+https://huggingface.co/spaces/openenv/echo_env
|
| 390 |
-
```
|
| 391 |
-
|
| 392 |
-
## Step 5: Run locally via Docker (optional)
|
| 393 |
-
|
| 394 |
-
Pull and run the container from the HF registry, or open the [browser](https://huggingface.co/spaces/openenv/echo_env?docker=true):
|
| 395 |
-
|
| 396 |
-
```bash
|
| 397 |
-
# Pull from HF Spaces registry
|
| 398 |
-
docker pull registry.hf.space/openenv-echo-env:latest
|
| 399 |
-
|
| 400 |
-
# Run locally
|
| 401 |
-
docker run -it -p 8000:8000 --platform=linux/amd64 \
|
| 402 |
-
registry.hf.space/openenv-echo-env:latest
|
| 403 |
-
```
|
| 404 |
-
|
| 405 |
-
Now connect to your local instance:
|
| 406 |
-
|
| 407 |
-
```python
|
| 408 |
-
import asyncio
|
| 409 |
-
from echo_env import EchoEnv, EchoAction
|
| 410 |
-
|
| 411 |
-
# Async (recommended)
|
| 412 |
-
async def main():
|
| 413 |
-
async with EchoEnv(base_url="http://localhost:8000") as env:
|
| 414 |
-
result = await env.reset()
|
| 415 |
-
print(result.observation)
|
| 416 |
-
result = await env.step(EchoAction(message="Hello"))
|
| 417 |
-
print(result.observation)
|
| 418 |
-
|
| 419 |
-
asyncio.run(main())
|
| 420 |
-
|
| 421 |
-
# Sync (using .sync() wrapper)
|
| 422 |
-
with EchoEnv(base_url="http://localhost:8000").sync() as env:
|
| 423 |
-
result = env.reset()
|
| 424 |
-
print(result.observation)
|
| 425 |
-
result = env.step(EchoAction(message="Hello"))
|
| 426 |
-
print(result.observation)
|
| 427 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|