Spaces:
Sleeping
Sleeping
Commit ·
646409d
1
Parent(s): 2d8d38c
fix: reset task_id parsing, grader tuple crash fallback, and inference score output
Browse files
- inference.py +3 -2
- server/env.py +9 -6
- server/grader.py +9 -20
inference.py
CHANGED
|
@@ -31,7 +31,7 @@ def run_task(task_id: str):
|
|
| 31 |
except Exception as e:
|
| 32 |
error_msg = str(e).replace("\n", " ").replace("\r", "")
|
| 33 |
print(f"[STEP] step=1 action=reset_failed reward=0.01 done=true error={error_msg}")
|
| 34 |
-
print(f"[END] success=false steps=1 rewards=0.01")
|
| 35 |
return
|
| 36 |
|
| 37 |
rewards = []
|
|
@@ -109,7 +109,8 @@ def run_task(task_id: str):
|
|
| 109 |
success = any(r > 0.5 for r in rewards)
|
| 110 |
success_str = "true" if success else "false"
|
| 111 |
rewards_str = ",".join([f"{r:.2f}" for r in rewards])
|
| 112 |
-
|
|
|
|
| 113 |
|
| 114 |
def main():
|
| 115 |
target_task = os.environ.get("CODEARENA_TASK")
|
|
|
|
| 31 |
except Exception as e:
|
| 32 |
error_msg = str(e).replace("\n", " ").replace("\r", "")
|
| 33 |
print(f"[STEP] step=1 action=reset_failed reward=0.01 done=true error={error_msg}")
|
| 34 |
+
print(f"[END] success=false steps=1 score=0.01 rewards=0.01")
|
| 35 |
return
|
| 36 |
|
| 37 |
rewards = []
|
|
|
|
| 109 |
success = any(r > 0.5 for r in rewards)
|
| 110 |
success_str = "true" if success else "false"
|
| 111 |
rewards_str = ",".join([f"{r:.2f}" for r in rewards])
|
| 112 |
+
score = max(0.001, min(0.999, (sum(rewards) / len(rewards)) if rewards else 0.5))
|
| 113 |
+
print(f"[END] success={success_str} steps={step} score={score:.2f} rewards={rewards_str}")
|
| 114 |
|
| 115 |
def main():
|
| 116 |
target_task = os.environ.get("CODEARENA_TASK")
|
server/env.py
CHANGED
|
@@ -18,14 +18,17 @@ class CodeArenaEnv:
|
|
| 18 |
self.step_count = 0
|
| 19 |
self.max_steps = 5
|
| 20 |
|
| 21 |
-
def reset(self) -> CodeArenaObservation:
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
self.previous_attempts = []
|
| 24 |
self.last_error_log = ""
|
| 25 |
self.last_test_results = ""
|
| 26 |
self.is_done = False
|
| 27 |
self.step_count = 0
|
| 28 |
-
|
| 29 |
return self.state()
|
| 30 |
|
| 31 |
def step(self, action: CodeArenaAction) -> tuple[CodeArenaObservation, float, bool, dict]:
|
|
@@ -83,9 +86,9 @@ async def lifespan(app: FastAPI):
|
|
| 83 |
app = FastAPI(lifespan=lifespan, title="CodeArena RL Environment")
|
| 84 |
|
| 85 |
@app.post("/reset")
|
| 86 |
-
def api_reset():
|
| 87 |
-
|
| 88 |
-
|
| 89 |
return {"message": "Environment reset successfully", "observation": obs.model_dump()}
|
| 90 |
|
| 91 |
@app.post("/step")
|
|
|
|
| 18 |
self.step_count = 0
|
| 19 |
self.max_steps = 5
|
| 20 |
|
| 21 |
+
def reset(self, task_id: str = None) -> CodeArenaObservation:
    """Select a task, clear all per-episode state, and return the first observation.

    Args:
        task_id: Identifier of the task to load. When it is falsy, or when
            no entry in ``self.tasks`` has a matching ``task_id``, a task
            is drawn with ``random.choice`` instead.

    Returns:
        Whatever ``self.state()`` returns after the reset.
    """
    # Stop at the first match instead of materialising every match.
    requested = None
    if task_id:
        requested = next((t for t in self.tasks if t.task_id == task_id), None)
    # Unknown or missing ids deliberately fall back to a random task.
    self.current_task = requested if requested is not None else random.choice(self.tasks)

    self.previous_attempts = []
    self.last_error_log = ""
    self.last_test_results = ""
    self.is_done = False
    self.step_count = 0
    return self.state()
|
| 33 |
|
| 34 |
def step(self, action: CodeArenaAction) -> tuple[CodeArenaObservation, float, bool, dict]:
|
|
|
|
| 86 |
app = FastAPI(lifespan=lifespan, title="CodeArena RL Environment")
|
| 87 |
|
| 88 |
@app.post("/reset")
def api_reset(body: dict = None):
    """Reset the shared environment and return its first observation.

    Args:
        body: Optional mapping; only its ``"task_id"`` key is read.
            Presumably the parsed JSON request body - confirm against
            the client.

    Returns:
        A dict with a fixed ``"message"`` and the ``"observation"``
        produced by ``obs.model_dump()``.
    """
    # A missing or empty body means "no specific task requested".
    payload = body if body else {}
    observation = _env.reset(task_id=payload.get("task_id"))
    return {
        "message": "Environment reset successfully",
        "observation": observation.model_dump(),
    }
|
| 93 |
|
| 94 |
@app.post("/step")
|
server/grader.py
CHANGED
|
@@ -1,36 +1,25 @@
|
|
| 1 |
from .models import ExecutionResult, TaskInfo
|
| 2 |
|
| 3 |
-
|
| 4 |
def safe_reward(reward) -> float:
|
| 5 |
-
"""
|
| 6 |
-
Final safety net: guarantees reward is strictly within (0, 1).
|
| 7 |
-
Applied at every return point as a last-mile clamp.
|
| 8 |
-
"""
|
| 9 |
try:
|
| 10 |
r = float(reward)
|
| 11 |
except Exception:
|
| 12 |
return 0.5
|
| 13 |
-
return max(0.001, min(0.999,
|
| 14 |
-
|
| 15 |
|
| 16 |
def normalize_reward(passed: int, total: int) -> float:
|
| 17 |
-
"""
|
| 18 |
-
Compute a reward strictly within the open interval (0, 1).
|
| 19 |
-
Never returns exactly 0.0 or 1.0.
|
| 20 |
-
"""
|
| 21 |
if total == 0:
|
| 22 |
return 0.5
|
| 23 |
-
|
| 24 |
-
return max(0.001, min(0.999, float(reward)))
|
| 25 |
-
|
| 26 |
|
| 27 |
def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float:
|
| 28 |
-
"""
|
| 29 |
-
Single entry-point used by env.py and app.py.
|
| 30 |
-
Delegates to normalize_reward, then applies safe_reward clamp.
|
| 31 |
-
"""
|
| 32 |
reward = normalize_reward(exec_result.test_passed, exec_result.test_total)
|
| 33 |
return safe_reward(reward)
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from .models import ExecutionResult, TaskInfo
|
| 2 |
|
|
|
|
| 3 |
def safe_reward(reward) -> float:
    """Coerce *reward* to a float clamped to the range 0.001..0.999.

    Args:
        reward: Any object; it is converted with ``float()``.

    Returns:
        The clamped value, or the neutral fallback 0.5 when *reward*
        cannot be converted or converts to NaN.
    """
    try:
        value = float(reward)
    except Exception:
        # Deliberate catch-all: a malformed reward (tuple, None, ...)
        # must never crash grading.
        return 0.5
    # NaN is the only float not equal to itself. It fails every ordering
    # comparison, so the min()/max() clamp below would turn it into 0.999.
    if value != value:
        return 0.5
    return max(0.001, min(0.999, value))
|
|
|
|
| 9 |
|
| 10 |
def normalize_reward(passed: int, total: int) -> float:
    """Turn a passed/total test count into a reward between 0.001 and 0.999.

    Args:
        passed: Number of tests that passed.
        total: Number of tests that were run.

    Returns:
        ``passed / total`` clamped to 0.001..0.999, or the neutral value
        0.5 when *total* is zero or negative (no meaningful ratio exists).
    """
    # A non-positive total cannot yield a valid pass ratio; treat it the
    # same as "no tests" rather than clamping a meaningless quotient.
    if total <= 0:
        return 0.5
    return max(0.001, min(0.999, passed / total))
|
|
|
|
|
|
|
| 14 |
|
| 15 |
def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float:
    """Compute the reward for one execution result.

    Args:
        exec_result: Supplies ``test_passed`` and ``test_total``.
        task_info: Accepted but not read by this function.

    Returns:
        The output of ``normalize_reward`` passed through ``safe_reward``.
    """
    return safe_reward(
        normalize_reward(exec_result.test_passed, exec_result.test_total)
    )
|
| 18 |
|
| 19 |
+
def grade(*args, **kwargs) -> float:
    """Crash-proof wrapper around ``calculate_reward``.

    Accepted call shapes:
        * ``grade(exec_result, task_info)`` - two positional arguments.
        * ``grade(exec_result=..., task_info=...)`` - keyword form;
          ``task_info`` may be omitted.

    Returns:
        The value from ``calculate_reward``, or the neutral fallback 0.5
        for any other call shape or when the computation raises.
    """
    try:
        if len(args) == 2:
            return calculate_reward(args[0], args[1])
        # Keyword callers were previously ignored and always got 0.5.
        if not args and "exec_result" in kwargs:
            return calculate_reward(kwargs["exec_result"], kwargs.get("task_info"))
        return 0.5
    except Exception:
        # Deliberate catch-all: grading must never raise to the caller.
        return 0.5
|