Spaces:
Sleeping
Sleeping
Commit ·
9967cb5
1
Parent(s): f14f8d9
fix: openenv.yaml tasks graders + strict score bounds
Browse files
- openenv.yaml +28 -17
- server/env.py +1 -0
- server/grader.py +5 -10
openenv.yaml
CHANGED
|
@@ -1,17 +1,28 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: codearena-rl-benchmark
|
| 2 |
+
description: "RL Benchmark for Autonomous Code Repair — iterative debugging with execution feedback"
|
| 3 |
+
version: "1.0.0"
|
| 4 |
+
entrypoint: server.env:CodeArenaEnv
|
| 5 |
+
|
| 6 |
+
runtime:
|
| 7 |
+
language: python
|
| 8 |
+
python_version: "3.11"
|
| 9 |
+
|
| 10 |
+
api:
|
| 11 |
+
reset: /reset
|
| 12 |
+
step: /step
|
| 13 |
+
state: /state
|
| 14 |
+
|
| 15 |
+
tasks:
|
| 16 |
+
- id: easy
|
| 17 |
+
path: tasks/easy.json
|
| 18 |
+
grader: server.grader:grade
|
| 19 |
+
- id: medium
|
| 20 |
+
path: tasks/medium.json
|
| 21 |
+
grader: server.grader:grade
|
| 22 |
+
- id: hard
|
| 23 |
+
path: tasks/hard.json
|
| 24 |
+
grader: server.grader:grade
|
| 25 |
+
|
| 26 |
+
limits:
|
| 27 |
+
step_timeout_seconds: 2
|
| 28 |
+
max_runtime_minutes: 20
|
server/env.py
CHANGED
|
@@ -43,6 +43,7 @@ class CodeArenaEnv:
|
|
| 43 |
|
| 44 |
# Calculate Reward
|
| 45 |
reward = safe_reward(calculate_reward(exec_result, self.current_task))
|
|
|
|
| 46 |
|
| 47 |
# Update State
|
| 48 |
self.previous_attempts.append(action.proposed_fix)
|
|
|
|
| 43 |
|
| 44 |
# Calculate Reward
|
| 45 |
reward = safe_reward(calculate_reward(exec_result, self.current_task))
|
| 46 |
+
reward = max(0.001, min(0.999, reward))
|
| 47 |
|
| 48 |
# Update State
|
| 49 |
self.previous_attempts.append(action.proposed_fix)
|
server/grader.py
CHANGED
|
@@ -10,11 +10,7 @@ def safe_reward(reward) -> float:
|
|
| 10 |
r = float(reward)
|
| 11 |
except Exception:
|
| 12 |
return 0.5
|
| 13 |
-
|
| 14 |
-
return 0.1
|
| 15 |
-
elif r >= 1:
|
| 16 |
-
return 0.9
|
| 17 |
-
return r
|
| 18 |
|
| 19 |
|
| 20 |
def normalize_reward(passed: int, total: int) -> float:
|
|
@@ -25,11 +21,7 @@ def normalize_reward(passed: int, total: int) -> float:
|
|
| 25 |
if total == 0:
|
| 26 |
return 0.5
|
| 27 |
reward = passed / total
|
| 28 |
-
|
| 29 |
-
return 0.1
|
| 30 |
-
elif reward >= 1:
|
| 31 |
-
return 0.9
|
| 32 |
-
return float(reward)
|
| 33 |
|
| 34 |
|
| 35 |
def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float:
|
|
@@ -39,3 +31,6 @@ def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float
|
|
| 39 |
"""
|
| 40 |
reward = normalize_reward(exec_result.test_passed, exec_result.test_total)
|
| 41 |
return safe_reward(reward)
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
r = float(reward)
|
| 11 |
except Exception:
|
| 12 |
return 0.5
|
| 13 |
+
return max(0.001, min(0.999, r))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
def normalize_reward(passed: int, total: int) -> float:
|
|
|
|
| 21 |
if total == 0:
|
| 22 |
return 0.5
|
| 23 |
reward = passed / total
|
| 24 |
+
return max(0.001, min(0.999, float(reward)))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float:
|
|
|
|
| 31 |
"""
|
| 32 |
reward = normalize_reward(exec_result.test_passed, exec_result.test_total)
|
| 33 |
return safe_reward(reward)
|
| 34 |
+
|
| 35 |
+
# Alias for OpenEnv grader
|
| 36 |
+
grade = calculate_reward
|