Spaces:

ceoavinash
/

codearena-rl

Sleeping

adityanaikhpt committed 30 days ago

Commit

9967cb5

1 Parent(s): f14f8d9

fix: openenv.yaml tasks graders + strict score bounds

Files changed (3) hide show

openenv.yaml CHANGED Viewed

@@ -1,17 +1,28 @@
-version: "1.0"
-environment:
-  name: "CodeArena"
-  description: "RL Benchmark for Autonomous Code Repair"
-capabilities:
-  - execute_code
-  - calculate_reward
-endpoints:
-  reset: "/reset"
-  step: "/step"
-  state: "/state"
-dependencies:
-  - python: 3.10
-metrics:
-  - compile_score
-  - test_pass_ratio
-  - efficiency_score

+name: codearena-rl-benchmark
+description: "RL Benchmark for Autonomous Code Repair — iterative debugging with execution feedback"
+version: "1.0.0"
+entrypoint: server.env:CodeArenaEnv
+runtime:
+  language: python
+  python_version: "3.11"
+api:
+  reset: /reset
+  step: /step
+  state: /state
+tasks:
+  - id: easy
+    path: tasks/easy.json
+    grader: server.grader:grade
+  - id: medium
+    path: tasks/medium.json
+    grader: server.grader:grade
+  - id: hard
+    path: tasks/hard.json
+    grader: server.grader:grade
+limits:
+  step_timeout_seconds: 2
+  max_runtime_minutes: 20

server/env.py CHANGED Viewed

@@ -43,6 +43,7 @@ class CodeArenaEnv:
         # Calculate Reward
         reward = safe_reward(calculate_reward(exec_result, self.current_task))
         # Update State
         self.previous_attempts.append(action.proposed_fix)

         # Calculate Reward
         reward = safe_reward(calculate_reward(exec_result, self.current_task))
+        reward = max(0.001, min(0.999, reward))
         # Update State
         self.previous_attempts.append(action.proposed_fix)

server/grader.py CHANGED Viewed

@@ -10,11 +10,7 @@ def safe_reward(reward) -> float:
         r = float(reward)
     except Exception:
         return 0.5
-    if r <= 0:
-        return 0.1
-    elif r >= 1:
-        return 0.9
-    return r
 def normalize_reward(passed: int, total: int) -> float:
@@ -25,11 +21,7 @@ def normalize_reward(passed: int, total: int) -> float:
     if total == 0:
         return 0.5
     reward = passed / total
-    if reward <= 0:
-        return 0.1
-    elif reward >= 1:
-        return 0.9
-    return float(reward)
 def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float:
@@ -39,3 +31,6 @@ def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float
     """
     reward = normalize_reward(exec_result.test_passed, exec_result.test_total)
     return safe_reward(reward)

         r = float(reward)
     except Exception:
         return 0.5
+    return max(0.001, min(0.999, r))
 def normalize_reward(passed: int, total: int) -> float:
     if total == 0:
         return 0.5
     reward = passed / total
+    return max(0.001, min(0.999, float(reward)))
 def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float:
     """
     reward = normalize_reward(exec_result.test_passed, exec_result.test_total)
     return safe_reward(reward)
+# Alias for OpenEnv grader
+grade = calculate_reward