Spaces:

ceoavinash
/

codearena-rl

Sleeping

adityanaikhpt committed about 1 month ago

Commit

f14f8d9

1 Parent(s): 74bfde0

fix: add safe_reward() clamp at every reward return point

Files changed (3) hide show

server/app.py CHANGED Viewed

@@ -13,7 +13,7 @@ from pydantic import BaseModel
 from server.models import CodeArenaObservation, CodeArenaAction, TaskInfo
 from server.executor import run_code_with_tests
-from server.grader import calculate_reward
 from tasks import ALL_TASKS
@@ -140,7 +140,7 @@ def api_step(action: CodeArenaAction):
         obs, reward, done, info = _env.step(action)
         return {
             "observation": obs.model_dump(),
-            "reward": reward,
             "done": done,
             "info": info,
         }
@@ -155,7 +155,7 @@ def api_step(action: CodeArenaAction):
                 "test_results": "",
                 "previous_attempts": [],
             },
-            "reward": 0.1,
             "done": True,
             "info": {},
         }

 from server.models import CodeArenaObservation, CodeArenaAction, TaskInfo
 from server.executor import run_code_with_tests
+from server.grader import calculate_reward, safe_reward
 from tasks import ALL_TASKS
         obs, reward, done, info = _env.step(action)
         return {
             "observation": obs.model_dump(),
+            "reward": safe_reward(reward),
             "done": done,
             "info": info,
         }
                 "test_results": "",
                 "previous_attempts": [],
             },
+            "reward": safe_reward(0.1),
             "done": True,
             "info": {},
         }

server/env.py CHANGED Viewed

@@ -4,7 +4,7 @@ from contextlib import asynccontextmanager
 from .models import CodeArenaObservation, CodeArenaAction, TaskInfo
 from .executor import run_code_with_tests
-from .grader import calculate_reward
 from tasks import ALL_TASKS
 class CodeArenaEnv:
@@ -42,7 +42,7 @@ class CodeArenaEnv:
         )
         # Calculate Reward
-        reward = calculate_reward(exec_result, self.current_task)
         # Update State
         self.previous_attempts.append(action.proposed_fix)

 from .models import CodeArenaObservation, CodeArenaAction, TaskInfo
 from .executor import run_code_with_tests
+from .grader import calculate_reward, safe_reward
 from tasks import ALL_TASKS
 class CodeArenaEnv:
         )
         # Calculate Reward
+        reward = safe_reward(calculate_reward(exec_result, self.current_task))
         # Update State
         self.previous_attempts.append(action.proposed_fix)

server/grader.py CHANGED Viewed

@@ -1,6 +1,22 @@
 from .models import ExecutionResult, TaskInfo
 def normalize_reward(passed: int, total: int) -> float:
     """
     Compute a reward strictly within the open interval (0, 1).
@@ -19,7 +35,7 @@ def normalize_reward(passed: int, total: int) -> float:
 def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float:
     """
     Single entry-point used by env.py and app.py.
-    Delegates entirely to normalize_reward so every task
-    always produces a score in (0, 1).
     """
-    return normalize_reward(exec_result.test_passed, exec_result.test_total)

 from .models import ExecutionResult, TaskInfo
def safe_reward(reward) -> float:
    """
    Final safety net: guarantee the reward is strictly within (0, 1).

    Applied at every return point as a last-mile clamp.

    Args:
        reward: Any value; it is coerced with ``float()``.

    Returns:
        The coerced value when it already lies strictly between 0 and 1;
        0.1 when it is <= 0 (including -inf); 0.9 when it is >= 1
        (including +inf); 0.5 when the value cannot be converted to a
        float or is NaN.
    """
    try:
        r = float(reward)
    except Exception:
        # Deliberately broad: this is a best-effort clamp, so any failure to
        # coerce (TypeError, ValueError, OverflowError, or an error raised by
        # a custom __float__) falls back to the neutral mid-range value.
        return 0.5
    if 0 < r < 1:
        return r
    if r <= 0:
        return 0.1
    if r >= 1:
        return 0.9
    # Only NaN reaches this point: every ordered comparison with NaN is
    # False, so it would otherwise slip through the clamp unchanged.
    return 0.5
 def normalize_reward(passed: int, total: int) -> float:
     """
     Compute a reward strictly within the open interval (0, 1).
 def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float:
     """
     Single entry-point used by env.py and app.py.
+    Delegates to normalize_reward, then applies safe_reward clamp.
     """
+    reward = normalize_reward(exec_result.test_passed, exec_result.test_total)
+    return safe_reward(reward)