adityanaikhpt committed on
Commit
f14f8d9
·
1 Parent(s): 74bfde0

fix: add safe_reward() clamp at every reward return point

Browse files
Files changed (3) hide show
  1. server/app.py +3 -3
  2. server/env.py +2 -2
  3. server/grader.py +19 -3
server/app.py CHANGED
@@ -13,7 +13,7 @@ from pydantic import BaseModel
13
 
14
  from server.models import CodeArenaObservation, CodeArenaAction, TaskInfo
15
  from server.executor import run_code_with_tests
16
- from server.grader import calculate_reward
17
  from tasks import ALL_TASKS
18
 
19
 
@@ -140,7 +140,7 @@ def api_step(action: CodeArenaAction):
140
  obs, reward, done, info = _env.step(action)
141
  return {
142
  "observation": obs.model_dump(),
143
- "reward": reward,
144
  "done": done,
145
  "info": info,
146
  }
@@ -155,7 +155,7 @@ def api_step(action: CodeArenaAction):
155
  "test_results": "",
156
  "previous_attempts": [],
157
  },
158
- "reward": 0.1,
159
  "done": True,
160
  "info": {},
161
  }
 
13
 
14
  from server.models import CodeArenaObservation, CodeArenaAction, TaskInfo
15
  from server.executor import run_code_with_tests
16
+ from server.grader import calculate_reward, safe_reward
17
  from tasks import ALL_TASKS
18
 
19
 
 
140
  obs, reward, done, info = _env.step(action)
141
  return {
142
  "observation": obs.model_dump(),
143
+ "reward": safe_reward(reward),
144
  "done": done,
145
  "info": info,
146
  }
 
155
  "test_results": "",
156
  "previous_attempts": [],
157
  },
158
+ "reward": safe_reward(0.1),
159
  "done": True,
160
  "info": {},
161
  }
server/env.py CHANGED
@@ -4,7 +4,7 @@ from contextlib import asynccontextmanager
4
 
5
  from .models import CodeArenaObservation, CodeArenaAction, TaskInfo
6
  from .executor import run_code_with_tests
7
- from .grader import calculate_reward
8
  from tasks import ALL_TASKS
9
 
10
  class CodeArenaEnv:
@@ -42,7 +42,7 @@ class CodeArenaEnv:
42
  )
43
 
44
  # Calculate Reward
45
- reward = calculate_reward(exec_result, self.current_task)
46
 
47
  # Update State
48
  self.previous_attempts.append(action.proposed_fix)
 
4
 
5
  from .models import CodeArenaObservation, CodeArenaAction, TaskInfo
6
  from .executor import run_code_with_tests
7
+ from .grader import calculate_reward, safe_reward
8
  from tasks import ALL_TASKS
9
 
10
  class CodeArenaEnv:
 
42
  )
43
 
44
  # Calculate Reward
45
+ reward = safe_reward(calculate_reward(exec_result, self.current_task))
46
 
47
  # Update State
48
  self.previous_attempts.append(action.proposed_fix)
server/grader.py CHANGED
@@ -1,6 +1,22 @@
1
  from .models import ExecutionResult, TaskInfo
2
 
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  def normalize_reward(passed: int, total: int) -> float:
5
  """
6
  Compute a reward strictly within the open interval (0, 1).
@@ -19,7 +35,7 @@ def normalize_reward(passed: int, total: int) -> float:
19
  def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float:
20
  """
21
  Single entry-point used by env.py and app.py.
22
- Delegates entirely to normalize_reward so every task
23
- always produces a score in (0, 1).
24
  """
25
- return normalize_reward(exec_result.test_passed, exec_result.test_total)
 
 
1
  from .models import ExecutionResult, TaskInfo
2
 
3
 
4
def safe_reward(reward) -> float:
    """
    Final safety net: guarantees reward is strictly within (0, 1).
    Applied at every return point as a last-mile clamp.

    Non-numeric and NaN inputs fall back to a neutral 0.5 rather than
    propagating an invalid score to the caller.
    """
    # Local import keeps this clamp self-contained; math is stdlib.
    import math

    try:
        r = float(reward)
    except (TypeError, ValueError):
        # Anything float() cannot coerce (None, strings, objects) gets
        # a neutral mid-range score instead of raising.
        return 0.5
    if math.isnan(r):
        # NaN compares False against both bounds below and would leak
        # through the clamp unchanged — catch it explicitly.
        return 0.5
    if r <= 0:
        return 0.1
    if r >= 1:
        return 0.9
    return r
18
+
19
+
20
  def normalize_reward(passed: int, total: int) -> float:
21
  """
22
  Compute a reward strictly within the open interval (0, 1).
 
35
def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float:
    """
    Single entry-point used by env.py and app.py.

    Computes the base score from the test pass counts via
    normalize_reward, then runs the result through the safe_reward
    clamp before handing it back to the caller.
    """
    return safe_reward(
        normalize_reward(exec_result.test_passed, exec_result.test_total)
    )