adityanaikhpt committed on
Commit
9967cb5
·
1 Parent(s): f14f8d9

fix: openenv.yaml tasks graders + strict score bounds

Browse files
Files changed (3) hide show
  1. openenv.yaml +28 -17
  2. server/env.py +1 -0
  3. server/grader.py +5 -10
openenv.yaml CHANGED
@@ -1,17 +1,28 @@
1
- version: "1.0"
2
- environment:
3
- name: "CodeArena"
4
- description: "RL Benchmark for Autonomous Code Repair"
5
- capabilities:
6
- - execute_code
7
- - calculate_reward
8
- endpoints:
9
- reset: "/reset"
10
- step: "/step"
11
- state: "/state"
12
- dependencies:
13
- - python: 3.10
14
- metrics:
15
- - compile_score
16
- - test_pass_ratio
17
- - efficiency_score
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: codearena-rl-benchmark
2
+ description: "RL Benchmark for Autonomous Code Repair — iterative debugging with execution feedback"
3
+ version: "1.0.0"
4
+ entrypoint: server.env:CodeArenaEnv
5
+
6
+ runtime:
7
+ language: python
8
+ python_version: "3.11"
9
+
10
+ api:
11
+ reset: /reset
12
+ step: /step
13
+ state: /state
14
+
15
+ tasks:
16
+ - id: easy
17
+ path: tasks/easy.json
18
+ grader: server.grader:grade
19
+ - id: medium
20
+ path: tasks/medium.json
21
+ grader: server.grader:grade
22
+ - id: hard
23
+ path: tasks/hard.json
24
+ grader: server.grader:grade
25
+
26
+ limits:
27
+ step_timeout_seconds: 2
28
+ max_runtime_minutes: 20
server/env.py CHANGED
@@ -43,6 +43,7 @@ class CodeArenaEnv:
43
 
44
  # Calculate Reward
45
  reward = safe_reward(calculate_reward(exec_result, self.current_task))
 
46
 
47
  # Update State
48
  self.previous_attempts.append(action.proposed_fix)
 
43
 
44
  # Calculate Reward
45
  reward = safe_reward(calculate_reward(exec_result, self.current_task))
46
+ reward = max(0.001, min(0.999, reward))
47
 
48
  # Update State
49
  self.previous_attempts.append(action.proposed_fix)
server/grader.py CHANGED
@@ -10,11 +10,7 @@ def safe_reward(reward) -> float:
10
  r = float(reward)
11
  except Exception:
12
  return 0.5
13
- if r <= 0:
14
- return 0.1
15
- elif r >= 1:
16
- return 0.9
17
- return r
18
 
19
 
20
  def normalize_reward(passed: int, total: int) -> float:
@@ -25,11 +21,7 @@ def normalize_reward(passed: int, total: int) -> float:
25
  if total == 0:
26
  return 0.5
27
  reward = passed / total
28
- if reward <= 0:
29
- return 0.1
30
- elif reward >= 1:
31
- return 0.9
32
- return float(reward)
33
 
34
 
35
  def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float:
@@ -39,3 +31,6 @@ def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float
39
  """
40
  reward = normalize_reward(exec_result.test_passed, exec_result.test_total)
41
  return safe_reward(reward)
 
 
 
 
10
  r = float(reward)
11
  except Exception:
12
  return 0.5
13
+ return max(0.001, min(0.999, r))
 
 
 
 
14
 
15
 
16
  def normalize_reward(passed: int, total: int) -> float:
 
21
  if total == 0:
22
  return 0.5
23
  reward = passed / total
24
+ return max(0.001, min(0.999, float(reward)))
 
 
 
 
25
 
26
 
27
  def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float:
 
31
  """
32
  reward = normalize_reward(exec_result.test_passed, exec_result.test_total)
33
  return safe_reward(reward)
34
+
35
+ # Alias for OpenEnv grader
36
+ grade = calculate_reward