adityanaikhpt committed on
Commit
646409d
·
1 Parent(s): 2d8d38c

fix: reset task_id parsing, grader tuple crash fallback, and inference score output

Browse files
Files changed (3) hide show
  1. inference.py +3 -2
  2. server/env.py +9 -6
  3. server/grader.py +9 -20
inference.py CHANGED
@@ -31,7 +31,7 @@ def run_task(task_id: str):
31
  except Exception as e:
32
  error_msg = str(e).replace("\n", " ").replace("\r", "")
33
  print(f"[STEP] step=1 action=reset_failed reward=0.01 done=true error={error_msg}")
34
- print(f"[END] success=false steps=1 rewards=0.01")
35
  return
36
 
37
  rewards = []
@@ -109,7 +109,8 @@ def run_task(task_id: str):
109
  success = any(r > 0.5 for r in rewards)
110
  success_str = "true" if success else "false"
111
  rewards_str = ",".join([f"{r:.2f}" for r in rewards])
112
- print(f"[END] success={success_str} steps={step} rewards={rewards_str}")
 
113
 
114
  def main():
115
  target_task = os.environ.get("CODEARENA_TASK")
 
31
  except Exception as e:
32
  error_msg = str(e).replace("\n", " ").replace("\r", "")
33
  print(f"[STEP] step=1 action=reset_failed reward=0.01 done=true error={error_msg}")
34
+ print(f"[END] success=false steps=1 score=0.01 rewards=0.01")
35
  return
36
 
37
  rewards = []
 
109
  success = any(r > 0.5 for r in rewards)
110
  success_str = "true" if success else "false"
111
  rewards_str = ",".join([f"{r:.2f}" for r in rewards])
112
+ score = max(0.001, min(0.999, (sum(rewards) / len(rewards)) if rewards else 0.5))
113
+ print(f"[END] success={success_str} steps={step} score={score:.2f} rewards={rewards_str}")
114
 
115
  def main():
116
  target_task = os.environ.get("CODEARENA_TASK")
server/env.py CHANGED
@@ -18,14 +18,17 @@ class CodeArenaEnv:
18
  self.step_count = 0
19
  self.max_steps = 5
20
 
21
- def reset(self) -> CodeArenaObservation:
22
- self.current_task = random.choice(self.tasks)
 
 
 
 
23
  self.previous_attempts = []
24
  self.last_error_log = ""
25
  self.last_test_results = ""
26
  self.is_done = False
27
  self.step_count = 0
28
-
29
  return self.state()
30
 
31
  def step(self, action: CodeArenaAction) -> tuple[CodeArenaObservation, float, bool, dict]:
@@ -83,9 +86,9 @@ async def lifespan(app: FastAPI):
83
  app = FastAPI(lifespan=lifespan, title="CodeArena RL Environment")
84
 
85
  @app.post("/reset")
86
- def api_reset():
87
- obs = _env.reset()
88
- # Returns 200 OK by default in FastAPI
89
  return {"message": "Environment reset successfully", "observation": obs.model_dump()}
90
 
91
  @app.post("/step")
 
18
  self.step_count = 0
19
  self.max_steps = 5
20
 
21
+ def reset(self, task_id: str = None) -> CodeArenaObservation:
22
+ if task_id:
23
+ matched = [t for t in self.tasks if t.task_id == task_id]
24
+ self.current_task = matched[0] if matched else random.choice(self.tasks)
25
+ else:
26
+ self.current_task = random.choice(self.tasks)
27
  self.previous_attempts = []
28
  self.last_error_log = ""
29
  self.last_test_results = ""
30
  self.is_done = False
31
  self.step_count = 0
 
32
  return self.state()
33
 
34
  def step(self, action: CodeArenaAction) -> tuple[CodeArenaObservation, float, bool, dict]:
 
86
  app = FastAPI(lifespan=lifespan, title="CodeArena RL Environment")
87
 
88
@app.post("/reset")
def api_reset(body: dict = None):
    """Reset the environment; an optional JSON body may carry {"task_id": ...}."""
    payload = body if body is not None else {}
    obs = _env.reset(task_id=payload.get("task_id"))
    return {"message": "Environment reset successfully", "observation": obs.model_dump()}
93
 
94
  @app.post("/step")
server/grader.py CHANGED
@@ -1,36 +1,25 @@
1
  from .models import ExecutionResult, TaskInfo
2
 
3
-
4
  def safe_reward(reward) -> float:
5
- """
6
- Final safety net: guarantees reward is strictly within (0, 1).
7
- Applied at every return point as a last-mile clamp.
8
- """
9
  try:
10
  r = float(reward)
11
  except Exception:
12
  return 0.5
13
- return max(0.001, min(0.999, float(reward)))
14
-
15
 
16
  def normalize_reward(passed: int, total: int) -> float:
17
- """
18
- Compute a reward strictly within the open interval (0, 1).
19
- Never returns exactly 0.0 or 1.0.
20
- """
21
  if total == 0:
22
  return 0.5
23
- reward = passed / total
24
- return max(0.001, min(0.999, float(reward)))
25
-
26
 
27
  def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float:
28
- """
29
- Single entry-point used by env.py and app.py.
30
- Delegates to normalize_reward, then applies safe_reward clamp.
31
- """
32
  reward = normalize_reward(exec_result.test_passed, exec_result.test_total)
33
  return safe_reward(reward)
34
 
35
- # Alias for OpenEnv grader
36
- grade = calculate_reward
 
 
 
 
 
 
1
  from .models import ExecutionResult, TaskInfo
2
 
 
3
def safe_reward(reward) -> float:
    """Coerce *reward* to a float kept strictly inside (0, 1).

    Anything that cannot be converted to float yields the neutral 0.5.
    """
    try:
        numeric = float(reward)
    except Exception:
        # Unconvertible input (None, arbitrary objects, bad strings).
        return 0.5
    else:
        # Last-mile clamp into [0.001, 0.999].
        return max(0.001, min(0.999, numeric))
 
9
 
10
def normalize_reward(passed: int, total: int) -> float:
    """Reward as the fraction of passing tests, kept strictly inside (0, 1).

    Args:
        passed: Number of tests that passed.
        total: Number of tests executed; 0 means nothing ran.

    Returns:
        passed/total clamped to [0.001, 0.999], or 0.5 when total == 0.
    """
    if not total:
        # Nothing executed: hand back a neutral mid-scale reward.
        return 0.5
    fraction = passed / total
    return min(0.999, max(0.001, fraction))
 
 
14
 
15
def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo) -> float:
    """Single grading entry point used by env.py and app.py.

    Args:
        exec_result: Execution outcome carrying test_passed / test_total.
        task_info: Task metadata (not used by the reward formula here).

    Returns:
        The pass ratio, clamped strictly inside (0, 1).
    """
    # Normalize the pass ratio, then apply the last-mile safety clamp.
    return safe_reward(normalize_reward(exec_result.test_passed, exec_result.test_total))
18
 
19
def grade(*args, **kwargs) -> float:
    """OpenEnv-compatible grading entry point.

    Accepts either positional (exec_result, task_info) or the equivalent
    keyword arguments. Any unexpected calling convention or internal
    failure degrades to a neutral 0.5 so grading never crashes a rollout.

    Returns:
        A reward strictly inside (0, 1); 0.5 on any failure.
    """
    try:
        if len(args) == 2:
            return calculate_reward(args[0], args[1])
        if "exec_result" in kwargs and "task_info" in kwargs:
            # Keyword-style callers take the same path as positional ones.
            return calculate_reward(kwargs["exec_result"], kwargs["task_info"])
        return 0.5
    except Exception:
        # Last-resort guard: a grader crash must not kill the episode.
        return 0.5