ajaxwin committed on
Commit
7f7bcc6
·
1 Parent(s): 5235476

refactor: fix trivial grader bug

Browse files
agents/task1.py CHANGED
@@ -34,20 +34,18 @@ def oracle_t1(env: Task1Environment, seed: int, verbose: bool = False) -> Dict[s
34
  if fn and fn.get("vulnerable"):
35
  vuln_issue = fn["vulnerability_details"]["issue"]
36
  break
 
37
  if verbose:
38
  print(f" {obs.contract_name}.{fn_name}() [{vuln_issue}]")
 
39
  env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
40
- env.step(Action(action_type=ActionType.GET_FUNCTION_CODE,
41
- params={"function_name": fn_name}))
42
- result = env.step(Action(action_type=ActionType.SUBMIT,
43
- params={"function_name": fn_name,
44
- "vulnerability_type": vuln_issue}))
45
- v = result.reward.value
46
- score = 1.0 if v >= 4.9 else (0.5 if v >= 0.9 else 0.0)
47
- return {"seed": seed, "contract": obs.contract_name, "target_function": fn_name,
48
- "vulnerability": vuln_issue, "grader_score": score,
49
- "cumulative_reward": result.observation.cumulative_reward}
50
 
 
 
51
 
52
  def partial_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
53
  """Correct function, 'unknown' vuln type → score = 0.5."""
@@ -55,9 +53,7 @@ def partial_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
55
  fn_name = env.state().target_function
56
  result = env.step(Action(action_type=ActionType.SUBMIT,
57
  params={"function_name": fn_name, "vulnerability_type": "unknown"}))
58
- v = result.reward.value
59
- return {"seed": seed, "grader_score": 0.5 if v >= 0.9 else 0.0,
60
- "cumulative_reward": result.observation.cumulative_reward}
61
 
62
 
63
  def random_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
@@ -96,18 +92,14 @@ def random_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
96
  result = env.step(Action(action_type=ActionType.SUBMIT,
97
  params={"function_name": random_fn,
98
  "vulnerability_type": random_vuln}))
99
- v = result.reward.value
100
- score = 1.0 if v >= 4.9 else (0.5 if v >= 0.9 else 0.0)
101
- return {"seed": seed, "grader_score": score, "submitted_fn": random_fn,
102
- "submitted_vuln": random_vuln,
103
- "cumulative_reward": result.observation.cumulative_reward}
104
 
105
 
106
  def floor_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
107
- """Always submits 'constructor' → guaranteed score = 0.0."""
108
  env.reset(seed=seed)
109
  result = env.step(Action(action_type=ActionType.SUBMIT,
110
  params={"function_name": "constructor",
111
  "vulnerability_type": "reentrancy"}))
112
- return {"seed": seed, "grader_score": 0.0,
113
- "cumulative_reward": result.observation.cumulative_reward}
 
34
  if fn and fn.get("vulnerable"):
35
  vuln_issue = fn["vulnerability_details"]["issue"]
36
  break
37
+
38
  if verbose:
39
  print(f" {obs.contract_name}.{fn_name}() [{vuln_issue}]")
40
+
41
  env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
42
+ env.step(Action(action_type=ActionType.GET_FUNCTION_CODE, params={"function_name": fn_name}))
43
+
44
+ result = env.step(Action(action_type=ActionType.SUBMIT, params={"function_name": fn_name,
45
+ "vulnerability_type": vuln_issue}))
 
 
 
 
 
 
46
 
47
+ return {"seed": seed, "contract": obs.contract_name, "target_function": fn_name,
48
+ "vulnerability": vuln_issue, "grader_score": result.reward.value}
49
 
50
  def partial_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
51
  """Correct function, 'unknown' vuln type → score = 0.5."""
 
53
  fn_name = env.state().target_function
54
  result = env.step(Action(action_type=ActionType.SUBMIT,
55
  params={"function_name": fn_name, "vulnerability_type": "unknown"}))
56
+ return {"seed": seed, "grader_score": result.reward.value}
 
 
57
 
58
 
59
  def random_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
 
92
  result = env.step(Action(action_type=ActionType.SUBMIT,
93
  params={"function_name": random_fn,
94
  "vulnerability_type": random_vuln}))
95
+ return {"seed": seed, "grader_score": result.reward.value, "submitted_fn":
96
+ random_fn, "submitted_vuln": random_vuln}
 
 
 
97
 
98
 
99
  def floor_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
100
+ """Always submits 'constructor' → guaranteed score = 0.0001"""
101
  env.reset(seed=seed)
102
  result = env.step(Action(action_type=ActionType.SUBMIT,
103
  params={"function_name": "constructor",
104
  "vulnerability_type": "reentrancy"}))
105
+ return {"seed": seed, "grader_score": result.reward.value }
 
eval.py CHANGED
@@ -50,18 +50,18 @@ def run_task1_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
50
  env = Task1Environment()
51
 
52
  # Oracle
53
- print("▶ Oracle (correct function + correct vuln 1.0):")
54
  oracle_eps = []
55
  for i in range(n):
56
  ep = oracle_t1(env, seed_offset + i, verbose)
57
  oracle_eps.append(ep)
58
  print(f" seed={ep['seed']:3d} {ep['contract']:12s}.{ep['target_function']:18s}"
59
- f" score={ep['grader_score']:.1f} reward={ep['cumulative_reward']:+.2f}")
60
  oracle_avg = _avg(oracle_eps)
61
  print(f"\n Oracle avg: {oracle_avg:.3f}")
62
 
63
  # Partial
64
- print("\n▶ Partial (correct function, 'unknown' vuln 0.5):")
65
  partial_eps = [partial_t1(env, seed_offset + i) for i in range(n)]
66
  partial_avg = _avg(partial_eps)
67
  print(f" Partial avg: {partial_avg:.3f}")
@@ -88,12 +88,12 @@ def run_task1_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
88
  for v in sorted(vuln_seen):
89
  print(f" {vuln_seen[v]:2d}× {v}")
90
 
91
- # assert oracle_avg == 1.0, f"Oracle avg {oracle_avg:.3f} should be 1.0"
92
- # assert partial_avg == 0.5, f"Partial avg {partial_avg:.3f} should be 0.5"
93
- # assert floor_avg == 0.0, f"Floor avg {floor_avg:.3f} should be 0.0"
94
- # assert oracle_avg >= random_avg >= floor_avg, \
95
- # f"Score ordering violated: oracle={oracle_avg}, random={random_avg}, floor={floor_avg}"
96
- print(f"\n ✅ Task 1: oracle(1.0) ≥ partial(0.5) ≥ random({random_avg:.3f}) ≥ floor(0.0)")
97
 
98
  return {
99
  "task_id": "task1_vuln_detection",
 
50
  env = Task1Environment()
51
 
52
  # Oracle
53
+ print("▶ Oracle (correct function + correct vuln = ~1.0):")
54
  oracle_eps = []
55
  for i in range(n):
56
  ep = oracle_t1(env, seed_offset + i, verbose)
57
  oracle_eps.append(ep)
58
  print(f" seed={ep['seed']:3d} {ep['contract']:12s}.{ep['target_function']:18s}"
59
+ f" score={ep['grader_score']:.1f}")
60
  oracle_avg = _avg(oracle_eps)
61
  print(f"\n Oracle avg: {oracle_avg:.3f}")
62
 
63
  # Partial
64
+ print("\n▶ Partial (correct function, 'unknown' vuln = ~0.5):")
65
  partial_eps = [partial_t1(env, seed_offset + i) for i in range(n)]
66
  partial_avg = _avg(partial_eps)
67
  print(f" Partial avg: {partial_avg:.3f}")
 
88
  for v in sorted(vuln_seen):
89
  print(f" {vuln_seen[v]:2d}× {v}")
90
 
91
+ assert oracle_avg == 1.0, f"Oracle avg {oracle_avg:.3f} should be 1.0"
92
+ assert partial_avg == 0.5, f"Partial avg {partial_avg:.3f} should be 0.5"
93
+ assert floor_avg == 0.0, f"Floor avg {floor_avg:.3f} should be 0.0"
94
+ assert oracle_avg >= random_avg >= floor_avg, \
95
+ f"Score ordering violated: oracle={oracle_avg}, random={random_avg}, floor={floor_avg}"
96
+ print(f"\n ✅ Task 1: oracle({oracle_avg}) ≥ partial({partial_avg}) ≥ random({random_avg:.3f}) ≥ floor({floor_avg})")
97
 
98
  return {
99
  "task_id": "task1_vuln_detection",
server/tasks/task1/environment.py CHANGED
@@ -50,7 +50,7 @@ class Task1Environment(BaseEnv):
50
  def __init__(self, contracts_path: Optional[str] = None) -> None:
51
  self._contracts = load_contracts(contracts_path) if contracts_path else load_contracts()
52
  self._rng = random.Random()
53
- self._max_steps: int = 0
54
 
55
  # Episode state (initialised by reset)
56
  self._contract: Dict[str, Any] = {}
 
50
  def __init__(self, contracts_path: Optional[str] = None) -> None:
51
  self._contracts = load_contracts(contracts_path) if contracts_path else load_contracts()
52
  self._rng = random.Random()
53
+ self._max_steps: int = 40
54
 
55
  # Episode state (initialised by reset)
56
  self._contract: Dict[str, Any] = {}
server/tasks/task1/grader.py CHANGED
@@ -19,8 +19,8 @@ class Task1Grader:
19
 
20
  def grade(self, submitted_function: str, submitted_vuln_type: str, steps: int, cummulative_cost: int) -> float:
21
  """Returns grade strictly in (0, 1)."""
22
- func_match = submitted_function.strip().lower() != self.target_function
23
- issue_match = SemanticMatcher().match(self.vulnerability_issue, submitted_vuln_type)
24
 
25
  # Score formula
26
  free_budget = (cummulative_cost / steps) * (self.n + 2)
 
19
 
20
  def grade(self, submitted_function: str, submitted_vuln_type: str, steps: int, cummulative_cost: int) -> float:
21
  """Returns grade strictly in (0, 1)."""
22
+ func_match = submitted_function.strip().lower() == self.target_function
23
+ issue_match = SemanticMatcher().matchscore(self.vulnerability_issue, submitted_vuln_type)
24
 
25
  # Score formula
26
  free_budget = (cummulative_cost / steps) * (self.n + 2)