Spaces:
Running
Running
ajaxwin committed on
Commit ·
7f7bcc6
1
Parent(s): 5235476
refactor: grader trivial bug,
Browse files
- agents/task1.py +13 -21
- eval.py +9 -9
- server/tasks/task1/environment.py +1 -1
- server/tasks/task1/grader.py +2 -2
agents/task1.py
CHANGED
|
@@ -34,20 +34,18 @@ def oracle_t1(env: Task1Environment, seed: int, verbose: bool = False) -> Dict[s
|
|
| 34 |
if fn and fn.get("vulnerable"):
|
| 35 |
vuln_issue = fn["vulnerability_details"]["issue"]
|
| 36 |
break
|
|
|
|
| 37 |
if verbose:
|
| 38 |
print(f" {obs.contract_name}.{fn_name}() [{vuln_issue}]")
|
|
|
|
| 39 |
env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
|
| 40 |
-
env.step(Action(action_type=ActionType.GET_FUNCTION_CODE,
|
| 41 |
-
|
| 42 |
-
result = env.step(Action(action_type=ActionType.SUBMIT,
|
| 43 |
-
|
| 44 |
-
"vulnerability_type": vuln_issue}))
|
| 45 |
-
v = result.reward.value
|
| 46 |
-
score = 1.0 if v >= 4.9 else (0.5 if v >= 0.9 else 0.0)
|
| 47 |
-
return {"seed": seed, "contract": obs.contract_name, "target_function": fn_name,
|
| 48 |
-
"vulnerability": vuln_issue, "grader_score": score,
|
| 49 |
-
"cumulative_reward": result.observation.cumulative_reward}
|
| 50 |
|
|
|
|
|
|
|
| 51 |
|
| 52 |
def partial_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
|
| 53 |
"""Correct function, 'unknown' vuln type → score = 0.5."""
|
|
@@ -55,9 +53,7 @@ def partial_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
|
|
| 55 |
fn_name = env.state().target_function
|
| 56 |
result = env.step(Action(action_type=ActionType.SUBMIT,
|
| 57 |
params={"function_name": fn_name, "vulnerability_type": "unknown"}))
|
| 58 |
-
|
| 59 |
-
return {"seed": seed, "grader_score": 0.5 if v >= 0.9 else 0.0,
|
| 60 |
-
"cumulative_reward": result.observation.cumulative_reward}
|
| 61 |
|
| 62 |
|
| 63 |
def random_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
|
|
@@ -96,18 +92,14 @@ def random_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
|
|
| 96 |
result = env.step(Action(action_type=ActionType.SUBMIT,
|
| 97 |
params={"function_name": random_fn,
|
| 98 |
"vulnerability_type": random_vuln}))
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
return {"seed": seed, "grader_score": score, "submitted_fn": random_fn,
|
| 102 |
-
"submitted_vuln": random_vuln,
|
| 103 |
-
"cumulative_reward": result.observation.cumulative_reward}
|
| 104 |
|
| 105 |
|
| 106 |
def floor_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
|
| 107 |
-
"""Always submits 'constructor' → guaranteed score = 0.
|
| 108 |
env.reset(seed=seed)
|
| 109 |
result = env.step(Action(action_type=ActionType.SUBMIT,
|
| 110 |
params={"function_name": "constructor",
|
| 111 |
"vulnerability_type": "reentrancy"}))
|
| 112 |
-
return {"seed": seed, "grader_score":
|
| 113 |
-
"cumulative_reward": result.observation.cumulative_reward}
|
|
|
|
| 34 |
if fn and fn.get("vulnerable"):
|
| 35 |
vuln_issue = fn["vulnerability_details"]["issue"]
|
| 36 |
break
|
| 37 |
+
|
| 38 |
if verbose:
|
| 39 |
print(f" {obs.contract_name}.{fn_name}() [{vuln_issue}]")
|
| 40 |
+
|
| 41 |
env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
|
| 42 |
+
env.step(Action(action_type=ActionType.GET_FUNCTION_CODE, params={"function_name": fn_name}))
|
| 43 |
+
|
| 44 |
+
result = env.step(Action(action_type=ActionType.SUBMIT, params={"function_name": fn_name,
|
| 45 |
+
"vulnerability_type": vuln_issue}))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
+
return {"seed": seed, "contract": obs.contract_name, "target_function": fn_name,
|
| 48 |
+
"vulnerability": vuln_issue, "grader_score": result.reward.value}
|
| 49 |
|
| 50 |
def partial_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
|
| 51 |
"""Correct function, 'unknown' vuln type → score = 0.5."""
|
|
|
|
| 53 |
fn_name = env.state().target_function
|
| 54 |
result = env.step(Action(action_type=ActionType.SUBMIT,
|
| 55 |
params={"function_name": fn_name, "vulnerability_type": "unknown"}))
|
| 56 |
+
return {"seed": seed, "grader_score": result.reward.value}
|
|
|
|
|
|
|
| 57 |
|
| 58 |
|
| 59 |
def random_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
|
|
|
|
| 92 |
result = env.step(Action(action_type=ActionType.SUBMIT,
|
| 93 |
params={"function_name": random_fn,
|
| 94 |
"vulnerability_type": random_vuln}))
|
| 95 |
+
return {"seed": seed, "grader_score": result.reward.value, "submitted_fn":
|
| 96 |
+
random_fn, "submitted_vuln": random_vuln}
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
|
| 99 |
def floor_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
|
| 100 |
+
"""Always submits 'constructor' → guaranteed score = 0.0001"""
|
| 101 |
env.reset(seed=seed)
|
| 102 |
result = env.step(Action(action_type=ActionType.SUBMIT,
|
| 103 |
params={"function_name": "constructor",
|
| 104 |
"vulnerability_type": "reentrancy"}))
|
| 105 |
+
return {"seed": seed, "grader_score": result.reward.value }
|
|
|
eval.py
CHANGED
|
@@ -50,18 +50,18 @@ def run_task1_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
|
|
| 50 |
env = Task1Environment()
|
| 51 |
|
| 52 |
# Oracle
|
| 53 |
-
print("▶ Oracle (correct function + correct vuln
|
| 54 |
oracle_eps = []
|
| 55 |
for i in range(n):
|
| 56 |
ep = oracle_t1(env, seed_offset + i, verbose)
|
| 57 |
oracle_eps.append(ep)
|
| 58 |
print(f" seed={ep['seed']:3d} {ep['contract']:12s}.{ep['target_function']:18s}"
|
| 59 |
-
f" score={ep['grader_score']:.1f}
|
| 60 |
oracle_avg = _avg(oracle_eps)
|
| 61 |
print(f"\n Oracle avg: {oracle_avg:.3f}")
|
| 62 |
|
| 63 |
# Partial
|
| 64 |
-
print("\n▶ Partial (correct function, 'unknown' vuln
|
| 65 |
partial_eps = [partial_t1(env, seed_offset + i) for i in range(n)]
|
| 66 |
partial_avg = _avg(partial_eps)
|
| 67 |
print(f" Partial avg: {partial_avg:.3f}")
|
|
@@ -88,12 +88,12 @@ def run_task1_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
|
|
| 88 |
for v in sorted(vuln_seen):
|
| 89 |
print(f" {vuln_seen[v]:2d}× {v}")
|
| 90 |
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
print(f"\n ✅ Task 1: oracle(
|
| 97 |
|
| 98 |
return {
|
| 99 |
"task_id": "task1_vuln_detection",
|
|
|
|
| 50 |
env = Task1Environment()
|
| 51 |
|
| 52 |
# Oracle
|
| 53 |
+
print("▶ Oracle (correct function + correct vuln = ~1.0):")
|
| 54 |
oracle_eps = []
|
| 55 |
for i in range(n):
|
| 56 |
ep = oracle_t1(env, seed_offset + i, verbose)
|
| 57 |
oracle_eps.append(ep)
|
| 58 |
print(f" seed={ep['seed']:3d} {ep['contract']:12s}.{ep['target_function']:18s}"
|
| 59 |
+
f" score={ep['grader_score']:.1f}")
|
| 60 |
oracle_avg = _avg(oracle_eps)
|
| 61 |
print(f"\n Oracle avg: {oracle_avg:.3f}")
|
| 62 |
|
| 63 |
# Partial
|
| 64 |
+
print("\n▶ Partial (correct function, 'unknown' vuln = ~0.5):")
|
| 65 |
partial_eps = [partial_t1(env, seed_offset + i) for i in range(n)]
|
| 66 |
partial_avg = _avg(partial_eps)
|
| 67 |
print(f" Partial avg: {partial_avg:.3f}")
|
|
|
|
| 88 |
for v in sorted(vuln_seen):
|
| 89 |
print(f" {vuln_seen[v]:2d}× {v}")
|
| 90 |
|
| 91 |
+
assert oracle_avg == 1.0, f"Oracle avg {oracle_avg:.3f} should be 1.0"
|
| 92 |
+
assert partial_avg == 0.5, f"Partial avg {partial_avg:.3f} should be 0.5"
|
| 93 |
+
assert floor_avg == 0.0, f"Floor avg {floor_avg:.3f} should be 0.0"
|
| 94 |
+
assert oracle_avg >= random_avg >= floor_avg, \
|
| 95 |
+
f"Score ordering violated: oracle={oracle_avg}, random={random_avg}, floor={floor_avg}"
|
| 96 |
+
print(f"\n ✅ Task 1: oracle({oracle_avg}) ≥ partial({partial_avg}) ≥ random({random_avg:.3f}) ≥ floor({floor_avg})")
|
| 97 |
|
| 98 |
return {
|
| 99 |
"task_id": "task1_vuln_detection",
|
server/tasks/task1/environment.py
CHANGED
|
@@ -50,7 +50,7 @@ class Task1Environment(BaseEnv):
|
|
| 50 |
def __init__(self, contracts_path: Optional[str] = None) -> None:
|
| 51 |
self._contracts = load_contracts(contracts_path) if contracts_path else load_contracts()
|
| 52 |
self._rng = random.Random()
|
| 53 |
-
self._max_steps: int =
|
| 54 |
|
| 55 |
# Episode state (initialised by reset)
|
| 56 |
self._contract: Dict[str, Any] = {}
|
|
|
|
| 50 |
def __init__(self, contracts_path: Optional[str] = None) -> None:
|
| 51 |
self._contracts = load_contracts(contracts_path) if contracts_path else load_contracts()
|
| 52 |
self._rng = random.Random()
|
| 53 |
+
self._max_steps: int = 40
|
| 54 |
|
| 55 |
# Episode state (initialised by reset)
|
| 56 |
self._contract: Dict[str, Any] = {}
|
server/tasks/task1/grader.py
CHANGED
|
@@ -19,8 +19,8 @@ class Task1Grader:
|
|
| 19 |
|
| 20 |
def grade(self, submitted_function: str, submitted_vuln_type: str, steps: int, cummulative_cost: int) -> float:
|
| 21 |
"""Returns grade strictly in (0, 1)."""
|
| 22 |
-
func_match = submitted_function.strip().lower()
|
| 23 |
-
issue_match = SemanticMatcher().
|
| 24 |
|
| 25 |
# Score formula
|
| 26 |
free_budget = (cummulative_cost / steps) * (self.n + 2)
|
|
|
|
| 19 |
|
| 20 |
def grade(self, submitted_function: str, submitted_vuln_type: str, steps: int, cummulative_cost: int) -> float:
|
| 21 |
"""Returns grade strictly in (0, 1)."""
|
| 22 |
+
func_match = submitted_function.strip().lower() == self.target_function
|
| 23 |
+
issue_match = SemanticMatcher().matchscore(self.vulnerability_issue, submitted_vuln_type)
|
| 24 |
|
| 25 |
# Score formula
|
| 26 |
free_budget = (cummulative_cost / steps) * (self.n + 2)
|