Spaces:

Codex47
/

SmartContractAudit

Running

App Files Community

ajaxwin committed 5 days ago

Commit

7f7bcc6

1 Parent(s): 5235476

refactor: grader trivial bug,

Browse files

Files changed (4) hide show

agents/task1.py +13 -21
eval.py +9 -9
server/tasks/task1/environment.py +1 -1
server/tasks/task1/grader.py +2 -2

agents/task1.py CHANGED Viewed

@@ -34,20 +34,18 @@ def oracle_t1(env: Task1Environment, seed: int, verbose: bool = False) -> Dict[s
         if fn and fn.get("vulnerable"):
             vuln_issue = fn["vulnerability_details"]["issue"]
             break
     if verbose:
         print(f"    {obs.contract_name}.{fn_name}()  [{vuln_issue}]")
     env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
-    env.step(Action(action_type=ActionType.GET_FUNCTION_CODE,
-                    params={"function_name": fn_name}))
-    result = env.step(Action(action_type=ActionType.SUBMIT,
-                              params={"function_name": fn_name,
-                                      "vulnerability_type": vuln_issue}))
-    v = result.reward.value
-    score = 1.0 if v >= 4.9 else (0.5 if v >= 0.9 else 0.0)
-    return {"seed": seed, "contract": obs.contract_name, "target_function": fn_name,
-            "vulnerability": vuln_issue, "grader_score": score,
-            "cumulative_reward": result.observation.cumulative_reward}
 def partial_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
     """Correct function, 'unknown' vuln type → score = 0.5."""
@@ -55,9 +53,7 @@ def partial_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
     fn_name = env.state().target_function
     result  = env.step(Action(action_type=ActionType.SUBMIT,
                                params={"function_name": fn_name, "vulnerability_type": "unknown"}))
-    v = result.reward.value
-    return {"seed": seed, "grader_score": 0.5 if v >= 0.9 else 0.0,
-            "cumulative_reward": result.observation.cumulative_reward}
 def random_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
@@ -96,18 +92,14 @@ def random_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
     result = env.step(Action(action_type=ActionType.SUBMIT,
                               params={"function_name": random_fn,
                                       "vulnerability_type": random_vuln}))
-    v = result.reward.value
-    score = 1.0 if v >= 4.9 else (0.5 if v >= 0.9 else 0.0)
-    return {"seed": seed, "grader_score": score, "submitted_fn": random_fn,
-            "submitted_vuln": random_vuln,
-            "cumulative_reward": result.observation.cumulative_reward}
 def floor_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
-    """Always submits 'constructor' → guaranteed score = 0.0."""
     env.reset(seed=seed)
     result = env.step(Action(action_type=ActionType.SUBMIT,
                               params={"function_name": "constructor",
                                       "vulnerability_type": "reentrancy"}))
-    return {"seed": seed, "grader_score": 0.0,
-            "cumulative_reward": result.observation.cumulative_reward}

         if fn and fn.get("vulnerable"):
             vuln_issue = fn["vulnerability_details"]["issue"]
             break
     if verbose:
         print(f"    {obs.contract_name}.{fn_name}()  [{vuln_issue}]")
     env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
+    env.step(Action(action_type=ActionType.GET_FUNCTION_CODE, params={"function_name": fn_name}))
+    result = env.step(Action(action_type=ActionType.SUBMIT, params={"function_name": fn_name,
+                                "vulnerability_type": vuln_issue}))
+    return {"seed": seed, "contract": obs.contract_name, "target_function": fn_name,
+            "vulnerability": vuln_issue, "grader_score": result.reward.value}
 def partial_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
     """Correct function, 'unknown' vuln type → score = 0.5."""
     fn_name = env.state().target_function
     result  = env.step(Action(action_type=ActionType.SUBMIT,
                                params={"function_name": fn_name, "vulnerability_type": "unknown"}))
+    return {"seed": seed, "grader_score": result.reward.value}
 def random_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
     result = env.step(Action(action_type=ActionType.SUBMIT,
                               params={"function_name": random_fn,
                                       "vulnerability_type": random_vuln}))
+    return {"seed": seed, "grader_score": result.reward.value, "submitted_fn":
+        random_fn, "submitted_vuln": random_vuln}
 def floor_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
+    """Always submits 'constructor' → guaranteed score = 0.0001"""
     env.reset(seed=seed)
     result = env.step(Action(action_type=ActionType.SUBMIT,
                               params={"function_name": "constructor",
                                       "vulnerability_type": "reentrancy"}))
+    return {"seed": seed, "grader_score": result.reward.value }

eval.py CHANGED Viewed

@@ -50,18 +50,18 @@ def run_task1_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
     env = Task1Environment()
     # Oracle
-    print("▶ Oracle  (correct function + correct vuln → 1.0):")
     oracle_eps = []
     for i in range(n):
         ep = oracle_t1(env, seed_offset + i, verbose)
         oracle_eps.append(ep)
         print(f"  seed={ep['seed']:3d}  {ep['contract']:12s}.{ep['target_function']:18s}"
-              f"  score={ep['grader_score']:.1f}  reward={ep['cumulative_reward']:+.2f}")
     oracle_avg = _avg(oracle_eps)
     print(f"\n  Oracle   avg: {oracle_avg:.3f}")
     # Partial
-    print("\n▶ Partial (correct function, 'unknown' vuln → 0.5):")
     partial_eps = [partial_t1(env, seed_offset + i) for i in range(n)]
     partial_avg = _avg(partial_eps)
     print(f"  Partial  avg: {partial_avg:.3f}")
@@ -88,12 +88,12 @@ def run_task1_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
     for v in sorted(vuln_seen):
         print(f"  {vuln_seen[v]:2d}×  {v}")
-   # assert oracle_avg == 1.0,  f"Oracle avg {oracle_avg:.3f} should be 1.0"
-   # assert partial_avg == 0.5, f"Partial avg {partial_avg:.3f} should be 0.5"
-   # assert floor_avg == 0.0,   f"Floor avg {floor_avg:.3f} should be 0.0"
-   # assert oracle_avg >= random_avg >= floor_avg, \
-        # f"Score ordering violated: oracle={oracle_avg}, random={random_avg}, floor={floor_avg}"
-    print(f"\n  ✅ Task 1: oracle(1.0) ≥ partial(0.5) ≥ random({random_avg:.3f}) ≥ floor(0.0)")
     return {
         "task_id": "task1_vuln_detection",

     env = Task1Environment()
     # Oracle
+    print("▶ Oracle  (correct function + correct vuln = ~1.0):")
     oracle_eps = []
     for i in range(n):
         ep = oracle_t1(env, seed_offset + i, verbose)
         oracle_eps.append(ep)
         print(f"  seed={ep['seed']:3d}  {ep['contract']:12s}.{ep['target_function']:18s}"
+              f"  score={ep['grader_score']:.1f}")
     oracle_avg = _avg(oracle_eps)
     print(f"\n  Oracle   avg: {oracle_avg:.3f}")
     # Partial
+    print("\n▶ Partial (correct function, 'unknown' vuln = ~0.5):")
     partial_eps = [partial_t1(env, seed_offset + i) for i in range(n)]
     partial_avg = _avg(partial_eps)
     print(f"  Partial  avg: {partial_avg:.3f}")
     for v in sorted(vuln_seen):
         print(f"  {vuln_seen[v]:2d}×  {v}")
+    assert oracle_avg == 1.0,  f"Oracle avg {oracle_avg:.3f} should be 1.0"
+    assert partial_avg == 0.5, f"Partial avg {partial_avg:.3f} should be 0.5"
+    assert floor_avg == 0.0,   f"Floor avg {floor_avg:.3f} should be 0.0"
+    assert oracle_avg >= random_avg >= floor_avg, \
+        f"Score ordering violated: oracle={oracle_avg}, random={random_avg}, floor={floor_avg}"
+    print(f"\n  ✅ Task 1: oracle({oracle_avg}) ≥ partial({partial_avg}) ≥ random({random_avg:.3f}) ≥ floor({floor_avg})")
     return {
         "task_id": "task1_vuln_detection",

server/tasks/task1/environment.py CHANGED Viewed

@@ -50,7 +50,7 @@ class Task1Environment(BaseEnv):
     def __init__(self, contracts_path: Optional[str] = None) -> None:
         self._contracts = load_contracts(contracts_path) if contracts_path else load_contracts()
         self._rng = random.Random()
-        self._max_steps: int = 0
         # Episode state (initialised by reset)
         self._contract: Dict[str, Any] = {}

     def __init__(self, contracts_path: Optional[str] = None) -> None:
         self._contracts = load_contracts(contracts_path) if contracts_path else load_contracts()
         self._rng = random.Random()
+        self._max_steps: int = 40
         # Episode state (initialised by reset)
         self._contract: Dict[str, Any] = {}

server/tasks/task1/grader.py CHANGED Viewed

@@ -19,8 +19,8 @@ class Task1Grader:
     def grade(self, submitted_function: str, submitted_vuln_type: str, steps: int, cummulative_cost: int) -> float:
         """Returns grade strictly in (0, 1)."""
-        func_match = submitted_function.strip().lower() != self.target_function
-        issue_match = SemanticMatcher().match(self.vulnerability_issue, submitted_vuln_type)
         # Score formula
         free_budget = (cummulative_cost / steps) * (self.n + 2)

     def grade(self, submitted_function: str, submitted_vuln_type: str, steps: int, cummulative_cost: int) -> float:
         """Returns grade strictly in (0, 1)."""
+        func_match = submitted_function.strip().lower() == self.target_function
+        issue_match = SemanticMatcher().matchscore(self.vulnerability_issue, submitted_vuln_type)
         # Score formula
         free_budget = (cummulative_cost / steps) * (self.n + 2)