Spaces:
Running
Running
ajaxwin committed on
Commit ·
671787b
1
Parent(s): 056cf7b
task1, task2 evaluated
Browse files- agents/task1.py +113 -0
- agents/task2.py +105 -0
- agents/task3.py +106 -0
- app.py +4 -4
- data/contracts.json +2 -2
- data/data_loader.py +1 -6
- env/schemas.py +1 -1
- eval.py +133 -246
- inference.py +342 -221
- openenv.yaml +4 -4
- tasks/task1/environment.py +2 -9
- tasks/task1/grader.py +1 -9
- tasks/task2/actions.py +1 -1
- tasks/task2/environment.py +1 -1
- tasks/task3/environment.py +6 -4
- utils/prompts.py +18 -27
- utils/propertyretriever.py +1 -3
- utils/semanticmatcher.py +1 -0
- validate.py +3 -3
agents/task1.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Agents for Task 1: Function + Vulnerability Identification."""
|
| 2 |
+
|
| 3 |
+
import random as _random
|
| 4 |
+
from typing import Any, Dict, List
|
| 5 |
+
|
| 6 |
+
from tasks.task1 import Task1Environment
|
| 7 |
+
from env.schemas import Action, ActionType
|
| 8 |
+
from data.data_loader import load_contracts, get_function_by_name
|
| 9 |
+
|
| 10 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 11 |
+
# Helpers
|
| 12 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 13 |
+
|
| 14 |
+
def _parse_fn_list(result_text: str) -> List[str]:
|
| 15 |
+
"""Parse 'Functions in X: f1, f2, f3' into [f1, f2, f3]."""
|
| 16 |
+
if ": " in result_text:
|
| 17 |
+
return [f.strip() for f in result_text.split(": ", 1)[-1].split(", ") if f.strip()]
|
| 18 |
+
return []
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 22 |
+
# Task 1 agents
|
| 23 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 24 |
+
|
| 25 |
+
def oracle_t1(env: Task1Environment, seed: int, verbose: bool = False) -> Dict[str, Any]:
    """Submits exact ground-truth function + vulnerability type → score = 1.0."""
    reset_result = env.reset(seed=seed)
    obs = reset_result.observation
    fn_name = env.state().target_function

    # Look up the ground-truth vulnerability issue for the target function.
    # NOTE(review): matches by function name across all contracts — assumes the
    # target name is unique among vulnerable functions; confirm against dataset.
    vuln_issue = ""
    for contract_entry in load_contracts():
        fn = get_function_by_name(contract_entry, fn_name)
        if fn and fn.get("vulnerable"):
            vuln_issue = fn["vulnerability_details"]["issue"]
            break

    if verbose:
        print(f" {obs.contract_name}.{fn_name}() [{vuln_issue}]")

    # Browse a little (list + read code), then submit the exact answer.
    env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    env.step(Action(action_type=ActionType.GET_FUNCTION_CODE,
                    params={"function_name": fn_name}))
    result = env.step(Action(action_type=ActionType.SUBMIT,
                             params={"function_name": fn_name,
                                     "vulnerability_type": vuln_issue}))

    reward = result.reward.value
    if reward >= 4.9:
        score = 1.0
    elif reward >= 0.9:
        score = 0.5
    else:
        score = 0.0
    return {"seed": seed, "contract": obs.contract_name,
            "target_function": fn_name, "vulnerability": vuln_issue,
            "grader_score": score,
            "cumulative_reward": result.observation.cumulative_reward}
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def partial_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
    """Correct function, 'unknown' vuln type → score = 0.5."""
    env.reset(seed=seed)
    target = env.state().target_function
    submit = Action(action_type=ActionType.SUBMIT,
                    params={"function_name": target,
                            "vulnerability_type": "unknown"})
    result = env.step(submit)
    # Half credit iff the environment granted at least the function-match reward.
    got_function_match = result.reward.value >= 0.9
    return {"seed": seed,
            "grader_score": 0.5 if got_function_match else 0.0,
            "cumulative_reward": result.observation.cumulative_reward}
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def random_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
    """Genuine random agent: random browse then submits a random function + random vuln type.

    Uses a seeded RNG (offset from episode seed) so results are reproducible.
    Expected score: low (~0–5%) since must randomly hit both right function and right keyword, plus
    can submit only once per episode.
    """
    rng = _random.Random(seed ^ 0x5A1AD)  # different RNG stream from episode seed
    env.reset(seed=seed)

    # Step 1: list functions to get real candidates
    s = env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    fns = _parse_fn_list(s.observation.last_action_result or "")
    if not fns:
        fns = ["deposit", "withdraw", "constructor"]  # fallback

    # Step 2: do 1–2 random browse actions (not repeated)
    browse_pool = [
        (ActionType.GET_FILE_METADATA, {}),
        (ActionType.GET_CALL_GRAPH, {}),
        (ActionType.GET_STATE_VARIABLE, {}),
    ]
    # Consistency fix: shuffle with the single offset RNG stream instead of a
    # second _random.Random(seed) instance — matches random_t2/random_t3 and
    # the docstring's claim of one seeded RNG; still fully deterministic.
    rng.shuffle(browse_pool)
    for at, params in browse_pool[:rng.randint(1, 2)]:
        env.step(Action(action_type=at, params=params))

    # Step 3: submit a random function from the real list, random vuln type
    random_fn = rng.choice(fns)
    vuln_pool = [
        "bad logic", "incorrect check", "overflow", "no guard", "wrong order",
        "missing event", "unprotected", "stale data", "unsafe cast",
    ]
    random_vuln = rng.choice(vuln_pool)
    result = env.step(Action(action_type=ActionType.SUBMIT,
                             params={"function_name": random_fn,
                                     "vulnerability_type": random_vuln}))
    v = result.reward.value
    score = 1.0 if v >= 4.9 else (0.5 if v >= 0.9 else 0.0)
    return {"seed": seed, "grader_score": score, "submitted_fn": random_fn,
            "submitted_vuln": random_vuln,
            "cumulative_reward": result.observation.cumulative_reward}
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def floor_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
    """Always submits 'constructor' → guaranteed score = 0.0."""
    env.reset(seed=seed)
    floor_action = Action(action_type=ActionType.SUBMIT,
                          params={"function_name": "constructor",
                                  "vulnerability_type": "reentrancy"})
    result = env.step(floor_action)
    return {"seed": seed, "grader_score": 0.0,
            "cumulative_reward": result.observation.cumulative_reward}
|
agents/task2.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Agents for Task2: Property Discovery"""
|
| 2 |
+
|
| 3 |
+
import random as _random
|
| 4 |
+
from typing import Any, Dict, List
|
| 5 |
+
|
| 6 |
+
from tasks.task2 import Task2Environment
|
| 7 |
+
from env.schemas import Action, ActionType
|
| 8 |
+
from data.data_loader import load_contracts, get_function_by_name
|
| 9 |
+
|
| 10 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 11 |
+
# Task 2 agents
|
| 12 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 13 |
+
|
| 14 |
+
def oracle_t2(env: Task2Environment, seed: int, verbose: bool = False) -> Dict[str, Any]:
    """Submits ground-truth in natural language (English) → score ≥ 0.70."""
    obs = env.reset(seed=seed).observation
    fn_name = obs.extra["target_function"]
    contract = obs.contract_name

    # Pull the ground-truth property text for the target function.
    gt_text = ""
    for entry in load_contracts():
        if entry["contract_name"] != contract:
            continue
        fn = get_function_by_name(entry, fn_name)
        if fn and fn.get("property"):
            gt_text = fn["property"]
        break

    if verbose:
        print(f" {contract}.{fn_name}()")
    # Read the code once, then submit the ground-truth property.
    env.step(Action(action_type=ActionType.GET_FUNCTION_CODE))
    result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
                             params={"property": gt_text}))
    r_val = result.reward.value
    # Grader score is the 0–5 reward rescaled to [0, 1].
    score = round(r_val / 5.0, 4) if r_val > 0 else 0.0
    return {"seed": seed, "contract": contract, "function": fn_name,
            "grader_score": score,
            "cumulative_reward": result.observation.cumulative_reward}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def partial_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
    """Submits only the function's short NatSpec comment — partial credit."""
    obs = env.reset(seed=seed).observation

    # Find the NatSpec comment attached to the target function, if any.
    comment = ""
    for entry in load_contracts():
        if entry["contract_name"] != obs.contract_name:
            continue
        fn = get_function_by_name(entry, obs.extra["target_function"])
        if fn:
            comment = fn.get("comment", "")
        break

    result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
                             params={"property": comment}))
    reward = result.reward.value
    score = round(reward / 5.0, 4) if reward > 0 else 0.0
    return {"seed": seed, "grader_score": score,
            "cumulative_reward": result.observation.cumulative_reward}
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def random_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
    """Genuine random agent: random browse then submits a generic property template.

    The submitted text contains high-frequency words that are unlikely to match
    task-specific key phrases. Expected score: near 0 (coincidental matches only).
    Uses a seeded RNG for reproducibility.
    """
    rng = _random.Random(seed ^ 0xBEEF1)

    obs = env.reset(seed=seed).observation
    fn_name = obs.extra.get("target_function", "this function")

    # Random browse: pick 1–2 actions at random
    browse_pool = [
        ActionType.GET_FILE_NATSPEC,
        ActionType.GET_RELATED_FUNCTIONS,
        ActionType.GET_SIGNATURE,
    ]
    rng.shuffle(browse_pool)
    n_browse = rng.randint(1, 2)
    for action_type in browse_pool[:n_browse]:
        env.step(Action(action_type=action_type))

    # Submit a randomly assembled generic property (won't match specific key phrases)
    templates = [
        f"The {fn_name} operation completes the intended computation on the input data.",
        f"When {fn_name} executes, it processes the provided arguments and updates the contract.",
        f"The {fn_name} function validates inputs and performs the expected operation.",
        f"Calling {fn_name} causes the contract to execute its designated logic.",
        f"{fn_name} runs when invoked and modifies internal state as designed.",
    ]
    prop = rng.choice(templates)
    result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
                             params={"property": prop}))
    reward = result.reward.value
    score = round(reward / 5.0, 4) if reward > 0 else 0.0
    return {"seed": seed, "grader_score": score,
            "submitted": prop[:60],
            "cumulative_reward": result.observation.cumulative_reward}
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def floor_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
    """Submits empty string → score = 0.0 guaranteed."""
    env.reset(seed=seed)
    empty_submit = Action(action_type=ActionType.SUBMIT_PROPERTY,
                          params={"property": ""})
    result = env.step(empty_submit)
    return {"seed": seed, "grader_score": 0.0,
            "cumulative_reward": result.observation.cumulative_reward}
|
agents/task3.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
""" Agents for Task3 : Rule Checking for a function """
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import random as _random
|
| 5 |
+
from typing import Any, Dict, List
|
| 6 |
+
|
| 7 |
+
from tasks.task3 import Task3Environment
|
| 8 |
+
from env.schemas import Action, ActionType
|
| 9 |
+
from data.data_loader import load_contracts, get_function_by_name
|
| 10 |
+
|
| 11 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 12 |
+
# Helpers
|
| 13 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 14 |
+
|
| 15 |
+
def _parse_fn_list(result_text: str) -> List[str]:
|
| 16 |
+
"""Parse 'Functions in X: f1, f2, f3' into [f1, f2, f3]."""
|
| 17 |
+
if ": " in result_text:
|
| 18 |
+
return [f.strip() for f in result_text.split(": ", 1)[-1].split(", ") if f.strip()]
|
| 19 |
+
return []
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 23 |
+
# Task 3 agents
|
| 24 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 25 |
+
|
| 26 |
+
def oracle_t3(env: Task3Environment, seed: int, verbose: bool = False) -> Dict[str, Any]:
    """Submits exact target function → score = 1.0."""
    obs = env.reset(seed=seed).observation
    fn_name = env.state().target_function
    contract = obs.contract_name
    if verbose:
        prop = obs.extra.get("property_english", "")[:60]
        print(f" {contract}.{fn_name}() \"{prop}\"")

    # Browse the property spec and the function list before answering.
    for action_type in (ActionType.GET_PROPERTY_SPECIFICATION,
                        ActionType.LIST_FUNCTIONS):
        env.step(Action(action_type=action_type))
    result = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
                             params={"function_name": fn_name}))

    reward = result.reward.value
    if reward >= 4.9:
        score = 1.0
    elif reward >= 1.0:
        score = 0.3
    else:
        score = 0.0
    return {"seed": seed, "contract": contract, "target_function": fn_name,
            "grader_score": score,
            "cumulative_reward": result.observation.cumulative_reward}
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def subfunction_t3(env: Task3Environment, seed: int) -> Dict[str, Any]:
    """Submits the first partial-credit subfunction if one exists, else 'constructor'."""
    obs = env.reset(seed=seed).observation

    # Look up the partial-credit subfunction list for the target function.
    partial_fns: List[str] = []
    for entry in load_contracts():
        if entry["contract_name"] != obs.contract_name:
            continue
        fn = get_function_by_name(entry, env.state().target_function)
        if fn:
            partial_fns = fn.get("task3", {}).get("partial_credit_functions", [])
        break

    submit_name = partial_fns[0] if partial_fns else "constructor"
    result = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
                             params={"function_name": submit_name}))
    reward = result.reward.value
    score = 1.0 if reward >= 4.9 else (0.3 if reward >= 1.0 else 0.0)
    return {"seed": seed, "grader_score": score, "submitted": submit_name,
            "cumulative_reward": result.observation.cumulative_reward}
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def random_t3(env: Task3Environment, seed: int) -> Dict[str, Any]:
    """Genuine random agent: lists functions, picks one at random, submits.

    With N functions per contract and 1 target, expected score ≈ 1/N ≈ 0.20–0.25.
    Uses a seeded RNG for reproducibility.
    """
    rng = _random.Random(seed ^ 0xCAFE1)
    env.reset(seed=seed)

    # Step 1: get function list (necessary to pick a real candidate)
    listing = env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    fns = _parse_fn_list(listing.observation.last_action_result or "") or ["constructor"]

    # Step 2: optionally do 1 cheap browse action (formalized or call_graph)
    browse_options = [
        (ActionType.GET_PROPERTY_SPECIFICATION, {}),
        (ActionType.GET_CALL_GRAPH, {}),
    ]
    action_type, params = rng.choice(browse_options)
    env.step(Action(action_type=action_type, params=params))

    # Step 3: submit a uniformly random function from the real list
    chosen = rng.choice(fns)
    result = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
                             params={"function_name": chosen}))
    reward = result.reward.value
    score = 1.0 if reward >= 4.9 else (0.3 if reward >= 1.0 else 0.0)
    return {"seed": seed, "grader_score": score, "submitted": chosen,
            "cumulative_reward": result.observation.cumulative_reward}
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def floor_t3(env: Task3Environment, seed: int) -> Dict[str, Any]:
    """Always submits 'constructor' → guaranteed score = 0.0."""
    env.reset(seed=seed)
    floor_action = Action(action_type=ActionType.SUBMIT_FUNCTION,
                          params={"function_name": "constructor"})
    result = env.step(floor_action)
    return {"seed": seed, "grader_score": 0.0,
            "cumulative_reward": result.observation.cumulative_reward}
|
app.py
CHANGED
|
@@ -22,9 +22,9 @@ from fastapi import FastAPI, HTTPException, Query
|
|
| 22 |
from pydantic import BaseModel
|
| 23 |
|
| 24 |
from env.schemas import Action, ActionType, TaskInfo
|
| 25 |
-
from tasks.task1
|
| 26 |
-
from tasks.task2
|
| 27 |
-
from tasks.task3
|
| 28 |
|
| 29 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 30 |
# App
|
|
@@ -191,7 +191,7 @@ def action_space(task_id: str = "task1_vuln_detection"):
|
|
| 191 |
{"type": "get_function_natspec", "params": {}, "reward": -0.08, "description": "Read NatSpec + expected behaviour"},
|
| 192 |
{"type": "get_file_natspec", "params": {}, "reward": -0.03, "description": "Read contract-level NatSpec"},
|
| 193 |
{"type": "get_related_functions", "params": {}, "reward": -0.06, "description": "List caller/callee functions with summaries"},
|
| 194 |
-
{"type": "
|
| 195 |
{"type": "get_similar_rule", "params": {}, "reward": -0.20, "description": "Get a similar property from another contract"},
|
| 196 |
{"type": "submit_property", "params": {"property": "string"}, "reward": "0.0–5.0 (scored)", "description": "Submit property. ONE attempt. Ends episode."},
|
| 197 |
],
|
|
|
|
| 22 |
from pydantic import BaseModel
|
| 23 |
|
| 24 |
from env.schemas import Action, ActionType, TaskInfo
|
| 25 |
+
from tasks.task1 import Task1Environment
|
| 26 |
+
from tasks.task2 import Task2Environment
|
| 27 |
+
from tasks.task3 import Task3Environment
|
| 28 |
|
| 29 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 30 |
# App
|
|
|
|
| 191 |
{"type": "get_function_natspec", "params": {}, "reward": -0.08, "description": "Read NatSpec + expected behaviour"},
|
| 192 |
{"type": "get_file_natspec", "params": {}, "reward": -0.03, "description": "Read contract-level NatSpec"},
|
| 193 |
{"type": "get_related_functions", "params": {}, "reward": -0.06, "description": "List caller/callee functions with summaries"},
|
| 194 |
+
{"type": "get_signature", "params": {}, "reward": -0.04, "description": "Get structured I/O + expected behaviour"},
|
| 195 |
{"type": "get_similar_rule", "params": {}, "reward": -0.20, "description": "Get a similar property from another contract"},
|
| 196 |
{"type": "submit_property", "params": {"property": "string"}, "reward": "0.0–5.0 (scored)", "description": "Submit property. ONE attempt. Ends episode."},
|
| 197 |
],
|
data/contracts.json
CHANGED
|
@@ -4827,7 +4827,7 @@
|
|
| 4827 |
]
|
| 4828 |
},
|
| 4829 |
{
|
| 4830 |
-
"contract_name": "
|
| 4831 |
"file_name": "StableDebtToken.sol",
|
| 4832 |
"metadata": {
|
| 4833 |
"license": "agpl-3.0",
|
|
@@ -5380,7 +5380,7 @@
|
|
| 5380 |
]
|
| 5381 |
},
|
| 5382 |
{
|
| 5383 |
-
"contract_name": "
|
| 5384 |
"file_name": "ATokenVault_old.sol",
|
| 5385 |
"metadata": {
|
| 5386 |
"license": "MIT",
|
|
|
|
| 4827 |
]
|
| 4828 |
},
|
| 4829 |
{
|
| 4830 |
+
"contract_name": "StableDebtToken_OLD",
|
| 4831 |
"file_name": "StableDebtToken.sol",
|
| 4832 |
"metadata": {
|
| 4833 |
"license": "agpl-3.0",
|
|
|
|
| 5380 |
]
|
| 5381 |
},
|
| 5382 |
{
|
| 5383 |
+
"contract_name": "ATokenVault_OLD",
|
| 5384 |
"file_name": "ATokenVault_old.sol",
|
| 5385 |
"metadata": {
|
| 5386 |
"license": "MIT",
|
data/data_loader.py
CHANGED
|
@@ -15,7 +15,7 @@ from typing import Any, Dict, List, Optional, Tuple
|
|
| 15 |
|
| 16 |
DATA_DIR = os.path.join(os.path.dirname(__file__))
|
| 17 |
DEFAULT_CONTRACTS_FILE = os.path.join(DATA_DIR, "contracts.json")
|
| 18 |
-
|
| 19 |
|
| 20 |
|
| 21 |
# ────────────────────────────────────────────────────────────────
|
|
@@ -62,11 +62,6 @@ def list_state_variable_names(contract: Dict[str, Any]) -> List[str]:
|
|
| 62 |
# Task 1 helpers
|
| 63 |
# ────────────────────────────────────────────────────────────────
|
| 64 |
|
| 65 |
-
def load_vulnerabilities(path: str = DEFAULT_VUNERABILITIES_FILE) -> List[Dict[str, Any]]:
|
| 66 |
-
"""Load and return all vulnerability entries from the JSON dataset."""
|
| 67 |
-
with open(path, "r") as f:
|
| 68 |
-
return json.load(f)
|
| 69 |
-
|
| 70 |
def get_all_vulnerable_entries(
|
| 71 |
contracts: List[Dict[str, Any]],
|
| 72 |
) -> List[Tuple[Dict[str, Any], Dict[str, Any]]]:
|
|
|
|
| 15 |
|
| 16 |
DATA_DIR = os.path.join(os.path.dirname(__file__))
|
| 17 |
DEFAULT_CONTRACTS_FILE = os.path.join(DATA_DIR, "contracts.json")
|
| 18 |
+
DEFAULT_CSV_PATH = os.path.join(DATA_DIR, "properties.csv")
|
| 19 |
|
| 20 |
|
| 21 |
# ────────────────────────────────────────────────────────────────
|
|
|
|
| 62 |
# Task 1 helpers
|
| 63 |
# ────────────────────────────────────────────────────────────────
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
def get_all_vulnerable_entries(
|
| 66 |
contracts: List[Dict[str, Any]],
|
| 67 |
) -> List[Tuple[Dict[str, Any], Dict[str, Any]]]:
|
env/schemas.py
CHANGED
|
@@ -130,7 +130,7 @@ class ResetResult(BaseModel):
|
|
| 130 |
class StateResult(BaseModel):
|
| 131 |
task_id: str
|
| 132 |
contract_name: str
|
| 133 |
-
target_function:
|
| 134 |
step_count: int
|
| 135 |
cumulative_reward: float
|
| 136 |
done: bool
|
|
|
|
| 130 |
class StateResult(BaseModel):
|
| 131 |
task_id: str
|
| 132 |
contract_name: str
|
| 133 |
+
target_function: str # hidden in real eval, exposed here for debugging
|
| 134 |
step_count: int
|
| 135 |
cumulative_reward: float
|
| 136 |
done: bool
|
eval.py
CHANGED
|
@@ -3,197 +3,44 @@ eval.py
|
|
| 3 |
-------
|
| 4 |
Evaluation harness for all three tasks.
|
| 5 |
|
| 6 |
-
Runs
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
Usage:
|
| 10 |
-
python eval.py
|
| 11 |
-
python eval.py --task 1|2|3
|
| 12 |
python eval.py --episodes 16 --verbose
|
| 13 |
python eval.py --out results.json
|
| 14 |
"""
|
| 15 |
|
| 16 |
import argparse
|
| 17 |
import json
|
|
|
|
| 18 |
from typing import Any, Dict, List
|
| 19 |
|
| 20 |
-
from tasks.task1
|
| 21 |
-
from tasks.task2
|
| 22 |
-
from tasks.task3
|
| 23 |
-
from
|
|
|
|
|
|
|
| 24 |
from data.data_loader import (
|
| 25 |
load_contracts,
|
| 26 |
-
get_function_by_name,
|
| 27 |
get_all_vulnerable_entries,
|
| 28 |
get_all_property_entries,
|
| 29 |
get_all_task3_entries,
|
| 30 |
)
|
| 31 |
|
| 32 |
-
|
| 33 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 34 |
-
# Task 1 agents
|
| 35 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 36 |
-
|
| 37 |
-
def oracle_t1(env: Task1Environment, seed: int, verbose: bool = False) -> Dict[str, Any]:
|
| 38 |
-
"""Submits the exact ground-truth function + vulnerability → score = 1.0."""
|
| 39 |
-
r = env.reset(seed=seed)
|
| 40 |
-
obs = r.observation
|
| 41 |
-
fn_name = env.state().target_function
|
| 42 |
-
contracts = load_contracts()
|
| 43 |
-
vuln_issue = ""
|
| 44 |
-
for c in contracts:
|
| 45 |
-
fn = get_function_by_name(c, fn_name)
|
| 46 |
-
if fn and fn.get("vulnerable"):
|
| 47 |
-
vuln_issue = fn["vulnerability_details"]["issue"]
|
| 48 |
-
break
|
| 49 |
-
if verbose:
|
| 50 |
-
print(f" {obs.contract_name}.{fn_name}() [{vuln_issue}]")
|
| 51 |
-
env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
|
| 52 |
-
env.step(Action(action_type=ActionType.GET_FUNCTION_CODE,
|
| 53 |
-
params={"function_name": fn_name}))
|
| 54 |
-
result = env.step(Action(action_type=ActionType.SUBMIT,
|
| 55 |
-
params={"function_name": fn_name,
|
| 56 |
-
"vulnerability_type": vuln_issue}))
|
| 57 |
-
v = result.reward.value
|
| 58 |
-
score = 1.0 if v >= 4.9 else (0.5 if v >= 0.9 else 0.0)
|
| 59 |
-
return {"seed": seed, "contract": obs.contract_name, "target_function": fn_name,
|
| 60 |
-
"vulnerability": vuln_issue, "grader_score": score,
|
| 61 |
-
"cumulative_reward": result.observation.cumulative_reward}
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
def partial_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
|
| 65 |
-
"""Right function, 'unknown' vuln type → score = 0.5."""
|
| 66 |
-
env.reset(seed=seed)
|
| 67 |
-
fn_name = env.state().target_function
|
| 68 |
-
result = env.step(Action(action_type=ActionType.SUBMIT,
|
| 69 |
-
params={"function_name": fn_name, "vulnerability_type": "unknown"}))
|
| 70 |
-
v = result.reward.value
|
| 71 |
-
return {"seed": seed, "grader_score": 0.5 if v >= 0.9 else 0.0,
|
| 72 |
-
"cumulative_reward": result.observation.cumulative_reward}
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
def wrong_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
|
| 76 |
-
"""Always submits 'constructor' → score = 0.0."""
|
| 77 |
-
env.reset(seed=seed)
|
| 78 |
-
result = env.step(Action(action_type=ActionType.SUBMIT,
|
| 79 |
-
params={"function_name": "constructor",
|
| 80 |
-
"vulnerability_type": "reentrancy"}))
|
| 81 |
-
return {"seed": seed, "grader_score": 0.0,
|
| 82 |
-
"cumulative_reward": result.observation.cumulative_reward}
|
| 83 |
-
|
| 84 |
-
|
| 85 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 86 |
-
#
|
| 87 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 88 |
-
|
| 89 |
-
def oracle_t2(env: Task2Environment, seed: int, verbose: bool = False) -> Dict[str, Any]:
|
| 90 |
-
"""Submits ground-truth natural_language → score ≥ 0.70."""
|
| 91 |
-
r = env.reset(seed=seed)
|
| 92 |
-
obs = r.observation
|
| 93 |
-
fn_name = obs.extra["target_function"]
|
| 94 |
-
contract = obs.contract_name
|
| 95 |
-
contracts = load_contracts()
|
| 96 |
-
gt_text = ""
|
| 97 |
-
for c in contracts:
|
| 98 |
-
if c["contract_name"] == contract:
|
| 99 |
-
fn = get_function_by_name(c, fn_name)
|
| 100 |
-
if fn and fn.get("property"):
|
| 101 |
-
gt_text = fn["property"]["natural_language"]
|
| 102 |
-
break
|
| 103 |
-
if verbose:
|
| 104 |
-
print(f" {contract}.{fn_name}()")
|
| 105 |
-
env.step(Action(action_type=ActionType.GET_FUNCTION_CODE))
|
| 106 |
-
result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
|
| 107 |
-
params={"property": gt_text}))
|
| 108 |
-
r_val = result.reward.value
|
| 109 |
-
score = round(r_val / 5.0, 4) if r_val > 0 else 0.0
|
| 110 |
-
return {"seed": seed, "contract": contract, "function": fn_name,
|
| 111 |
-
"grader_score": score, "cumulative_reward": result.observation.cumulative_reward}
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
def partial_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
|
| 115 |
-
"""Submits the function's NatSpec comment — partial credit."""
|
| 116 |
-
r = env.reset(seed=seed)
|
| 117 |
-
obs = r.observation
|
| 118 |
-
contracts = load_contracts()
|
| 119 |
-
comment = ""
|
| 120 |
-
for c in contracts:
|
| 121 |
-
if c["contract_name"] == obs.contract_name:
|
| 122 |
-
fn = get_function_by_name(c, obs.extra["target_function"])
|
| 123 |
-
if fn:
|
| 124 |
-
comment = fn.get("comment", "")
|
| 125 |
-
break
|
| 126 |
-
result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
|
| 127 |
-
params={"property": comment}))
|
| 128 |
-
r_val = result.reward.value
|
| 129 |
-
return {"seed": seed, "grader_score": round(r_val / 5.0, 4) if r_val > 0 else 0.0,
|
| 130 |
-
"cumulative_reward": result.observation.cumulative_reward}
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
def empty_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
|
| 134 |
-
"""Submits empty string → score = 0.0."""
|
| 135 |
-
env.reset(seed=seed)
|
| 136 |
-
result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY, params={"property": ""}))
|
| 137 |
-
return {"seed": seed, "grader_score": 0.0,
|
| 138 |
-
"cumulative_reward": result.observation.cumulative_reward}
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 142 |
-
# Task 3 agents
|
| 143 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 144 |
|
| 145 |
-
def
|
| 146 |
-
|
| 147 |
-
r = env.reset(seed=seed)
|
| 148 |
-
obs = r.observation
|
| 149 |
-
fn_name = env.state().target_function
|
| 150 |
-
contract = obs.contract_name
|
| 151 |
-
if verbose:
|
| 152 |
-
prop = obs.extra.get("property_english", "")[:60]
|
| 153 |
-
print(f" {contract}.{fn_name}() \"{prop}\"")
|
| 154 |
-
env.step(Action(action_type=ActionType.GET_PROPERTY_SPECIFICATION))
|
| 155 |
-
env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
|
| 156 |
-
result = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
|
| 157 |
-
params={"function_name": fn_name}))
|
| 158 |
-
v = result.reward.value
|
| 159 |
-
score = 1.0 if v >= 4.9 else (0.3 if v >= 1.0 else 0.0)
|
| 160 |
-
return {"seed": seed, "contract": contract, "target_function": fn_name,
|
| 161 |
-
"grader_score": score, "cumulative_reward": result.observation.cumulative_reward}
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
def subfunction_t3(env: Task3Environment, seed: int) -> Dict[str, Any]:
|
| 165 |
-
"""Submits the first partial-credit subfunction if it exists, else 'constructor'."""
|
| 166 |
-
r = env.reset(seed=seed)
|
| 167 |
-
obs = r.observation
|
| 168 |
-
contracts = load_contracts()
|
| 169 |
-
partial_fns = []
|
| 170 |
-
for c in contracts:
|
| 171 |
-
if c["contract_name"] == obs.contract_name:
|
| 172 |
-
fn = get_function_by_name(c, env.state().target_function)
|
| 173 |
-
if fn:
|
| 174 |
-
partial_fns = fn.get("task3", {}).get("partial_credit_functions", [])
|
| 175 |
-
break
|
| 176 |
-
submit_name = partial_fns[0] if partial_fns else "constructor"
|
| 177 |
-
result = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
|
| 178 |
-
params={"function_name": submit_name}))
|
| 179 |
-
v = result.reward.value
|
| 180 |
-
score = 1.0 if v >= 4.9 else (0.3 if v >= 1.0 else 0.0)
|
| 181 |
-
return {"seed": seed, "grader_score": score, "submitted": submit_name,
|
| 182 |
-
"cumulative_reward": result.observation.cumulative_reward}
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
def wrong_t3(env: Task3Environment, seed: int) -> Dict[str, Any]:
|
| 186 |
-
"""Always submits 'constructor' → score = 0.0."""
|
| 187 |
-
env.reset(seed=seed)
|
| 188 |
-
result = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
|
| 189 |
-
params={"function_name": "constructor"}))
|
| 190 |
-
return {"seed": seed, "grader_score": 0.0,
|
| 191 |
-
"cumulative_reward": result.observation.cumulative_reward}
|
| 192 |
-
|
| 193 |
|
| 194 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 195 |
-
# Evaluation runners
|
| 196 |
-
# ─────────────────────────────────────────────────────────────────────────────
|
| 197 |
|
| 198 |
def run_task1_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
|
| 199 |
print("\n" + "=" * 64)
|
|
@@ -204,27 +51,37 @@ def run_task1_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
|
|
| 204 |
f"{len(get_all_vulnerable_entries(contracts))} vulnerable functions\n")
|
| 205 |
env = Task1Environment()
|
| 206 |
|
| 207 |
-
|
|
|
|
| 208 |
oracle_eps = []
|
| 209 |
for i in range(n):
|
| 210 |
ep = oracle_t1(env, seed_offset + i, verbose)
|
| 211 |
oracle_eps.append(ep)
|
| 212 |
print(f" seed={ep['seed']:3d} {ep['contract']:12s}.{ep['target_function']:18s}"
|
| 213 |
f" score={ep['grader_score']:.1f} reward={ep['cumulative_reward']:+.2f}")
|
| 214 |
-
oracle_avg
|
| 215 |
-
|
| 216 |
-
print(f"\n Oracle avg: {oracle_avg:.3f} reward: {oracle_avg_r:+.2f}")
|
| 217 |
|
| 218 |
-
|
|
|
|
| 219 |
partial_eps = [partial_t1(env, seed_offset + i) for i in range(n)]
|
| 220 |
-
partial_avg =
|
| 221 |
-
print(f" Partial
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
vuln_seen: Dict[str, int] = {}
|
| 229 |
for ep in oracle_eps:
|
| 230 |
v = ep.get("vulnerability", "unknown")
|
|
@@ -233,16 +90,19 @@ def run_task1_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
|
|
| 233 |
for v in sorted(vuln_seen):
|
| 234 |
print(f" {vuln_seen[v]:2d}× {v}")
|
| 235 |
|
| 236 |
-
assert oracle_avg == 1.0
|
| 237 |
-
assert partial_avg == 0.5
|
| 238 |
-
assert
|
| 239 |
-
|
|
|
|
|
|
|
| 240 |
|
| 241 |
return {
|
| 242 |
"task_id": "task1_vuln_detection",
|
| 243 |
-
"oracle": {"avg_score": oracle_avg, "
|
| 244 |
"partial": {"avg_score": partial_avg, "episodes": partial_eps},
|
| 245 |
-
"
|
|
|
|
| 246 |
"vuln_coverage": vuln_seen,
|
| 247 |
}
|
| 248 |
|
|
@@ -255,7 +115,8 @@ def run_task2_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
|
|
| 255 |
print(f" Dataset: {len(get_all_property_entries(contracts))} property entries\n")
|
| 256 |
env = Task2Environment()
|
| 257 |
|
| 258 |
-
|
|
|
|
| 259 |
oracle_eps = []
|
| 260 |
for i in range(n):
|
| 261 |
ep = oracle_t2(env, seed_offset + i, verbose)
|
|
@@ -263,31 +124,40 @@ def run_task2_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
|
|
| 263 |
icon = "✅" if ep["grader_score"] >= 0.65 else "⚠️ "
|
| 264 |
print(f" {icon} seed={ep['seed']:3d} {ep['contract']:12s}.{ep['function']:18s}"
|
| 265 |
f" score={ep['grader_score']:.3f} reward={ep['cumulative_reward']:+.2f}")
|
| 266 |
-
oracle_avg
|
| 267 |
-
|
| 268 |
-
print(f"\n Oracle avg: {oracle_avg:.3f} reward: {oracle_avg_r:+.2f}")
|
| 269 |
|
|
|
|
| 270 |
print("\n▶ Partial (submits NatSpec comment):")
|
| 271 |
-
partial_eps
|
| 272 |
-
partial_avg
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
print("\n▶
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
print(f"
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
|
| 286 |
return {
|
| 287 |
"task_id": "task2_property_discovery",
|
| 288 |
-
"oracle": {"avg_score": oracle_avg, "
|
| 289 |
-
"partial": {"avg_score": partial_avg, "
|
| 290 |
-
"
|
|
|
|
| 291 |
}
|
| 292 |
|
| 293 |
|
|
@@ -299,40 +169,50 @@ def run_task3_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
|
|
| 299 |
print(f" Dataset: {len(get_all_task3_entries(contracts))} rule-check episodes\n")
|
| 300 |
env = Task3Environment()
|
| 301 |
|
| 302 |
-
|
|
|
|
| 303 |
oracle_eps = []
|
| 304 |
for i in range(n):
|
| 305 |
ep = oracle_t3(env, seed_offset + i, verbose)
|
| 306 |
oracle_eps.append(ep)
|
| 307 |
print(f" seed={ep['seed']:3d} {ep['contract']:12s}.{ep['target_function']:18s}"
|
| 308 |
f" score={ep['grader_score']:.1f} reward={ep['cumulative_reward']:+.2f}")
|
| 309 |
-
oracle_avg
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
print("\n▶ Subfunction (partial-credit callee
|
| 314 |
-
sub_eps
|
| 315 |
-
sub_avg
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
print(f"
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
|
| 331 |
return {
|
| 332 |
"task_id": "task3_rule_checker",
|
| 333 |
-
"oracle": {"avg_score": oracle_avg,
|
| 334 |
-
"subfunction": {"avg_score": sub_avg,
|
| 335 |
-
"
|
|
|
|
| 336 |
}
|
| 337 |
|
| 338 |
|
|
@@ -342,13 +222,18 @@ def run_task3_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
|
|
| 342 |
|
| 343 |
def main():
|
| 344 |
parser = argparse.ArgumentParser(
|
| 345 |
-
description="Evaluate Task 1, 2, and/or 3
|
| 346 |
)
|
| 347 |
-
parser.add_argument("--episodes", type=int,
|
| 348 |
-
|
| 349 |
-
parser.add_argument("--
|
| 350 |
-
|
| 351 |
-
parser.add_argument("--
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
args = parser.parse_args()
|
| 353 |
|
| 354 |
report: Dict[str, Any] = {"num_episodes": args.episodes, "seed_offset": args.seed}
|
|
@@ -360,14 +245,16 @@ def main():
|
|
| 360 |
if args.task in ("3", "all"):
|
| 361 |
report["task3"] = run_task3_eval(args.episodes, args.seed, args.verbose)
|
| 362 |
|
|
|
|
| 363 |
print("\n" + "=" * 64)
|
| 364 |
print("EVALUATION COMPLETE")
|
| 365 |
print("=" * 64)
|
| 366 |
-
|
| 367 |
-
("Task 1", "task1", ["oracle", "partial", "
|
| 368 |
-
("Task 2", "task2", ["oracle", "partial", "
|
| 369 |
-
("Task 3", "task3", ["oracle", "subfunction", "
|
| 370 |
-
]
|
|
|
|
| 371 |
if key in report:
|
| 372 |
scores = " ".join(
|
| 373 |
f"{t}={report[key][t]['avg_score']:.3f}" for t in tiers
|
|
|
|
| 3 |
-------
|
| 4 |
Evaluation harness for all three tasks.
|
| 5 |
|
| 6 |
+
Runs four agent tiers per task:
|
| 7 |
+
oracle – always submits the ground-truth answer (upper bound)
|
| 8 |
+
partial – right category, wrong detail (partial credit)
|
| 9 |
+
random – genuine random exploration + random submit (random baseline)
|
| 10 |
+
floor – always submits a guaranteed-wrong answer (lower bound)
|
| 11 |
|
| 12 |
Usage:
|
| 13 |
+
python eval.py # all tasks, 8 episodes each
|
| 14 |
+
python eval.py --task 1|2|3 # single task
|
| 15 |
python eval.py --episodes 16 --verbose
|
| 16 |
python eval.py --out results.json
|
| 17 |
"""
|
| 18 |
|
| 19 |
import argparse
|
| 20 |
import json
|
| 21 |
+
import random as _random
|
| 22 |
from typing import Any, Dict, List
|
| 23 |
|
| 24 |
+
from tasks.task1 import Task1Environment
|
| 25 |
+
from tasks.task2 import Task2Environment
|
| 26 |
+
from tasks.task3 import Task3Environment
|
| 27 |
+
from agents.task1 import oracle_t1, partial_t1, random_t1, floor_t1
|
| 28 |
+
from agents.task2 import oracle_t2, partial_t2, random_t2, floor_t2
|
| 29 |
+
from agents.task3 import oracle_t3, subfunction_t3, random_t3, floor_t3
|
| 30 |
from data.data_loader import (
|
| 31 |
load_contracts,
|
|
|
|
| 32 |
get_all_vulnerable_entries,
|
| 33 |
get_all_property_entries,
|
| 34 |
get_all_task3_entries,
|
| 35 |
)
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 38 |
+
# Evaluation runners
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 40 |
|
| 41 |
+
def _avg(episodes: List[Dict[str, Any]], key: str = "grader_score") -> float:
|
| 42 |
+
return sum(e[key] for e in episodes) / len(episodes) if episodes else 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
def run_task1_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
|
| 46 |
print("\n" + "=" * 64)
|
|
|
|
| 51 |
f"{len(get_all_vulnerable_entries(contracts))} vulnerable functions\n")
|
| 52 |
env = Task1Environment()
|
| 53 |
|
| 54 |
+
# Oracle
|
| 55 |
+
print("▶ Oracle (correct function + correct vuln → 1.0):")
|
| 56 |
oracle_eps = []
|
| 57 |
for i in range(n):
|
| 58 |
ep = oracle_t1(env, seed_offset + i, verbose)
|
| 59 |
oracle_eps.append(ep)
|
| 60 |
print(f" seed={ep['seed']:3d} {ep['contract']:12s}.{ep['target_function']:18s}"
|
| 61 |
f" score={ep['grader_score']:.1f} reward={ep['cumulative_reward']:+.2f}")
|
| 62 |
+
oracle_avg = _avg(oracle_eps)
|
| 63 |
+
print(f"\n Oracle avg: {oracle_avg:.3f}")
|
|
|
|
| 64 |
|
| 65 |
+
# Partial
|
| 66 |
+
print("\n▶ Partial (correct function, 'unknown' vuln → 0.5):")
|
| 67 |
partial_eps = [partial_t1(env, seed_offset + i) for i in range(n)]
|
| 68 |
+
partial_avg = _avg(partial_eps)
|
| 69 |
+
print(f" Partial avg: {partial_avg:.3f}")
|
| 70 |
+
|
| 71 |
+
# Random
|
| 72 |
+
print("\n▶ Random (random fn from list + random vuln type):")
|
| 73 |
+
random_eps = [random_t1(env, seed_offset + i) for i in range(n)]
|
| 74 |
+
random_avg = _avg(random_eps)
|
| 75 |
+
submitted = [(e.get("submitted_fn", "?"), e.get("submitted_vuln", "?")) for e in random_eps]
|
| 76 |
+
print(f" Random avg: {random_avg:.3f} submissions: {submitted}")
|
| 77 |
+
|
| 78 |
+
# Floor
|
| 79 |
+
print("\n▶ Floor (always 'constructor' → 0.0):")
|
| 80 |
+
floor_eps = [floor_t1(env, seed_offset + i) for i in range(n)]
|
| 81 |
+
floor_avg = _avg(floor_eps)
|
| 82 |
+
print(f" Floor avg: {floor_avg:.3f}")
|
| 83 |
+
|
| 84 |
+
# Vulnerability type coverage
|
| 85 |
vuln_seen: Dict[str, int] = {}
|
| 86 |
for ep in oracle_eps:
|
| 87 |
v = ep.get("vulnerability", "unknown")
|
|
|
|
| 90 |
for v in sorted(vuln_seen):
|
| 91 |
print(f" {vuln_seen[v]:2d}× {v}")
|
| 92 |
|
| 93 |
+
assert oracle_avg == 1.0, f"Oracle avg {oracle_avg:.3f} should be 1.0"
|
| 94 |
+
assert partial_avg == 0.5, f"Partial avg {partial_avg:.3f} should be 0.5"
|
| 95 |
+
assert floor_avg == 0.0, f"Floor avg {floor_avg:.3f} should be 0.0"
|
| 96 |
+
assert oracle_avg >= random_avg >= floor_avg, \
|
| 97 |
+
f"Score ordering violated: oracle={oracle_avg}, random={random_avg}, floor={floor_avg}"
|
| 98 |
+
print(f"\n ✅ Task 1: oracle(1.0) ≥ partial(0.5) ≥ random({random_avg:.3f}) ≥ floor(0.0)")
|
| 99 |
|
| 100 |
return {
|
| 101 |
"task_id": "task1_vuln_detection",
|
| 102 |
+
"oracle": {"avg_score": oracle_avg, "episodes": oracle_eps},
|
| 103 |
"partial": {"avg_score": partial_avg, "episodes": partial_eps},
|
| 104 |
+
"random": {"avg_score": random_avg, "episodes": random_eps},
|
| 105 |
+
"floor": {"avg_score": floor_avg, "episodes": floor_eps},
|
| 106 |
"vuln_coverage": vuln_seen,
|
| 107 |
}
|
| 108 |
|
|
|
|
| 115 |
print(f" Dataset: {len(get_all_property_entries(contracts))} property entries\n")
|
| 116 |
env = Task2Environment()
|
| 117 |
|
| 118 |
+
# Oracle
|
| 119 |
+
print("▶ Oracle (submits ground-truth natural language):")
|
| 120 |
oracle_eps = []
|
| 121 |
for i in range(n):
|
| 122 |
ep = oracle_t2(env, seed_offset + i, verbose)
|
|
|
|
| 124 |
icon = "✅" if ep["grader_score"] >= 0.65 else "⚠️ "
|
| 125 |
print(f" {icon} seed={ep['seed']:3d} {ep['contract']:12s}.{ep['function']:18s}"
|
| 126 |
f" score={ep['grader_score']:.3f} reward={ep['cumulative_reward']:+.2f}")
|
| 127 |
+
oracle_avg = _avg(oracle_eps)
|
| 128 |
+
print(f"\n Oracle avg: {oracle_avg:.3f}")
|
|
|
|
| 129 |
|
| 130 |
+
# Partial
|
| 131 |
print("\n▶ Partial (submits NatSpec comment):")
|
| 132 |
+
partial_eps = [partial_t2(env, seed_offset + i) for i in range(n)]
|
| 133 |
+
partial_avg = _avg(partial_eps)
|
| 134 |
+
print(f" Partial avg: {partial_avg:.3f}")
|
| 135 |
+
|
| 136 |
+
# Random
|
| 137 |
+
print("\n▶ Random (random browse + generic property template):")
|
| 138 |
+
random_eps = [random_t2(env, seed_offset + i) for i in range(n)]
|
| 139 |
+
random_avg = _avg(random_eps)
|
| 140 |
+
print(f" Random avg: {random_avg:.3f}")
|
| 141 |
+
|
| 142 |
+
# Floor
|
| 143 |
+
print("\n▶ Floor (submits empty string → 0.0):")
|
| 144 |
+
floor_eps = [floor_t2(env, seed_offset + i) for i in range(n)]
|
| 145 |
+
floor_avg = _avg(floor_eps)
|
| 146 |
+
print(f" Floor avg: {floor_avg:.3f}")
|
| 147 |
+
|
| 148 |
+
assert oracle_avg > 0.60, f"Oracle avg {oracle_avg:.3f} should be > 0.60"
|
| 149 |
+
assert oracle_avg > partial_avg >= floor_avg, \
|
| 150 |
+
"Score ordering violated: oracle > partial >= floor"
|
| 151 |
+
assert floor_avg == 0.0, f"Floor avg {floor_avg:.3f} should be 0.0"
|
| 152 |
+
print(f"\n ✅ Task 2: oracle({oracle_avg:.3f}) > partial({partial_avg:.3f})"
|
| 153 |
+
f" ≥ random({random_avg:.3f}) ≥ floor(0.0)")
|
| 154 |
|
| 155 |
return {
|
| 156 |
"task_id": "task2_property_discovery",
|
| 157 |
+
"oracle": {"avg_score": oracle_avg, "episodes": oracle_eps},
|
| 158 |
+
"partial": {"avg_score": partial_avg, "episodes": partial_eps},
|
| 159 |
+
"random": {"avg_score": random_avg, "episodes": random_eps},
|
| 160 |
+
"floor": {"avg_score": floor_avg, "episodes": floor_eps},
|
| 161 |
}
|
| 162 |
|
| 163 |
|
|
|
|
| 169 |
print(f" Dataset: {len(get_all_task3_entries(contracts))} rule-check episodes\n")
|
| 170 |
env = Task3Environment()
|
| 171 |
|
| 172 |
+
# Oracle
|
| 173 |
+
print("▶ Oracle (exact target function → 1.0):")
|
| 174 |
oracle_eps = []
|
| 175 |
for i in range(n):
|
| 176 |
ep = oracle_t3(env, seed_offset + i, verbose)
|
| 177 |
oracle_eps.append(ep)
|
| 178 |
print(f" seed={ep['seed']:3d} {ep['contract']:12s}.{ep['target_function']:18s}"
|
| 179 |
f" score={ep['grader_score']:.1f} reward={ep['cumulative_reward']:+.2f}")
|
| 180 |
+
oracle_avg = _avg(oracle_eps)
|
| 181 |
+
print(f"\n Oracle avg: {oracle_avg:.3f}")
|
| 182 |
+
|
| 183 |
+
# Subfunction (partial credit)
|
| 184 |
+
print("\n▶ Subfunction (partial-credit callee if exists, else constructor):")
|
| 185 |
+
sub_eps = [subfunction_t3(env, seed_offset + i) for i in range(n)]
|
| 186 |
+
sub_avg = _avg(sub_eps)
|
| 187 |
+
submitted_sub = list({e.get("submitted", "?") for e in sub_eps})
|
| 188 |
+
print(f" Subfunction avg: {sub_avg:.3f} submitted: {submitted_sub}")
|
| 189 |
+
|
| 190 |
+
# Random
|
| 191 |
+
print("\n▶ Random (lists functions, submits uniformly random one):")
|
| 192 |
+
random_eps = [random_t3(env, seed_offset + i) for i in range(n)]
|
| 193 |
+
random_avg = _avg(random_eps)
|
| 194 |
+
submitted_rand = [e.get("submitted", "?") for e in random_eps]
|
| 195 |
+
print(f" Random avg: {random_avg:.3f} submitted: {submitted_rand}")
|
| 196 |
+
|
| 197 |
+
# Floor
|
| 198 |
+
print("\n▶ Floor (always 'constructor' → 0.0):")
|
| 199 |
+
floor_eps = [floor_t3(env, seed_offset + i) for i in range(n)]
|
| 200 |
+
floor_avg = _avg(floor_eps)
|
| 201 |
+
print(f" Floor avg: {floor_avg:.3f}")
|
| 202 |
+
|
| 203 |
+
assert oracle_avg == 1.0, f"Oracle avg {oracle_avg:.3f} should be 1.0"
|
| 204 |
+
assert floor_avg == 0.0, f"Floor avg {floor_avg:.3f} should be 0.0"
|
| 205 |
+
assert oracle_avg >= random_avg >= floor_avg, \
|
| 206 |
+
f"Score ordering violated: oracle={oracle_avg}, random={random_avg}, floor={floor_avg}"
|
| 207 |
+
print(f"\n ✅ Task 3: oracle(1.0) ≥ subfunction({sub_avg:.3f})"
|
| 208 |
+
f" ≥ random({random_avg:.3f}) ≥ floor(0.0)")
|
| 209 |
|
| 210 |
return {
|
| 211 |
"task_id": "task3_rule_checker",
|
| 212 |
+
"oracle": {"avg_score": oracle_avg, "episodes": oracle_eps},
|
| 213 |
+
"subfunction": {"avg_score": sub_avg, "episodes": sub_eps},
|
| 214 |
+
"random": {"avg_score": random_avg, "episodes": random_eps},
|
| 215 |
+
"floor": {"avg_score": floor_avg, "episodes": floor_eps},
|
| 216 |
}
|
| 217 |
|
| 218 |
|
|
|
|
| 222 |
|
| 223 |
def main():
|
| 224 |
parser = argparse.ArgumentParser(
|
| 225 |
+
description="Evaluate Task 1, 2, and/or 3 — oracle / partial / random / floor"
|
| 226 |
)
|
| 227 |
+
parser.add_argument("--episodes", type=int, default=8,
|
| 228 |
+
help="Episodes per agent tier (default: 8)")
|
| 229 |
+
parser.add_argument("--seed", type=int, default=42,
|
| 230 |
+
help="Starting RNG seed (default: 42)")
|
| 231 |
+
parser.add_argument("--task", choices=["1", "2", "3", "all"], default="all",
|
| 232 |
+
help="Which task(s) from [1, 2, 3] to evaluate (default: all)")
|
| 233 |
+
parser.add_argument("--verbose", action="store_true",
|
| 234 |
+
help="Print per-episode target details for oracle agents")
|
| 235 |
+
parser.add_argument("--out", default="eval_results.json",
|
| 236 |
+
help="Output JSON file (default: eval_results.json)")
|
| 237 |
args = parser.parse_args()
|
| 238 |
|
| 239 |
report: Dict[str, Any] = {"num_episodes": args.episodes, "seed_offset": args.seed}
|
|
|
|
| 245 |
if args.task in ("3", "all"):
|
| 246 |
report["task3"] = run_task3_eval(args.episodes, args.seed, args.verbose)
|
| 247 |
|
| 248 |
+
# ── Summary ──────────────────────────────────────────────────────────────
|
| 249 |
print("\n" + "=" * 64)
|
| 250 |
print("EVALUATION COMPLETE")
|
| 251 |
print("=" * 64)
|
| 252 |
+
rows = [
|
| 253 |
+
("Task 1", "task1", ["oracle", "partial", "random", "floor"]),
|
| 254 |
+
("Task 2", "task2", ["oracle", "partial", "random", "floor"]),
|
| 255 |
+
("Task 3", "task3", ["oracle", "subfunction", "random", "floor"]),
|
| 256 |
+
]
|
| 257 |
+
for label, key, tiers in rows:
|
| 258 |
if key in report:
|
| 259 |
scores = " ".join(
|
| 260 |
f"{t}={report[key][t]['avg_score']:.3f}" for t in tiers
|
inference.py
CHANGED
|
@@ -1,36 +1,40 @@
|
|
| 1 |
"""
|
| 2 |
inference.py
|
| 3 |
------------
|
| 4 |
-
Baseline inference script
|
| 5 |
-
Implements Task 1 (Vulnerability Detection), Task 2 (Property Discovery),
|
| 6 |
-
and Task 3 (Rule Checker).
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
Usage:
|
| 14 |
python inference.py
|
| 15 |
|
| 16 |
Output:
|
| 17 |
-
|
| 18 |
-
Final baseline scores written to baseline_scores.json.
|
| 19 |
-
|
| 20 |
-
Runtime: < 5 minutes on 3 episodes per task with gpt-4o-mini.
|
| 21 |
"""
|
| 22 |
|
|
|
|
| 23 |
import json
|
| 24 |
import os
|
| 25 |
import sys
|
| 26 |
import time
|
| 27 |
-
from typing import Any, Dict, List
|
| 28 |
|
| 29 |
from openai import OpenAI
|
| 30 |
|
| 31 |
-
from tasks.task1
|
| 32 |
-
from tasks.task2
|
| 33 |
-
from tasks.task3
|
| 34 |
from env.schemas import Action, ActionType
|
| 35 |
from utils import T1_SYSTEM, T2_SYSTEM, T3_SYSTEM
|
| 36 |
|
|
@@ -38,83 +42,152 @@ from utils import T1_SYSTEM, T2_SYSTEM, T3_SYSTEM
|
|
| 38 |
# Configuration
|
| 39 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 40 |
|
| 41 |
-
API_BASE_URL = os.
|
| 42 |
-
MODEL_NAME = os.
|
| 43 |
-
HF_TOKEN = os.
|
| 44 |
|
| 45 |
if not HF_TOKEN:
|
| 46 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
NUM_EPISODES = 3
|
| 51 |
-
SEED_BASE_T1 = 42
|
| 52 |
-
SEED_BASE_T2 = 10
|
| 53 |
|
| 54 |
client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
|
| 55 |
|
|
|
|
| 56 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 57 |
-
#
|
| 58 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
def _t1_user_msg(obs: Dict[str, Any]) -> str:
|
| 62 |
return (
|
| 63 |
f"Contract: {obs['contract_name']}\n"
|
| 64 |
f"Description: {obs['contract_description']}\n"
|
| 65 |
-
f"Step: {obs['step_count']} | Reward: {obs['cumulative_reward']:.2f}\n\n"
|
| 66 |
-
f"Last action: {obs['last_action'] or 'None'}\n"
|
| 67 |
-
f"
|
| 68 |
)
|
| 69 |
|
| 70 |
|
| 71 |
-
def
|
| 72 |
-
|
|
|
|
| 73 |
obs = r.observation.model_dump()
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
|
| 116 |
# ──────────────────────────────────────────���──────────────────────────────────
|
| 117 |
-
# Task 2
|
| 118 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 119 |
|
| 120 |
|
|
@@ -124,198 +197,246 @@ def _t2_user_msg(obs: Dict[str, Any]) -> str:
|
|
| 124 |
f"Contract : {obs['contract_name']}\n"
|
| 125 |
f"Function : {extra.get('target_function', '?')} "
|
| 126 |
f"({extra.get('target_signature', '')})\n"
|
| 127 |
-
f"Step: {obs['step_count']} | Reward: {obs['cumulative_reward']:.2f}\n\n"
|
| 128 |
-
f"Last action: {obs['last_action'] or 'None'}\n"
|
| 129 |
-
f"
|
| 130 |
)
|
| 131 |
|
| 132 |
|
| 133 |
-
def
|
| 134 |
-
|
|
|
|
| 135 |
obs = r.observation.model_dump()
|
| 136 |
fn = obs["extra"].get("target_function", "?")
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
|
| 180 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 181 |
-
# Task
|
| 182 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 183 |
|
| 184 |
-
def run_task1(n: int = NUM_EPISODES) -> Dict[str, Any]:
|
| 185 |
-
print("\n" + "="*60)
|
| 186 |
-
print("TASK 1: Targeted Vulnerability Detection")
|
| 187 |
-
print("="*60)
|
| 188 |
-
env = Task1Environment()
|
| 189 |
-
episodes = [run_t1_episode(env, SEED_BASE_T1 + i, i+1) for i in range(n)]
|
| 190 |
-
avg_s = sum(e["grader_score"] for e in episodes) / n
|
| 191 |
-
avg_r = sum(e["cumulative_reward"] for e in episodes) / n
|
| 192 |
-
print(f"\n Avg grader score : {avg_s:.3f}")
|
| 193 |
-
print(f" Avg cum reward : {avg_r:.2f}")
|
| 194 |
-
return {"task_id": "task1_vuln_detection", "name": "Targeted Vulnerability Detection",
|
| 195 |
-
"status": "active", "num_episodes": n, "episodes": episodes,
|
| 196 |
-
"avg_grader_score": avg_s, "avg_cumulative_reward": avg_r}
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
def run_task2(n: int = NUM_EPISODES) -> Dict[str, Any]:
|
| 200 |
-
print("\n" + "="*60)
|
| 201 |
-
print("TASK 2: Property Discovery")
|
| 202 |
-
print("="*60)
|
| 203 |
-
env = Task2Environment()
|
| 204 |
-
episodes = [run_t2_episode(env, SEED_BASE_T2 + i, i+1) for i in range(n)]
|
| 205 |
-
avg_s = sum(e["grader_score"] for e in episodes) / n
|
| 206 |
-
avg_r = sum(e["cumulative_reward"] for e in episodes) / n
|
| 207 |
-
print(f"\n Avg grader score : {avg_s:.3f}")
|
| 208 |
-
print(f" Avg cum reward : {avg_r:.2f}")
|
| 209 |
-
return {"task_id": "task2_property_discovery", "name": "Property Discovery",
|
| 210 |
-
"status": "active", "num_episodes": n, "episodes": episodes,
|
| 211 |
-
"avg_grader_score": avg_s, "avg_cumulative_reward": avg_r}
|
| 212 |
-
|
| 213 |
|
| 214 |
def _t3_user_msg(obs: Dict[str, Any]) -> str:
|
| 215 |
extra = obs.get("extra", {})
|
| 216 |
return (
|
| 217 |
f"Contract : {obs['contract_name']}\n"
|
| 218 |
-
f"Property : {extra.get('property_english', '(
|
| 219 |
-
f"Step: {obs['step_count']} | Reward: {obs['cumulative_reward']:.2f}\n\n"
|
| 220 |
-
f"Last action: {obs['last_action'] or 'None'}\n"
|
| 221 |
-
f"
|
| 222 |
)
|
| 223 |
|
| 224 |
|
| 225 |
-
def
|
|
|
|
| 226 |
r = env.reset(seed=seed)
|
| 227 |
obs = r.observation.model_dump()
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
messages
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
|
| 271 |
def run_task3(n: int = NUM_EPISODES) -> Dict[str, Any]:
|
| 272 |
-
print("\n" + "="*60)
|
| 273 |
-
print("TASK 3: Rule Checker")
|
| 274 |
-
print("="*60)
|
| 275 |
-
env
|
| 276 |
-
episodes = [
|
| 277 |
-
avg_s
|
| 278 |
-
avg_r
|
| 279 |
-
print(f"\n Avg grader score
|
| 280 |
-
print(f" Avg cum reward
|
| 281 |
-
return {
|
| 282 |
-
|
| 283 |
-
|
|
|
|
|
|
|
| 284 |
|
| 285 |
|
| 286 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 287 |
# Main
|
| 288 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 289 |
|
| 290 |
-
def main():
|
| 291 |
-
|
| 292 |
-
print(
|
|
|
|
| 293 |
|
| 294 |
t1 = run_task1(NUM_EPISODES)
|
| 295 |
t2 = run_task2(NUM_EPISODES)
|
| 296 |
t3 = run_task3(NUM_EPISODES)
|
| 297 |
|
| 298 |
results = {
|
| 299 |
-
"model":
|
| 300 |
-
"
|
|
|
|
| 301 |
}
|
| 302 |
-
|
| 303 |
-
active = results["tasks"]
|
| 304 |
-
overall = sum(t["avg_grader_score"] for t in active) / len(active)
|
| 305 |
results["overall_avg_score"] = overall
|
| 306 |
|
| 307 |
-
print("\n" + "="*60)
|
| 308 |
-
print("BASELINE SUMMARY")
|
| 309 |
-
print("="*60)
|
| 310 |
for t in results["tasks"]:
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
print(f"\n Overall (active tasks): {overall:.3f}")
|
| 314 |
|
| 315 |
with open("baseline_scores.json", "w") as f:
|
| 316 |
json.dump(results, f, indent=2)
|
| 317 |
-
print("\n Scores written to baseline_scores.json")
|
| 318 |
|
| 319 |
|
| 320 |
if __name__ == "__main__":
|
| 321 |
-
main()
|
|
|
|
| 1 |
"""
|
| 2 |
inference.py
|
| 3 |
------------
|
| 4 |
+
Baseline inference script — Smart Contract Audit RL Environment.
|
|
|
|
|
|
|
| 5 |
|
| 6 |
+
Implements agents for all three tasks using the OpenAI-compatible client.
|
| 7 |
+
Emits mandatory structured stdout in the OpenEnv format.
|
| 8 |
+
|
| 9 |
+
MANDATORY ENV VARS:
|
| 10 |
+
API_BASE_URL LLM API endpoint (default: https://api.openai.com/v1)
|
| 11 |
+
MODEL_NAME Model identifier (default: gpt-4o-mini)
|
| 12 |
+
HF_TOKEN API key / HF token
|
| 13 |
+
|
| 14 |
+
MANDATORY STDOUT FORMAT (per episode):
|
| 15 |
+
[START] task=<id> env=smart-contract-audit model=<model>
|
| 16 |
+
[STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<str|null>
|
| 17 |
+
[END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...>
|
| 18 |
|
| 19 |
Usage:
|
| 20 |
python inference.py
|
| 21 |
|
| 22 |
Output:
|
| 23 |
+
Structured stdout per episode, plus baseline_scores.json summary.
|
|
|
|
|
|
|
|
|
|
| 24 |
"""
|
| 25 |
|
| 26 |
+
import asyncio
|
| 27 |
import json
|
| 28 |
import os
|
| 29 |
import sys
|
| 30 |
import time
|
| 31 |
+
from typing import Any, Dict, List, Optional
|
| 32 |
|
| 33 |
from openai import OpenAI
|
| 34 |
|
| 35 |
+
from tasks.task1 import Task1Environment
|
| 36 |
+
from tasks.task2 import Task2Environment
|
| 37 |
+
from tasks.task3 import Task3Environment
|
| 38 |
from env.schemas import Action, ActionType
|
| 39 |
from utils import T1_SYSTEM, T2_SYSTEM, T3_SYSTEM
|
| 40 |
|
|
|
|
| 42 |
# Configuration
|
| 43 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 44 |
|
| 45 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
|
| 46 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
|
| 47 |
+
HF_TOKEN = os.getenv("HF_TOKEN", "")
|
| 48 |
|
| 49 |
if not HF_TOKEN:
|
| 50 |
+
print("[WARN] HF_TOKEN not set — API calls may fail.", file=sys.stderr)
|
| 51 |
+
|
| 52 |
+
# Benchmark / environment identifier (constant for this env)
|
| 53 |
+
ENV_BENCHMARK = "smart-contract-audit"
|
| 54 |
+
|
| 55 |
+
# Episodes per task
|
| 56 |
+
NUM_EPISODES = 3
|
| 57 |
+
SEED_BASE = 42
|
| 58 |
+
|
| 59 |
+
# Max steps per task
|
| 60 |
+
MAX_STEPS_T1 = 15
|
| 61 |
+
MAX_STEPS_T2 = 10
|
| 62 |
+
MAX_STEPS_T3 = 12
|
| 63 |
|
| 64 |
+
# A grader_score >= this is considered a "success" for the [END] line
|
| 65 |
+
SUCCESS_SCORE_THRESHOLD = 0.5
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
|
| 68 |
|
| 69 |
+
|
| 70 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 71 |
+
# Mandatory stdout helpers
|
| 72 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 73 |
|
| 74 |
+
def log_start(task: str, env: str, model: str) -> None:
|
| 75 |
+
"""Emit the [START] line — one per episode."""
|
| 76 |
+
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def log_step(
|
| 80 |
+
step: int,
|
| 81 |
+
action: str,
|
| 82 |
+
reward: float,
|
| 83 |
+
done: bool,
|
| 84 |
+
error: Optional[str] = None,
|
| 85 |
+
) -> None:
|
| 86 |
+
"""Emit a [STEP] line — one per env.step() call."""
|
| 87 |
+
error_val = error if error else "null"
|
| 88 |
+
print(
|
| 89 |
+
f"[STEP] step={step} action={action} "
|
| 90 |
+
f"reward={reward:.2f} done={str(done).lower()} error={error_val}",
|
| 91 |
+
flush=True,
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def log_end(
|
| 96 |
+
success: bool,
|
| 97 |
+
steps: int,
|
| 98 |
+
score: float,
|
| 99 |
+
rewards: List[float],
|
| 100 |
+
) -> None:
|
| 101 |
+
"""Emit the [END] line — one per episode, always emitted."""
|
| 102 |
+
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 103 |
+
print(
|
| 104 |
+
f"[END] success={str(success).lower()} steps={steps} "
|
| 105 |
+
f"score={score:.3f} rewards={rewards_str}",
|
| 106 |
+
flush=True,
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 111 |
+
# Task 1 — Targeted Vulnerability Detection
|
| 112 |
+
# ──────────────���──────────────────────────────────────────────────────────────
|
| 113 |
|
| 114 |
def _t1_user_msg(obs: Dict[str, Any]) -> str:
|
| 115 |
return (
|
| 116 |
f"Contract: {obs['contract_name']}\n"
|
| 117 |
f"Description: {obs['contract_description']}\n"
|
| 118 |
+
f"Step: {obs['step_count']} | Reward so far: {obs['cumulative_reward']:.2f}\n\n"
|
| 119 |
+
f"Last action : {obs['last_action'] or 'None'}\n"
|
| 120 |
+
f"Last result : {obs['last_action_result'] or 'Episode just started.'}"
|
| 121 |
)
|
| 122 |
|
| 123 |
|
| 124 |
+
def _run_t1_episode(env: Task1Environment, seed: int, ep_num: int) -> Dict[str, Any]:
|
| 125 |
+
"""Run one Task 1 episode; emit [START]/[STEP]/[END]."""
|
| 126 |
+
r = env.reset(seed=seed)
|
| 127 |
obs = r.observation.model_dump()
|
| 128 |
+
|
| 129 |
+
log_start(task="task1_vuln_detection", env=ENV_BENCHMARK, model=MODEL_NAME)
|
| 130 |
+
|
| 131 |
+
messages = [{"role": "system", "content": T1_SYSTEM}]
|
| 132 |
+
step_rewards: List[float] = []
|
| 133 |
+
grader_score = 0.0
|
| 134 |
+
steps_taken = 0
|
| 135 |
+
error_msg: Optional[str] = None
|
| 136 |
+
|
| 137 |
+
try:
|
| 138 |
+
for step in range(1, MAX_STEPS_T1 + 1):
|
| 139 |
+
messages.append({"role": "user", "content": _t1_user_msg(obs)})
|
| 140 |
+
try:
|
| 141 |
+
resp = client.chat.completions.create(
|
| 142 |
+
model=MODEL_NAME, messages=messages,
|
| 143 |
+
max_tokens=200, temperature=0.0,
|
| 144 |
+
)
|
| 145 |
+
raw = resp.choices[0].message.content.strip() # type: ignore
|
| 146 |
+
error_msg = None
|
| 147 |
+
except Exception as e:
|
| 148 |
+
raw = ""
|
| 149 |
+
error_msg = str(e)[:80]
|
| 150 |
+
print(f"[DEBUG] T1 LLM error ep={ep_num} step={step}: {e}", file=sys.stderr)
|
| 151 |
+
|
| 152 |
+
try:
|
| 153 |
+
parsed = json.loads(raw)
|
| 154 |
+
at = ActionType(parsed["action"])
|
| 155 |
+
params = parsed.get("params", {})
|
| 156 |
+
except Exception:
|
| 157 |
+
at, params = ActionType.LIST_FUNCTIONS, {}
|
| 158 |
+
|
| 159 |
+
messages.append({"role": "assistant", "content": raw})
|
| 160 |
+
result = env.step(Action(action_type=at, params=params))
|
| 161 |
+
obs = result.observation.model_dump()
|
| 162 |
+
r_val = result.reward.value
|
| 163 |
+
done = result.done
|
| 164 |
+
|
| 165 |
+
step_rewards.append(r_val)
|
| 166 |
+
steps_taken = step
|
| 167 |
+
log_step(step=step, action=at.value, reward=r_val, done=done, error=error_msg)
|
| 168 |
+
|
| 169 |
+
if done:
|
| 170 |
+
v = r_val
|
| 171 |
+
grader_score = 1.0 if v >= 4.9 else (0.5 if v >= 0.9 else 0.0)
|
| 172 |
+
break
|
| 173 |
+
|
| 174 |
+
time.sleep(0.3)
|
| 175 |
+
|
| 176 |
+
finally:
|
| 177 |
+
success = grader_score >= SUCCESS_SCORE_THRESHOLD
|
| 178 |
+
log_end(success=success, steps=steps_taken, score=grader_score, rewards=step_rewards)
|
| 179 |
+
|
| 180 |
+
return {
|
| 181 |
+
"episode": ep_num,
|
| 182 |
+
"seed": seed,
|
| 183 |
+
"contract": obs["contract_name"],
|
| 184 |
+
"grader_score": grader_score,
|
| 185 |
+
"cumulative_reward": obs["cumulative_reward"],
|
| 186 |
+
}
|
| 187 |
|
| 188 |
|
| 189 |
# ──────────────────────────────────────────���──────────────────────────────────
|
| 190 |
+
# Task 2 — Property Discovery
|
| 191 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 192 |
|
| 193 |
|
|
|
|
| 197 |
f"Contract : {obs['contract_name']}\n"
|
| 198 |
f"Function : {extra.get('target_function', '?')} "
|
| 199 |
f"({extra.get('target_signature', '')})\n"
|
| 200 |
+
f"Step: {obs['step_count']} | Reward so far: {obs['cumulative_reward']:.2f}\n\n"
|
| 201 |
+
f"Last action : {obs['last_action'] or 'None'}\n"
|
| 202 |
+
f"Last result :\n{obs['last_action_result'] or 'Episode just started.'}"
|
| 203 |
)
|
| 204 |
|
| 205 |
|
| 206 |
+
def _run_t2_episode(env: Task2Environment, seed: int, ep_num: int) -> Dict[str, Any]:
|
| 207 |
+
"""Run one Task 2 episode; emit [START]/[STEP]/[END]."""
|
| 208 |
+
r = env.reset(seed=seed)
|
| 209 |
obs = r.observation.model_dump()
|
| 210 |
fn = obs["extra"].get("target_function", "?")
|
| 211 |
+
|
| 212 |
+
log_start(task="task2_property_discovery", env=ENV_BENCHMARK, model=MODEL_NAME)
|
| 213 |
+
|
| 214 |
+
messages = [{"role": "system", "content": T2_SYSTEM}]
|
| 215 |
+
step_rewards: List[float] = []
|
| 216 |
+
grader_score = 0.0
|
| 217 |
+
steps_taken = 0
|
| 218 |
+
error_msg: Optional[str] = None
|
| 219 |
+
|
| 220 |
+
try:
|
| 221 |
+
for step in range(1, MAX_STEPS_T2 + 1):
|
| 222 |
+
messages.append({"role": "user", "content": _t2_user_msg(obs)})
|
| 223 |
+
try:
|
| 224 |
+
resp = client.chat.completions.create(
|
| 225 |
+
model=MODEL_NAME, messages=messages,
|
| 226 |
+
max_tokens=400, temperature=0.0,
|
| 227 |
+
)
|
| 228 |
+
raw = resp.choices[0].message.content.strip() # type: ignore
|
| 229 |
+
error_msg = None
|
| 230 |
+
except Exception as e:
|
| 231 |
+
raw = ""
|
| 232 |
+
error_msg = str(e)[:80]
|
| 233 |
+
print(f"[DEBUG] T2 LLM error ep={ep_num} step={step}: {e}", file=sys.stderr)
|
| 234 |
+
|
| 235 |
+
try:
|
| 236 |
+
parsed = json.loads(raw)
|
| 237 |
+
at = ActionType(parsed["action"])
|
| 238 |
+
params = parsed.get("params", {})
|
| 239 |
+
except Exception:
|
| 240 |
+
at, params = ActionType.GET_FUNCTION_CODE, {}
|
| 241 |
+
|
| 242 |
+
messages.append({"role": "assistant", "content": raw})
|
| 243 |
+
result = env.step(Action(action_type=at, params=params))
|
| 244 |
+
obs = result.observation.model_dump()
|
| 245 |
+
r_val = result.reward.value
|
| 246 |
+
done = result.done
|
| 247 |
+
|
| 248 |
+
step_rewards.append(r_val)
|
| 249 |
+
steps_taken = step
|
| 250 |
+
log_step(step=step, action=at.value, reward=r_val, done=done, error=error_msg)
|
| 251 |
+
|
| 252 |
+
if done:
|
| 253 |
+
grader_score = round(r_val / 5.0, 3) if r_val > 0 else 0.0
|
| 254 |
+
break
|
| 255 |
+
|
| 256 |
+
time.sleep(0.3)
|
| 257 |
+
|
| 258 |
+
finally:
|
| 259 |
+
success = grader_score >= SUCCESS_SCORE_THRESHOLD
|
| 260 |
+
log_end(success=success, steps=steps_taken, score=grader_score, rewards=step_rewards)
|
| 261 |
+
|
| 262 |
+
return {
|
| 263 |
+
"episode": ep_num,
|
| 264 |
+
"seed": seed,
|
| 265 |
+
"contract": obs["contract_name"],
|
| 266 |
+
"function": fn,
|
| 267 |
+
"grader_score": grader_score,
|
| 268 |
+
"cumulative_reward": obs["cumulative_reward"],
|
| 269 |
+
}
|
| 270 |
|
| 271 |
|
| 272 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 273 |
+
# Task 3 — Rule Checker
|
| 274 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 275 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
|
| 277 |
def _t3_user_msg(obs: Dict[str, Any]) -> str:
|
| 278 |
extra = obs.get("extra", {})
|
| 279 |
return (
|
| 280 |
f"Contract : {obs['contract_name']}\n"
|
| 281 |
+
f"Property : {extra.get('property_english', '(none)')}\n"
|
| 282 |
+
f"Step: {obs['step_count']} | Reward so far: {obs['cumulative_reward']:.2f}\n\n"
|
| 283 |
+
f"Last action : {obs['last_action'] or 'None'}\n"
|
| 284 |
+
f"Last result :\n{obs['last_action_result'] or 'Episode just started.'}"
|
| 285 |
)
|
| 286 |
|
| 287 |
|
| 288 |
+
def _run_t3_episode(env: Task3Environment, seed: int, ep_num: int) -> Dict[str, Any]:
|
| 289 |
+
"""Run one Task 3 episode; emit [START]/[STEP]/[END]."""
|
| 290 |
r = env.reset(seed=seed)
|
| 291 |
obs = r.observation.model_dump()
|
| 292 |
+
|
| 293 |
+
log_start(task="task3_rule_checker", env=ENV_BENCHMARK, model=MODEL_NAME)
|
| 294 |
+
|
| 295 |
+
messages = [{"role": "system", "content": T3_SYSTEM}]
|
| 296 |
+
step_rewards: List[float] = []
|
| 297 |
+
grader_score = 0.0
|
| 298 |
+
steps_taken = 0
|
| 299 |
+
error_msg: Optional[str] = None
|
| 300 |
+
|
| 301 |
+
try:
|
| 302 |
+
for step in range(1, MAX_STEPS_T3 + 1):
|
| 303 |
+
messages.append({"role": "user", "content": _t3_user_msg(obs)})
|
| 304 |
+
try:
|
| 305 |
+
resp = client.chat.completions.create(
|
| 306 |
+
model=MODEL_NAME, messages=messages,
|
| 307 |
+
max_tokens=200, temperature=0.0,
|
| 308 |
+
)
|
| 309 |
+
raw = resp.choices[0].message.content.strip() # type: ignore
|
| 310 |
+
error_msg = None
|
| 311 |
+
except Exception as e:
|
| 312 |
+
raw = ""
|
| 313 |
+
error_msg = str(e)[:80]
|
| 314 |
+
print(f"[DEBUG] T3 LLM error ep={ep_num} step={step}: {e}", file=sys.stderr)
|
| 315 |
+
|
| 316 |
+
try:
|
| 317 |
+
parsed = json.loads(raw)
|
| 318 |
+
at = ActionType(parsed["action"])
|
| 319 |
+
params = parsed.get("params", {})
|
| 320 |
+
except Exception:
|
| 321 |
+
at, params = ActionType.LIST_FUNCTIONS, {}
|
| 322 |
+
|
| 323 |
+
messages.append({"role": "assistant", "content": raw})
|
| 324 |
+
result = env.step(Action(action_type=at, params=params))
|
| 325 |
+
obs = result.observation.model_dump()
|
| 326 |
+
r_val = result.reward.value
|
| 327 |
+
done = result.done
|
| 328 |
+
|
| 329 |
+
step_rewards.append(r_val)
|
| 330 |
+
steps_taken = step
|
| 331 |
+
log_step(step=step, action=at.value, reward=r_val, done=done, error=error_msg)
|
| 332 |
+
|
| 333 |
+
if done:
|
| 334 |
+
v = r_val
|
| 335 |
+
grader_score = 1.0 if v >= 4.9 else (0.3 if v >= 1.0 else 0.0)
|
| 336 |
+
break
|
| 337 |
+
|
| 338 |
+
time.sleep(0.3)
|
| 339 |
+
|
| 340 |
+
finally:
|
| 341 |
+
success = grader_score >= SUCCESS_SCORE_THRESHOLD
|
| 342 |
+
log_end(success=success, steps=steps_taken, score=grader_score, rewards=step_rewards)
|
| 343 |
+
|
| 344 |
+
return {
|
| 345 |
+
"episode": ep_num,
|
| 346 |
+
"seed": seed,
|
| 347 |
+
"contract": obs["contract_name"],
|
| 348 |
+
"grader_score": grader_score,
|
| 349 |
+
"cumulative_reward": obs["cumulative_reward"],
|
| 350 |
+
}
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 354 |
+
# Task runners
|
| 355 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 356 |
+
|
| 357 |
+
def run_task1(n: int = NUM_EPISODES) -> Dict[str, Any]:
|
| 358 |
+
print("\n" + "="*60, flush=True)
|
| 359 |
+
print("TASK 1: Targeted Vulnerability Detection", flush=True)
|
| 360 |
+
print("="*60, flush=True)
|
| 361 |
+
env = Task1Environment()
|
| 362 |
+
episodes = [_run_t1_episode(env, SEED_BASE + i, i + 1) for i in range(n)]
|
| 363 |
+
avg_s = sum(e["grader_score"] for e in episodes) / n
|
| 364 |
+
avg_r = sum(e["cumulative_reward"] for e in episodes) / n
|
| 365 |
+
print(f"\n Avg grader score : {avg_s:.3f}", flush=True)
|
| 366 |
+
print(f" Avg cum reward : {avg_r:.2f}", flush=True)
|
| 367 |
+
return {
|
| 368 |
+
"task_id": "task1_vuln_detection", "name": "Targeted Vulnerability Detection",
|
| 369 |
+
"status": "active", "num_episodes": n, "episodes": episodes,
|
| 370 |
+
"avg_grader_score": avg_s, "avg_cumulative_reward": avg_r,
|
| 371 |
+
}
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
def run_task2(n: int = NUM_EPISODES) -> Dict[str, Any]:
|
| 375 |
+
print("\n" + "="*60, flush=True)
|
| 376 |
+
print("TASK 2: Property Discovery", flush=True)
|
| 377 |
+
print("="*60, flush=True)
|
| 378 |
+
env = Task2Environment()
|
| 379 |
+
episodes = [_run_t2_episode(env, SEED_BASE + i, i + 1) for i in range(n)]
|
| 380 |
+
avg_s = sum(e["grader_score"] for e in episodes) / n
|
| 381 |
+
avg_r = sum(e["cumulative_reward"] for e in episodes) / n
|
| 382 |
+
print(f"\n Avg grader score : {avg_s:.3f}", flush=True)
|
| 383 |
+
print(f" Avg cum reward : {avg_r:.2f}", flush=True)
|
| 384 |
+
return {
|
| 385 |
+
"task_id": "task2_property_discovery", "name": "Property Discovery",
|
| 386 |
+
"status": "active", "num_episodes": n, "episodes": episodes,
|
| 387 |
+
"avg_grader_score": avg_s, "avg_cumulative_reward": avg_r,
|
| 388 |
+
}
|
| 389 |
|
| 390 |
|
| 391 |
def run_task3(n: int = NUM_EPISODES) -> Dict[str, Any]:
|
| 392 |
+
print("\n" + "="*60, flush=True)
|
| 393 |
+
print("TASK 3: Rule Checker", flush=True)
|
| 394 |
+
print("="*60, flush=True)
|
| 395 |
+
env = Task3Environment()
|
| 396 |
+
episodes = [_run_t3_episode(env, SEED_BASE + i, i + 1) for i in range(n)]
|
| 397 |
+
avg_s = sum(e["grader_score"] for e in episodes) / n
|
| 398 |
+
avg_r = sum(e["cumulative_reward"] for e in episodes) / n
|
| 399 |
+
print(f"\n Avg grader score : {avg_s:.3f}", flush=True)
|
| 400 |
+
print(f" Avg cum reward : {avg_r:.2f}", flush=True)
|
| 401 |
+
return {
|
| 402 |
+
"task_id": "task3_rule_checker", "name": "Rule Checker",
|
| 403 |
+
"status": "active", "num_episodes": n, "episodes": episodes,
|
| 404 |
+
"avg_grader_score": avg_s, "avg_cumulative_reward": avg_r,
|
| 405 |
+
}
|
| 406 |
|
| 407 |
|
| 408 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 409 |
# Main
|
| 410 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 411 |
|
| 412 |
+
async def main() -> None:
|
| 413 |
+
"""Async entry point (wraps sync env calls; asyncio.run() expected by caller)."""
|
| 414 |
+
print("Smart Contract Audit RL Environment — Baseline Inference", flush=True)
|
| 415 |
+
print(f"Model: {MODEL_NAME} | Base URL: {API_BASE_URL}", flush=True)
|
| 416 |
|
| 417 |
t1 = run_task1(NUM_EPISODES)
|
| 418 |
t2 = run_task2(NUM_EPISODES)
|
| 419 |
t3 = run_task3(NUM_EPISODES)
|
| 420 |
|
| 421 |
results = {
|
| 422 |
+
"model": MODEL_NAME,
|
| 423 |
+
"base_url": API_BASE_URL,
|
| 424 |
+
"tasks": [t1, t2, t3],
|
| 425 |
}
|
| 426 |
+
overall = sum(t["avg_grader_score"] for t in results["tasks"]) / 3
|
|
|
|
|
|
|
| 427 |
results["overall_avg_score"] = overall
|
| 428 |
|
| 429 |
+
print("\n" + "="*60, flush=True)
|
| 430 |
+
print("BASELINE SUMMARY", flush=True)
|
| 431 |
+
print("="*60, flush=True)
|
| 432 |
for t in results["tasks"]:
|
| 433 |
+
print(f" ✅ {t['name']:40s}: {t['avg_grader_score']:.3f}", flush=True)
|
| 434 |
+
print(f"\n Overall avg grader score: {overall:.3f}", flush=True)
|
|
|
|
| 435 |
|
| 436 |
with open("baseline_scores.json", "w") as f:
|
| 437 |
json.dump(results, f, indent=2)
|
| 438 |
+
print("\n Scores written to baseline_scores.json", flush=True)
|
| 439 |
|
| 440 |
|
| 441 |
if __name__ == "__main__":
|
| 442 |
+
asyncio.run(main())
|
openenv.yaml
CHANGED
|
@@ -5,7 +5,7 @@ description: >
|
|
| 5 |
Agents interact with real-world Solidity contract data from Certora-audited
|
| 6 |
projects, practising three real audit tasks: vulnerability detection,
|
| 7 |
property discovery, and rule checking.
|
| 8 |
-
author: "
|
| 9 |
license: MIT
|
| 10 |
|
| 11 |
tasks:
|
|
@@ -73,7 +73,7 @@ action_space:
|
|
| 73 |
get_function_natspec: {params: {}, reward: -0.08}
|
| 74 |
get_file_natspec: {params: {}, reward: -0.03}
|
| 75 |
get_related_functions: {params: {}, reward: -0.06}
|
| 76 |
-
|
| 77 |
get_similar_rule: {params: {}, reward: -0.20}
|
| 78 |
submit_property: {params: {property: string}, reward: "0.0-5.0 keyword-weighted, one attempt"}
|
| 79 |
task3:
|
|
@@ -82,7 +82,7 @@ action_space:
|
|
| 82 |
get_function_code: {params: {function_name: string}, reward: -0.10}
|
| 83 |
get_state_variable: {params: {variable_name: "string opt"}, reward: -0.05}
|
| 84 |
get_call_graph: {params: {}, reward: -0.08}
|
| 85 |
-
|
| 86 |
submit_function: {params: {function_name: string}, reward: "+5.0 / +1.5 / -1.5, one attempt"}
|
| 87 |
|
| 88 |
reward:
|
|
@@ -135,4 +135,4 @@ interface:
|
|
| 135 |
python:
|
| 136 |
reset: "env.reset(seed=None) -> ResetResult"
|
| 137 |
step: "env.step(action) -> StepResult"
|
| 138 |
-
state: "env.state() -> StateResult"
|
|
|
|
| 5 |
Agents interact with real-world Solidity contract data from Certora-audited
|
| 6 |
projects, practising three real audit tasks: vulnerability detection,
|
| 7 |
property discovery, and rule checking.
|
| 8 |
+
author: "Codex47"
|
| 9 |
license: MIT
|
| 10 |
|
| 11 |
tasks:
|
|
|
|
| 73 |
get_function_natspec: {params: {}, reward: -0.08}
|
| 74 |
get_file_natspec: {params: {}, reward: -0.03}
|
| 75 |
get_related_functions: {params: {}, reward: -0.06}
|
| 76 |
+
get_io: {params: {}, reward: -0.04}
|
| 77 |
get_similar_rule: {params: {}, reward: -0.20}
|
| 78 |
submit_property: {params: {property: string}, reward: "0.0-5.0 keyword-weighted, one attempt"}
|
| 79 |
task3:
|
|
|
|
| 82 |
get_function_code: {params: {function_name: string}, reward: -0.10}
|
| 83 |
get_state_variable: {params: {variable_name: "string opt"}, reward: -0.05}
|
| 84 |
get_call_graph: {params: {}, reward: -0.08}
|
| 85 |
+
get_formalized_property: {params: {}, reward: -0.03}
|
| 86 |
submit_function: {params: {function_name: string}, reward: "+5.0 / +1.5 / -1.5, one attempt"}
|
| 87 |
|
| 88 |
reward:
|
|
|
|
| 135 |
python:
|
| 136 |
reset: "env.reset(seed=None) -> ResetResult"
|
| 137 |
step: "env.step(action) -> StepResult"
|
| 138 |
+
state: "env.state() -> StateResult"
|
tasks/task1/environment.py
CHANGED
|
@@ -27,14 +27,7 @@ from __future__ import annotations
|
|
| 27 |
import random
|
| 28 |
from typing import Any, Dict, List, Optional, Set
|
| 29 |
|
| 30 |
-
from data.data_loader import
|
| 31 |
-
load_contracts,
|
| 32 |
-
sample_episode,
|
| 33 |
-
get_function_by_name,
|
| 34 |
-
get_state_variable_by_name,
|
| 35 |
-
list_function_names,
|
| 36 |
-
list_state_variable_names,
|
| 37 |
-
)
|
| 38 |
from env.base_env import BaseEnv
|
| 39 |
from env.schemas import (
|
| 40 |
Action,
|
|
@@ -138,7 +131,7 @@ class Task1Environment(BaseEnv):
|
|
| 138 |
return StateResult(
|
| 139 |
task_id=TASK_ID,
|
| 140 |
contract_name=self._contract.get("contract_name", ""),
|
| 141 |
-
target_function=self._target_fn.get("name"),
|
| 142 |
step_count=self._step_count,
|
| 143 |
cumulative_reward=self._cumulative_reward,
|
| 144 |
done=self._done,
|
|
|
|
| 27 |
import random
|
| 28 |
from typing import Any, Dict, List, Optional, Set
|
| 29 |
|
| 30 |
+
from data.data_loader import load_contracts, sample_episode
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
from env.base_env import BaseEnv
|
| 32 |
from env.schemas import (
|
| 33 |
Action,
|
|
|
|
| 131 |
return StateResult(
|
| 132 |
task_id=TASK_ID,
|
| 133 |
contract_name=self._contract.get("contract_name", ""),
|
| 134 |
+
target_function=self._target_fn.get("name", ""),
|
| 135 |
step_count=self._step_count,
|
| 136 |
cumulative_reward=self._cumulative_reward,
|
| 137 |
done=self._done,
|
tasks/task1/grader.py
CHANGED
|
@@ -10,14 +10,6 @@ Deterministic grader. Score range: 0.0 – 1.0
|
|
| 10 |
from __future__ import annotations
|
| 11 |
from typing import Dict
|
| 12 |
from utils import SemanticMatcher
|
| 13 |
-
from data.data_loader import load_vulnerabilities
|
| 14 |
-
|
| 15 |
-
def match_vuln_keywords(submitted: str, expected: str) -> bool:
|
| 16 |
-
"""Checks if the submitted vulnerability type matches the expected one using keyword matching."""
|
| 17 |
-
for types in load_vulnerabilities():
|
| 18 |
-
if types["vulnerability"] == expected:
|
| 19 |
-
return SemanticMatcher().match(types["terms"], submitted)
|
| 20 |
-
return False
|
| 21 |
|
| 22 |
class Task1Grader:
|
| 23 |
def __init__(self, target_function: str, vulnerability_issue: str) -> None:
|
|
@@ -27,7 +19,7 @@ class Task1Grader:
|
|
| 27 |
def grade_submission(self, submitted_function: str, submitted_vuln_type: str) -> float:
|
| 28 |
if submitted_function.strip().lower() != self.target_function:
|
| 29 |
return 0.0
|
| 30 |
-
return 1.0 if
|
| 31 |
|
| 32 |
def reward_for_score(self, score: float) -> float:
|
| 33 |
if score == 1.0: return 5.0
|
|
|
|
| 10 |
from __future__ import annotations
|
| 11 |
from typing import Dict
|
| 12 |
from utils import SemanticMatcher
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
class Task1Grader:
|
| 15 |
def __init__(self, target_function: str, vulnerability_issue: str) -> None:
|
|
|
|
| 19 |
def grade_submission(self, submitted_function: str, submitted_vuln_type: str) -> float:
|
| 20 |
if submitted_function.strip().lower() != self.target_function:
|
| 21 |
return 0.0
|
| 22 |
+
return 1.0 if SemanticMatcher().match(self.vulnerability_issue, submitted_vuln_type) else 0.5
|
| 23 |
|
| 24 |
def reward_for_score(self, score: float) -> float:
|
| 25 |
if score == 1.0: return 5.0
|
tasks/task2/actions.py
CHANGED
|
@@ -124,7 +124,7 @@ def submit_property(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
|
| 124 |
score, confidence = ctx._grader.grade(submitted_text)
|
| 125 |
reward = round(score * 5.0, 4)
|
| 126 |
|
| 127 |
-
msg = f'Score: {score:.2f}/1.00 → Confidence: {confidence
|
| 128 |
return msg, Reward(
|
| 129 |
value=reward,
|
| 130 |
reason=f"Property submission score={score:.3f}",
|
|
|
|
| 124 |
score, confidence = ctx._grader.grade(submitted_text)
|
| 125 |
reward = round(score * 5.0, 4)
|
| 126 |
|
| 127 |
+
msg = f'Score: {score:.2f}/1.00 → Confidence: {confidence}\n'
|
| 128 |
return msg, Reward(
|
| 129 |
value=reward,
|
| 130 |
reason=f"Property submission score={score:.3f}",
|
tasks/task2/environment.py
CHANGED
|
@@ -134,7 +134,7 @@ class Task2Environment(BaseEnv):
|
|
| 134 |
return StateResult(
|
| 135 |
task_id=TASK_ID,
|
| 136 |
contract_name=self._contract.get("contract_name", ""),
|
| 137 |
-
target_function=self._target_fn.get("name"),
|
| 138 |
step_count=self._step_count,
|
| 139 |
cumulative_reward=self._cum_reward,
|
| 140 |
done=self._done,
|
|
|
|
| 134 |
return StateResult(
|
| 135 |
task_id=TASK_ID,
|
| 136 |
contract_name=self._contract.get("contract_name", ""),
|
| 137 |
+
target_function=self._target_fn.get("name", ""),
|
| 138 |
step_count=self._step_count,
|
| 139 |
cumulative_reward=self._cum_reward,
|
| 140 |
done=self._done,
|
tasks/task3/environment.py
CHANGED
|
@@ -35,7 +35,7 @@ from __future__ import annotations
|
|
| 35 |
|
| 36 |
import random
|
| 37 |
from typing import Any, Dict, List, Optional, Set
|
| 38 |
-
import actions
|
| 39 |
|
| 40 |
from data.data_loader import load_contracts, sample_task3_episode
|
| 41 |
from env.base_env import BaseEnv
|
|
@@ -83,6 +83,8 @@ class Task3Environment(BaseEnv):
|
|
| 83 |
self._seen: Set[str] = set()
|
| 84 |
|
| 85 |
# ── OpenEnv interface ─────────────────────────────────────────────────────
|
|
|
|
|
|
|
| 86 |
|
| 87 |
def reset(self, seed: Optional[int] = None) -> ResetResult:
|
| 88 |
if seed is not None:
|
|
@@ -94,8 +96,8 @@ class Task3Environment(BaseEnv):
|
|
| 94 |
t3 = self._target_fn["task3"]
|
| 95 |
self._grader = Task3Grader(
|
| 96 |
target_function=self._target_fn["name"],
|
| 97 |
-
partial_credit_functions=t3.get("partial_credit_functions", []),
|
| 98 |
-
property_english=t3.get("property_english", ""),
|
| 99 |
)
|
| 100 |
self._step_count = 0
|
| 101 |
self._cum_reward = 0.0
|
|
@@ -142,7 +144,7 @@ class Task3Environment(BaseEnv):
|
|
| 142 |
return StateResult(
|
| 143 |
task_id=TASK_ID,
|
| 144 |
contract_name=self._contract.get("contract_name", ""),
|
| 145 |
-
target_function=self._target_fn.get("name"),
|
| 146 |
step_count=self._step_count,
|
| 147 |
cumulative_reward=self._cum_reward,
|
| 148 |
done=self._done,
|
|
|
|
| 35 |
|
| 36 |
import random
|
| 37 |
from typing import Any, Dict, List, Optional, Set
|
| 38 |
+
from tasks.task3 import actions
|
| 39 |
|
| 40 |
from data.data_loader import load_contracts, sample_task3_episode
|
| 41 |
from env.base_env import BaseEnv
|
|
|
|
| 83 |
self._seen: Set[str] = set()
|
| 84 |
|
| 85 |
# ── OpenEnv interface ─────────────────────────────────────────────────────
|
| 86 |
+
|
| 87 |
+
# ! Need to change alot here
|
| 88 |
|
| 89 |
def reset(self, seed: Optional[int] = None) -> ResetResult:
|
| 90 |
if seed is not None:
|
|
|
|
| 96 |
t3 = self._target_fn["task3"]
|
| 97 |
self._grader = Task3Grader(
|
| 98 |
target_function=self._target_fn["name"],
|
| 99 |
+
partial_credit_functions=t3.get("partial_credit_functions", []), # ! doesn't exists
|
| 100 |
+
property_english=t3.get("property_english", ""), # ! doesn't exist
|
| 101 |
)
|
| 102 |
self._step_count = 0
|
| 103 |
self._cum_reward = 0.0
|
|
|
|
| 144 |
return StateResult(
|
| 145 |
task_id=TASK_ID,
|
| 146 |
contract_name=self._contract.get("contract_name", ""),
|
| 147 |
+
target_function=self._target_fn.get("name", ""),
|
| 148 |
step_count=self._step_count,
|
| 149 |
cumulative_reward=self._cum_reward,
|
| 150 |
done=self._done,
|
utils/prompts.py
CHANGED
|
@@ -18,7 +18,16 @@ Given a contract, identify the ONE vulnerable function and its vulnerability typ
|
|
| 18 |
timestamp dependence, denial of service, unchecked return value
|
| 19 |
4. Submit when confident
|
| 20 |
|
| 21 |
-
Respond ONLY with valid JSON. No explanation, no markdown.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
T2_SYSTEM = """You are a formal methods engineer specialising in Solidity smart contracts.
|
| 24 |
|
|
@@ -36,31 +45,6 @@ A good property covers:
|
|
| 36 |
{"action": "get_function_code", "params": {}}
|
| 37 |
{"action": "get_function_natspec", "params": {}}
|
| 38 |
{"action": "get_file_natspec", "params": {}}
|
| 39 |
-
|
| 40 |
-
def _t3_user_msg(obs: Dict[str, Any]) -> str:
|
| 41 |
-
extra = obs.get("extra", {})
|
| 42 |
-
return (
|
| 43 |
-
f"Contract : {obs['contract_name']}\n"
|
| 44 |
-
f"Property : {extra.get('property_english', '(no property)')}\n"
|
| 45 |
-
f"Step: {obs['step_count']} | Reward: {obs['cumulative_reward']:.2f}\n\n"
|
| 46 |
-
f"Last action: {obs['last_action'] or 'None'}\n"
|
| 47 |
-
f"Result:\n{obs['last_action_result'] or 'Episode started.'}"
|
| 48 |
-
)
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
def run_t3_episode(env: Task3Environment, seed: int, ep: int) -> Dict[str, Any]:
|
| 52 |
-
r = env.reset(seed=seed)
|
| 53 |
-
obs = r.observation.model_dump()
|
| 54 |
-
prop_preview = obs['extra'].get('property_english', '')[:55]
|
| 55 |
-
print(f" ep={ep} seed={seed} {obs['contract_name']} \"{prop_preview}...\"")
|
| 56 |
-
|
| 57 |
-
messages = [{"role": "system", "content": T3_SYSTEM}]
|
| 58 |
-
grader_score = 0.0
|
| 59 |
-
cum_reward = 0.0
|
| 60 |
-
|
| 61 |
-
for step in range(15):
|
| 62 |
-
messages.append({"role": "user", "content": _t3_user_msg(obs)})
|
| 63 |
-
|
| 64 |
{"action": "get_related_functions", "params": {}}
|
| 65 |
{"action": "get_io", "params": {}}
|
| 66 |
{"action": "get_similar_rule", "params": {}}
|
|
@@ -73,7 +57,14 @@ def run_t3_episode(env: Task3Environment, seed: int, ep: int) -> Dict[str, Any]:
|
|
| 73 |
- Write 2–4 sentences. Be specific about variable names and amounts.
|
| 74 |
- Do NOT guess — read the code first.
|
| 75 |
|
| 76 |
-
Respond ONLY with valid JSON. No markdown, no explanation.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
|
| 79 |
T3_SYSTEM = """You are a smart contract security auditor checking rule compliance.
|
|
|
|
| 18 |
timestamp dependence, denial of service, unchecked return value
|
| 19 |
4. Submit when confident
|
| 20 |
|
| 21 |
+
Respond ONLY with valid JSON. No explanation, no markdown.
|
| 22 |
+
|
| 23 |
+
## Evaluation Strategy:
|
| 24 |
+
Your output vulnerability_type will be compared to ground truth using a deterministic semantic matcher with
|
| 25 |
+
three weighted components:
|
| 26 |
+
- Lexical Jaccard (20%) - overlap of lemmatized, stopword‑removed tokens.
|
| 27 |
+
- Synonym Jaccard (25%) - overlap after expanding each word with WordNet synonyms.
|
| 28 |
+
- Semantic cosine (55%) - sentence‑embedding similarity (all‑MiniLM‑L6‑v2).
|
| 29 |
+
Match Threshold: score ≥ 0.72 → "match" (partial credit); score ≥ 0.88 → "strong match" (full credit).
|
| 30 |
+
"""
|
| 31 |
|
| 32 |
T2_SYSTEM = """You are a formal methods engineer specialising in Solidity smart contracts.
|
| 33 |
|
|
|
|
| 45 |
{"action": "get_function_code", "params": {}}
|
| 46 |
{"action": "get_function_natspec", "params": {}}
|
| 47 |
{"action": "get_file_natspec", "params": {}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
{"action": "get_related_functions", "params": {}}
|
| 49 |
{"action": "get_io", "params": {}}
|
| 50 |
{"action": "get_similar_rule", "params": {}}
|
|
|
|
| 57 |
- Write 2–4 sentences. Be specific about variable names and amounts.
|
| 58 |
- Do NOT guess — read the code first.
|
| 59 |
|
| 60 |
+
Respond ONLY with valid JSON. No markdown, no explanation.
|
| 61 |
+
|
| 62 |
+
## Evaluation Strategy:
|
| 63 |
+
Your output vulnerability_type will be compared to ground truth using a deterministic semantic matcher with three weighted components:
|
| 64 |
+
- Lexical Jaccard (20%) - overlap of lemmatized, stopword‑removed tokens.
|
| 65 |
+
- Synonym Jaccard (25%) - overlap after expanding each word with WordNet synonyms.
|
| 66 |
+
- Semantic cosine (55%) - sentence‑embedding similarity (all‑MiniLM‑L6‑v2).
|
| 67 |
+
Match Threshold: score ≥ 0.72 → "match" (partial credit); score ≥ 0.88 → "strong match" (full credit)."""
|
| 68 |
|
| 69 |
|
| 70 |
T3_SYSTEM = """You are a smart contract security auditor checking rule compliance.
|
utils/propertyretriever.py
CHANGED
|
@@ -10,10 +10,8 @@ import pandas as pd
|
|
| 10 |
import numpy as np
|
| 11 |
from sentence_transformers import SentenceTransformer
|
| 12 |
from sklearn.preprocessing import normalize
|
| 13 |
-
import
|
| 14 |
|
| 15 |
-
DATA_DIR = os.path.join(os.path.dirname(__file__))
|
| 16 |
-
DEFAULT_CSV_PATH = os.path.join(DATA_DIR, "properties.csv")
|
| 17 |
SIMILARITY_THRESHOLD = 0.8 # Adjust as needed based on validation
|
| 18 |
|
| 19 |
# -------------------------------------------------------------------
|
|
|
|
| 10 |
import numpy as np
|
| 11 |
from sentence_transformers import SentenceTransformer
|
| 12 |
from sklearn.preprocessing import normalize
|
| 13 |
+
from data.data_loader import DEFAULT_CSV_PATH
|
| 14 |
|
|
|
|
|
|
|
| 15 |
SIMILARITY_THRESHOLD = 0.8 # Adjust as needed based on validation
|
| 16 |
|
| 17 |
# -------------------------------------------------------------------
|
utils/semanticmatcher.py
CHANGED
|
@@ -200,6 +200,7 @@ class SemanticMatcher:
|
|
| 200 |
"""
|
| 201 |
# Fast-path: normalized exact match
|
| 202 |
if normalize(text_a) == normalize(text_b):
|
|
|
|
| 203 |
return True
|
| 204 |
|
| 205 |
tokens_a = tokenize_and_lemmatize(text_a)
|
|
|
|
| 200 |
"""
|
| 201 |
# Fast-path: normalized exact match
|
| 202 |
if normalize(text_a) == normalize(text_b):
|
| 203 |
+
self.confidence_level = "strong"
|
| 204 |
return True
|
| 205 |
|
| 206 |
tokens_a = tokenize_and_lemmatize(text_a)
|
validate.py
CHANGED
|
@@ -77,7 +77,7 @@ def check_t2_env():
|
|
| 77 |
assert r.observation.task_id == "task2_property_discovery"
|
| 78 |
assert "target_function" in r.observation.extra
|
| 79 |
for at in [ActionType.GET_FUNCTION_CODE, ActionType.GET_FUNCTION_NATSPEC,
|
| 80 |
-
ActionType.GET_FILE_NATSPEC, ActionType.
|
| 81 |
ActionType.GET_RELATED_FUNCTIONS, ActionType.GET_SIMILAR_RULE]:
|
| 82 |
env.step(Action(action_type=at))
|
| 83 |
|
|
@@ -180,7 +180,7 @@ def check_t2_grader():
|
|
| 180 |
from data.data_loader import load_contracts, get_all_property_entries
|
| 181 |
for c, fn in get_all_property_entries(load_contracts()):
|
| 182 |
g = Task2Grader(fn["name"], fn["property"])
|
| 183 |
-
assert g.grade(fn["property"][
|
| 184 |
assert g.grade("") == 0.0
|
| 185 |
s = g.grade("test"); assert s == g.grade("test") # deterministic
|
| 186 |
|
|
@@ -299,4 +299,4 @@ def main():
|
|
| 299 |
sys.exit(0)
|
| 300 |
|
| 301 |
if __name__ == "__main__":
|
| 302 |
-
main()
|
|
|
|
| 77 |
assert r.observation.task_id == "task2_property_discovery"
|
| 78 |
assert "target_function" in r.observation.extra
|
| 79 |
for at in [ActionType.GET_FUNCTION_CODE, ActionType.GET_FUNCTION_NATSPEC,
|
| 80 |
+
ActionType.GET_FILE_NATSPEC, ActionType.GET_SIGNATURE,
|
| 81 |
ActionType.GET_RELATED_FUNCTIONS, ActionType.GET_SIMILAR_RULE]:
|
| 82 |
env.step(Action(action_type=at))
|
| 83 |
|
|
|
|
| 180 |
from data.data_loader import load_contracts, get_all_property_entries
|
| 181 |
for c, fn in get_all_property_entries(load_contracts()):
|
| 182 |
g = Task2Grader(fn["name"], fn["property"])
|
| 183 |
+
assert g.grade(fn["property"])[0] >= 0.65
|
| 184 |
assert g.grade("") == 0.0
|
| 185 |
s = g.grade("test"); assert s == g.grade("test") # deterministic
|
| 186 |
|
|
|
|
| 299 |
sys.exit(0)
|
| 300 |
|
| 301 |
if __name__ == "__main__":
|
| 302 |
+
main()
|