ajaxwin committed on
Commit 671787b · 1 Parent(s): 056cf7b

task1, task2 evaluated
agents/task1.py ADDED
@@ -0,0 +1,113 @@
+"""Agents for Task 1: Function + Vulnerability Identification."""
+
+import random as _random
+from typing import Any, Dict, List
+
+from tasks.task1 import Task1Environment
+from env.schemas import Action, ActionType
+from data.data_loader import load_contracts, get_function_by_name
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Helpers
+# ─────────────────────────────────────────────────────────────────────────────
+
+def _parse_fn_list(result_text: str) -> List[str]:
+    """Parse 'Functions in X: f1, f2, f3' into [f1, f2, f3]."""
+    if ": " in result_text:
+        return [f.strip() for f in result_text.split(": ", 1)[-1].split(", ") if f.strip()]
+    return []
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Task 1 agents
+# ─────────────────────────────────────────────────────────────────────────────
+
+def oracle_t1(env: Task1Environment, seed: int, verbose: bool = False) -> Dict[str, Any]:
+    """Submits exact ground-truth function + vulnerability type → score = 1.0."""
+    r = env.reset(seed=seed)
+    obs = r.observation
+    fn_name = env.state().target_function
+    contracts = load_contracts()
+    vuln_issue = ""
+    for c in contracts:
+        fn = get_function_by_name(c, fn_name)
+        if fn and fn.get("vulnerable"):
+            vuln_issue = fn["vulnerability_details"]["issue"]
+            break
+    if verbose:
+        print(f"    {obs.contract_name}.{fn_name}()  [{vuln_issue}]")
+    env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
+    env.step(Action(action_type=ActionType.GET_FUNCTION_CODE,
+                    params={"function_name": fn_name}))
+    result = env.step(Action(action_type=ActionType.SUBMIT,
+                             params={"function_name": fn_name,
+                                     "vulnerability_type": vuln_issue}))
+    v = result.reward.value
+    score = 1.0 if v >= 4.9 else (0.5 if v >= 0.9 else 0.0)
+    return {"seed": seed, "contract": obs.contract_name, "target_function": fn_name,
+            "vulnerability": vuln_issue, "grader_score": score,
+            "cumulative_reward": result.observation.cumulative_reward}
+
+
+def partial_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
+    """Correct function, 'unknown' vuln type → score = 0.5."""
+    env.reset(seed=seed)
+    fn_name = env.state().target_function
+    result = env.step(Action(action_type=ActionType.SUBMIT,
+                             params={"function_name": fn_name, "vulnerability_type": "unknown"}))
+    v = result.reward.value
+    return {"seed": seed, "grader_score": 0.5 if v >= 0.9 else 0.0,
+            "cumulative_reward": result.observation.cumulative_reward}
+
+
+def random_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
+    """Genuine random agent: random browse, then submits a random function + random vuln type.
+
+    Uses a seeded RNG (offset from the episode seed) so results are reproducible.
+    Expected score: low (~0–5%), since it must randomly hit both the right function
+    and the right keyword, and it can submit only once per episode.
+    """
+    rng = _random.Random(seed ^ 0x5A1AD)  # different RNG stream from episode seed
+    env.reset(seed=seed)
+
+    # Step 1: list functions to get real candidates
+    s = env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
+    fns = _parse_fn_list(s.observation.last_action_result or "")
+    if not fns:
+        fns = ["deposit", "withdraw", "constructor"]  # fallback
+
+    # Step 2: do 1–2 random browse actions (not repeated)
+    browse_pool = [
+        (ActionType.GET_FILE_METADATA, {}),
+        (ActionType.GET_CALL_GRAPH, {}),
+        (ActionType.GET_STATE_VARIABLE, {}),
+    ]
+    _random.Random(seed).shuffle(browse_pool)  # deterministic order
+    for at, params in browse_pool[:rng.randint(1, 2)]:
+        env.step(Action(action_type=at, params=params))
+
+    # Step 3: submit a random function from the real list, random vuln type
+    random_fn = rng.choice(fns)
+    vuln_pool = [
+        "bad logic", "incorrect check", "overflow", "no guard", "wrong order",
+        "missing event", "unprotected", "stale data", "unsafe cast",
+    ]
+    random_vuln = rng.choice(vuln_pool)
+    result = env.step(Action(action_type=ActionType.SUBMIT,
+                             params={"function_name": random_fn,
+                                     "vulnerability_type": random_vuln}))
+    v = result.reward.value
+    score = 1.0 if v >= 4.9 else (0.5 if v >= 0.9 else 0.0)
+    return {"seed": seed, "grader_score": score, "submitted_fn": random_fn,
+            "submitted_vuln": random_vuln,
+            "cumulative_reward": result.observation.cumulative_reward}
+
+
+def floor_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
+    """Always submits 'constructor' → guaranteed score = 0.0."""
+    env.reset(seed=seed)
+    result = env.step(Action(action_type=ActionType.SUBMIT,
+                             params={"function_name": "constructor",
+                                     "vulnerability_type": "reentrancy"}))
+    return {"seed": seed, "grader_score": 0.0,
+            "cumulative_reward": result.observation.cumulative_reward}
agents/task2.py ADDED
@@ -0,0 +1,105 @@
+"""Agents for Task 2: Property Discovery."""
+
+import random as _random
+from typing import Any, Dict, List
+
+from tasks.task2 import Task2Environment
+from env.schemas import Action, ActionType
+from data.data_loader import load_contracts, get_function_by_name
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Task 2 agents
+# ─────────────────────────────────────────────────────────────────────────────
+
+def oracle_t2(env: Task2Environment, seed: int, verbose: bool = False) -> Dict[str, Any]:
+    """Submits the ground truth in natural language (English) → score ≥ 0.70."""
+    r = env.reset(seed=seed)
+    obs = r.observation
+    fn_name = obs.extra["target_function"]
+    contract = obs.contract_name
+    contracts = load_contracts()
+    gt_text = ""
+    for c in contracts:
+        if c["contract_name"] == contract:
+            fn = get_function_by_name(c, fn_name)
+            if fn and fn.get("property"):
+                gt_text = fn["property"]
+            break
+    if verbose:
+        print(f"    {contract}.{fn_name}()")
+    env.step(Action(action_type=ActionType.GET_FUNCTION_CODE))
+    result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
+                             params={"property": gt_text}))
+    r_val = result.reward.value
+    score = round(r_val / 5.0, 4) if r_val > 0 else 0.0
+    return {"seed": seed, "contract": contract, "function": fn_name,
+            "grader_score": score,
+            "cumulative_reward": result.observation.cumulative_reward}
+
+
+def partial_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
+    """Submits only the function's short NatSpec comment — partial credit."""
+    r = env.reset(seed=seed)
+    obs = r.observation
+    contracts = load_contracts()
+    comment = ""
+    for c in contracts:
+        if c["contract_name"] == obs.contract_name:
+            fn = get_function_by_name(c, obs.extra["target_function"])
+            if fn:
+                comment = fn.get("comment", "")
+            break
+    result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
+                             params={"property": comment}))
+    r_val = result.reward.value
+    return {"seed": seed, "grader_score": round(r_val / 5.0, 4) if r_val > 0 else 0.0,
+            "cumulative_reward": result.observation.cumulative_reward}
+
+
+def random_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
+    """Genuine random agent: random browse, then submits a generic property template.
+
+    The submitted text contains high-frequency words that are unlikely to match
+    task-specific key phrases. Expected score: near 0 (coincidental matches only).
+    Uses a seeded RNG for reproducibility.
+    """
+    rng = _random.Random(seed ^ 0xBEEF1)
+
+    r = env.reset(seed=seed)
+    obs = r.observation
+    fn_name = obs.extra.get("target_function", "this function")
+
+    # Random browse: pick 1–2 actions at random
+    browse_pool = [
+        ActionType.GET_FILE_NATSPEC,
+        ActionType.GET_RELATED_FUNCTIONS,
+        ActionType.GET_SIGNATURE,
+    ]
+    rng.shuffle(browse_pool)
+    for at in browse_pool[:rng.randint(1, 2)]:
+        env.step(Action(action_type=at))
+
+    # Submit a randomly assembled generic property (won't match specific key phrases)
+    templates = [
+        f"The {fn_name} operation completes the intended computation on the input data.",
+        f"When {fn_name} executes, it processes the provided arguments and updates the contract.",
+        f"The {fn_name} function validates inputs and performs the expected operation.",
+        f"Calling {fn_name} causes the contract to execute its designated logic.",
+        f"{fn_name} runs when invoked and modifies internal state as designed.",
+    ]
+    prop = rng.choice(templates)
+    result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
+                             params={"property": prop}))
+    r_val = result.reward.value
+    return {"seed": seed, "grader_score": round(r_val / 5.0, 4) if r_val > 0 else 0.0,
+            "submitted": prop[:60],
+            "cumulative_reward": result.observation.cumulative_reward}
+
+
+def floor_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
+    """Submits an empty string → guaranteed score = 0.0."""
+    env.reset(seed=seed)
+    result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
+                             params={"property": ""}))
+    return {"seed": seed, "grader_score": 0.0,
+            "cumulative_reward": result.observation.cumulative_reward}
agents/task3.py ADDED
@@ -0,0 +1,106 @@
+"""Agents for Task 3: Rule Checking for a Function."""
+
+import json
+import random as _random
+from typing import Any, Dict, List
+
+from tasks.task3 import Task3Environment
+from env.schemas import Action, ActionType
+from data.data_loader import load_contracts, get_function_by_name
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Helpers
+# ─────────────────────────────────────────────────────────────────────────────
+
+def _parse_fn_list(result_text: str) -> List[str]:
+    """Parse 'Functions in X: f1, f2, f3' into [f1, f2, f3]."""
+    if ": " in result_text:
+        return [f.strip() for f in result_text.split(": ", 1)[-1].split(", ") if f.strip()]
+    return []
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Task 3 agents
+# ─────────────────────────────────────────────────────────────────────────────
+
+def oracle_t3(env: Task3Environment, seed: int, verbose: bool = False) -> Dict[str, Any]:
+    """Submits the exact target function → score = 1.0."""
+    r = env.reset(seed=seed)
+    obs = r.observation
+    fn_name = env.state().target_function
+    contract = obs.contract_name
+    if verbose:
+        prop = obs.extra.get("property_english", "")[:60]
+        print(f"    {contract}.{fn_name}()  \"{prop}\"")
+    env.step(Action(action_type=ActionType.GET_PROPERTY_SPECIFICATION))
+    env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
+    result = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
+                             params={"function_name": fn_name}))
+    v = result.reward.value
+    score = 1.0 if v >= 4.9 else (0.3 if v >= 1.0 else 0.0)
+    return {"seed": seed, "contract": contract, "target_function": fn_name,
+            "grader_score": score,
+            "cumulative_reward": result.observation.cumulative_reward}
+
+
+def subfunction_t3(env: Task3Environment, seed: int) -> Dict[str, Any]:
+    """Submits the first partial-credit subfunction if one exists, else 'constructor'."""
+    r = env.reset(seed=seed)
+    obs = r.observation
+    contracts = load_contracts()
+    partial_fns = []
+    for c in contracts:
+        if c["contract_name"] == obs.contract_name:
+            fn = get_function_by_name(c, env.state().target_function)
+            if fn:
+                partial_fns = fn.get("task3", {}).get("partial_credit_functions", [])
+            break
+    submit_name = partial_fns[0] if partial_fns else "constructor"
+    result = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
+                             params={"function_name": submit_name}))
+    v = result.reward.value
+    score = 1.0 if v >= 4.9 else (0.3 if v >= 1.0 else 0.0)
+    return {"seed": seed, "grader_score": score, "submitted": submit_name,
+            "cumulative_reward": result.observation.cumulative_reward}
+
+
+def random_t3(env: Task3Environment, seed: int) -> Dict[str, Any]:
+    """Genuine random agent: lists functions, picks one at random, submits.
+
+    With N functions per contract and 1 target, expected score ≈ 1/N ≈ 0.20–0.25.
+    Uses a seeded RNG for reproducibility.
+    """
+    rng = _random.Random(seed ^ 0xCAFE1)
+    env.reset(seed=seed)
+
+    # Step 1: get the function list (necessary to pick a real candidate)
+    s = env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
+    fns = _parse_fn_list(s.observation.last_action_result or "")
+    if not fns:
+        fns = ["constructor"]
+
+    # Step 2: optionally do 1 cheap browse action (formalized spec or call graph)
+    browse_options = [
+        (ActionType.GET_PROPERTY_SPECIFICATION, {}),
+        (ActionType.GET_CALL_GRAPH, {}),
+    ]
+    at, params = rng.choice(browse_options)
+    env.step(Action(action_type=at, params=params))
+
+    # Step 3: submit a uniformly random function from the real list
+    chosen = rng.choice(fns)
+    result = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
+                             params={"function_name": chosen}))
+    v = result.reward.value
+    score = 1.0 if v >= 4.9 else (0.3 if v >= 1.0 else 0.0)
+    return {"seed": seed, "grader_score": score, "submitted": chosen,
+            "cumulative_reward": result.observation.cumulative_reward}
+
+
+def floor_t3(env: Task3Environment, seed: int) -> Dict[str, Any]:
+    """Always submits 'constructor' → guaranteed score = 0.0."""
+    env.reset(seed=seed)
+    result = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
+                             params={"function_name": "constructor"}))
+    return {"seed": seed, "grader_score": 0.0,
+            "cumulative_reward": result.observation.cumulative_reward}
app.py CHANGED
@@ -22,9 +22,9 @@ from fastapi import FastAPI, HTTPException, Query
 from pydantic import BaseModel
 
 from env.schemas import Action, ActionType, TaskInfo
-from tasks.task1.environment import Task1Environment
-from tasks.task2.environment import Task2Environment
-from tasks.task3.environment import Task3Environment
+from tasks.task1 import Task1Environment
+from tasks.task2 import Task2Environment
+from tasks.task3 import Task3Environment
 
 # ─────────────────────────────────────────────────────────────────────────────
 # App
@@ -191,7 +191,7 @@ def action_space(task_id: str = "task1_vuln_detection"):
         {"type": "get_function_natspec", "params": {}, "reward": -0.08, "description": "Read NatSpec + expected behaviour"},
         {"type": "get_file_natspec", "params": {}, "reward": -0.03, "description": "Read contract-level NatSpec"},
         {"type": "get_related_functions", "params": {}, "reward": -0.06, "description": "List caller/callee functions with summaries"},
-        {"type": "get_io", "params": {}, "reward": -0.04, "description": "Get structured I/O + expected behaviour"},
+        {"type": "get_signature", "params": {}, "reward": -0.04, "description": "Get structured I/O + expected behaviour"},
         {"type": "get_similar_rule", "params": {}, "reward": -0.20, "description": "Get a similar property from another contract"},
         {"type": "submit_property", "params": {"property": "string"}, "reward": "0.0–5.0 (scored)", "description": "Submit property. ONE attempt. Ends episode."},
     ],
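The only substantive change here renames the Task 2 browse action get_io to get_signature, keeping its -0.04 reward and description. A one-line sketch of the client-side effect (assuming ActionType gained a GET_SIGNATURE member, as agents/task2.py above uses):

    from env.schemas import Action, ActionType

    # Old clients sent the get_io action; after this commit they construct:
    action = Action(action_type=ActionType.GET_SIGNATURE)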
data/contracts.json CHANGED
@@ -4827,7 +4827,7 @@
     ]
   },
   {
-    "contract_name": "StableDebtToken",
+    "contract_name": "StableDebtToken_OLD",
     "file_name": "StableDebtToken.sol",
     "metadata": {
       "license": "agpl-3.0",
@@ -5380,7 +5380,7 @@
     ]
   },
   {
-    "contract_name": "ATokenVault",
+    "contract_name": "ATokenVault_OLD",
     "file_name": "ATokenVault_old.sol",
     "metadata": {
       "license": "MIT",
data/data_loader.py CHANGED
@@ -15,7 +15,7 @@ from typing import Any, Dict, List, Optional, Tuple
 
 DATA_DIR = os.path.join(os.path.dirname(__file__))
 DEFAULT_CONTRACTS_FILE = os.path.join(DATA_DIR, "contracts.json")
-DEFAULT_VUNERABILITIES_FILE = os.path.join(DATA_DIR, "vulnerabilities.json")
+DEFAULT_CSV_PATH = os.path.join(DATA_DIR, "properties.csv")
 
 
 # ────────────────────────────────────────────────────────────────
@@ -62,11 +62,6 @@ def list_state_variable_names(contract: Dict[str, Any]) -> List[str]:
 # Task 1 helpers
 # ────────────────────────────────────────────────────────────────
 
-def load_vulnerabilities(path: str = DEFAULT_VUNERABILITIES_FILE) -> List[Dict[str, Any]]:
-    """Load and return all vulnerability entries from the JSON dataset."""
-    with open(path, "r") as f:
-        return json.load(f)
-
 def get_all_vulnerable_entries(
     contracts: List[Dict[str, Any]],
 ) -> List[Tuple[Dict[str, Any], Dict[str, Any]]]:
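With load_vulnerabilities (and its misspelled DEFAULT_VUNERABILITIES_FILE constant) removed, vulnerability ground truth is now reached only through the contracts dataset. A sketch of the replacement call path, matching how eval.py counts entries:

    from data.data_loader import load_contracts, get_all_vulnerable_entries

    contracts = load_contracts()                     # reads data/contracts.json
    entries = get_all_vulnerable_entries(contracts)  # (contract, function) pairs
    print(f"{len(entries)} vulnerable functions")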
env/schemas.py CHANGED
@@ -130,7 +130,7 @@ class ResetResult(BaseModel):
 class StateResult(BaseModel):
     task_id: str
     contract_name: str
-    target_function: Optional[str] = None  # hidden in real eval, exposed here for debugging
+    target_function: str                   # hidden in real eval, exposed here for debugging
     step_count: int
     cumulative_reward: float
     done: bool
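Dropping Optional makes target_function a required field: with pydantic, omitting it now raises instead of silently defaulting to None. A minimal sketch of the new behaviour using a trimmed copy of the model (field set reduced to what the diff shows):

    from pydantic import BaseModel, ValidationError

    class StateResult(BaseModel):
        task_id: str
        contract_name: str
        target_function: str   # was Optional[str] = None before this commit
        step_count: int
        cumulative_reward: float
        done: bool

    try:
        StateResult(task_id="task1_vuln_detection", contract_name="Vault",
                    step_count=0, cumulative_reward=0.0, done=False)
    except ValidationError as exc:
        print(exc)  # reports target_function as a required field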
eval.py CHANGED
@@ -3,197 +3,44 @@ eval.py
 -------
 Evaluation harness for all three tasks.
 
-Runs oracle / partial / baseline agents, verifying score orderings and
-that reward shaping is meaningful across the trajectory.
+Runs four agent tiers per task:
+    oracle   – always submits the ground-truth answer (upper bound)
+    partial  – right category, wrong detail (partial credit)
+    random   – genuine random exploration + random submit (random baseline)
+    floor    – always submits a guaranteed-wrong answer (lower bound)
 
 Usage:
-    python eval.py                    # all tasks, 8 episodes each
-    python eval.py --task 1|2|3       # single task
+    python eval.py                    # all tasks, 8 episodes each
+    python eval.py --task 1|2|3       # single task
     python eval.py --episodes 16 --verbose
     python eval.py --out results.json
 """
 
 import argparse
 import json
+import random as _random
 from typing import Any, Dict, List
 
-from tasks.task1.environment import Task1Environment
-from tasks.task2.environment import Task2Environment
-from tasks.task3.environment import Task3Environment
-from env.schemas import Action, ActionType
+from tasks.task1 import Task1Environment
+from tasks.task2 import Task2Environment
+from tasks.task3 import Task3Environment
+from agents.task1 import oracle_t1, partial_t1, random_t1, floor_t1
+from agents.task2 import oracle_t2, partial_t2, random_t2, floor_t2
+from agents.task3 import oracle_t3, subfunction_t3, random_t3, floor_t3
 from data.data_loader import (
     load_contracts,
-    get_function_by_name,
     get_all_vulnerable_entries,
     get_all_property_entries,
     get_all_task3_entries,
 )
 
-
-# ─────────────────────────────────────────────────────────────────────────────
-# Task 1 agents
-# ─────────────────────────────────────────────────────────────────────────────
-
-def oracle_t1(env: Task1Environment, seed: int, verbose: bool = False) -> Dict[str, Any]:
-    """Submits the exact ground-truth function + vulnerability → score = 1.0."""
-    r = env.reset(seed=seed)
-    obs = r.observation
-    fn_name = env.state().target_function
-    contracts = load_contracts()
-    vuln_issue = ""
-    for c in contracts:
-        fn = get_function_by_name(c, fn_name)
-        if fn and fn.get("vulnerable"):
-            vuln_issue = fn["vulnerability_details"]["issue"]
-            break
-    if verbose:
-        print(f"    {obs.contract_name}.{fn_name}()  [{vuln_issue}]")
-    env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
-    env.step(Action(action_type=ActionType.GET_FUNCTION_CODE,
-                    params={"function_name": fn_name}))
-    result = env.step(Action(action_type=ActionType.SUBMIT,
-                             params={"function_name": fn_name,
-                                     "vulnerability_type": vuln_issue}))
-    v = result.reward.value
-    score = 1.0 if v >= 4.9 else (0.5 if v >= 0.9 else 0.0)
-    return {"seed": seed, "contract": obs.contract_name, "target_function": fn_name,
-            "vulnerability": vuln_issue, "grader_score": score,
-            "cumulative_reward": result.observation.cumulative_reward}
-
-
-def partial_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
-    """Right function, 'unknown' vuln type → score = 0.5."""
-    env.reset(seed=seed)
-    fn_name = env.state().target_function
-    result = env.step(Action(action_type=ActionType.SUBMIT,
-                             params={"function_name": fn_name, "vulnerability_type": "unknown"}))
-    v = result.reward.value
-    return {"seed": seed, "grader_score": 0.5 if v >= 0.9 else 0.0,
-            "cumulative_reward": result.observation.cumulative_reward}
-
-
-def wrong_t1(env: Task1Environment, seed: int) -> Dict[str, Any]:
-    """Always submits 'constructor' → score = 0.0."""
-    env.reset(seed=seed)
-    result = env.step(Action(action_type=ActionType.SUBMIT,
-                             params={"function_name": "constructor",
-                                     "vulnerability_type": "reentrancy"}))
-    return {"seed": seed, "grader_score": 0.0,
-            "cumulative_reward": result.observation.cumulative_reward}
-
-
 # ─────────────────────────────────────────────────────────────────────────────
-# Task 2 agents
-# ─────────────────────────────────────────────────────────────────────────────
-
-def oracle_t2(env: Task2Environment, seed: int, verbose: bool = False) -> Dict[str, Any]:
-    """Submits ground-truth natural_language → score ≥ 0.70."""
-    r = env.reset(seed=seed)
-    obs = r.observation
-    fn_name = obs.extra["target_function"]
-    contract = obs.contract_name
-    contracts = load_contracts()
-    gt_text = ""
-    for c in contracts:
-        if c["contract_name"] == contract:
-            fn = get_function_by_name(c, fn_name)
-            if fn and fn.get("property"):
-                gt_text = fn["property"]["natural_language"]
-            break
-    if verbose:
-        print(f"    {contract}.{fn_name}()")
-    env.step(Action(action_type=ActionType.GET_FUNCTION_CODE))
-    result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
-                             params={"property": gt_text}))
-    r_val = result.reward.value
-    score = round(r_val / 5.0, 4) if r_val > 0 else 0.0
-    return {"seed": seed, "contract": contract, "function": fn_name,
-            "grader_score": score, "cumulative_reward": result.observation.cumulative_reward}
-
-
-def partial_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
-    """Submits the function's NatSpec comment — partial credit."""
-    r = env.reset(seed=seed)
-    obs = r.observation
-    contracts = load_contracts()
-    comment = ""
-    for c in contracts:
-        if c["contract_name"] == obs.contract_name:
-            fn = get_function_by_name(c, obs.extra["target_function"])
-            if fn:
-                comment = fn.get("comment", "")
-            break
-    result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
-                             params={"property": comment}))
-    r_val = result.reward.value
-    return {"seed": seed, "grader_score": round(r_val / 5.0, 4) if r_val > 0 else 0.0,
-            "cumulative_reward": result.observation.cumulative_reward}
-
-
-def empty_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
-    """Submits empty string → score = 0.0."""
-    env.reset(seed=seed)
-    result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY, params={"property": ""}))
-    return {"seed": seed, "grader_score": 0.0,
-            "cumulative_reward": result.observation.cumulative_reward}
-
-
-# ─────────────────────────────────────────────────────────────────────────────
-# Task 3 agents
+# Evaluation runners
 # ─────────────────────────────────────────────────────────────────────────────
 
-def oracle_t3(env: Task3Environment, seed: int, verbose: bool = False) -> Dict[str, Any]:
-    """Always submits the exact target function → score = 1.0."""
-    r = env.reset(seed=seed)
-    obs = r.observation
-    fn_name = env.state().target_function
-    contract = obs.contract_name
-    if verbose:
-        prop = obs.extra.get("property_english", "")[:60]
-        print(f"    {contract}.{fn_name}()  \"{prop}\"")
-    env.step(Action(action_type=ActionType.GET_PROPERTY_SPECIFICATION))
-    env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
-    result = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
-                             params={"function_name": fn_name}))
-    v = result.reward.value
-    score = 1.0 if v >= 4.9 else (0.3 if v >= 1.0 else 0.0)
-    return {"seed": seed, "contract": contract, "target_function": fn_name,
-            "grader_score": score, "cumulative_reward": result.observation.cumulative_reward}
-
-
-def subfunction_t3(env: Task3Environment, seed: int) -> Dict[str, Any]:
-    """Submits the first partial-credit subfunction if it exists, else 'constructor'."""
-    r = env.reset(seed=seed)
-    obs = r.observation
-    contracts = load_contracts()
-    partial_fns = []
-    for c in contracts:
-        if c["contract_name"] == obs.contract_name:
-            fn = get_function_by_name(c, env.state().target_function)
-            if fn:
-                partial_fns = fn.get("task3", {}).get("partial_credit_functions", [])
-            break
-    submit_name = partial_fns[0] if partial_fns else "constructor"
-    result = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
-                             params={"function_name": submit_name}))
-    v = result.reward.value
-    score = 1.0 if v >= 4.9 else (0.3 if v >= 1.0 else 0.0)
-    return {"seed": seed, "grader_score": score, "submitted": submit_name,
-            "cumulative_reward": result.observation.cumulative_reward}
-
-
-def wrong_t3(env: Task3Environment, seed: int) -> Dict[str, Any]:
-    """Always submits 'constructor' → score = 0.0."""
-    env.reset(seed=seed)
-    result = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
-                             params={"function_name": "constructor"}))
-    return {"seed": seed, "grader_score": 0.0,
-            "cumulative_reward": result.observation.cumulative_reward}
-
+def _avg(episodes: List[Dict[str, Any]], key: str = "grader_score") -> float:
+    return sum(e[key] for e in episodes) / len(episodes) if episodes else 0.0
 
-# ─────────────────────────────────────────────────────────────────────────────
-# Evaluation runners
-# ─────────────────────────────────────────────────────────────────────────────
 
 def run_task1_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
     print("\n" + "=" * 64)
@@ -204,27 +51,37 @@ def run_task1_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
           f"{len(get_all_vulnerable_entries(contracts))} vulnerable functions\n")
     env = Task1Environment()
 
-    print("▶ Oracle (correct function + correct vuln type → 1.0):")
+    # Oracle
+    print("▶ Oracle (correct function + correct vuln → 1.0):")
     oracle_eps = []
     for i in range(n):
        ep = oracle_t1(env, seed_offset + i, verbose)
        oracle_eps.append(ep)
        print(f"    seed={ep['seed']:3d}  {ep['contract']:12s}.{ep['target_function']:18s}"
              f"  score={ep['grader_score']:.1f}  reward={ep['cumulative_reward']:+.2f}")
-    oracle_avg = sum(e["grader_score"] for e in oracle_eps) / n
-    oracle_avg_r = sum(e["cumulative_reward"] for e in oracle_eps) / n
-    print(f"\n  Oracle avg: {oracle_avg:.3f}  reward: {oracle_avg_r:+.2f}")
+    oracle_avg = _avg(oracle_eps)
+    print(f"\n  Oracle avg: {oracle_avg:.3f}")
 
-    print("\n▶ Partial (right function, wrong vuln → 0.5):")
+    # Partial
+    print("\n▶ Partial (correct function, 'unknown' vuln → 0.5):")
     partial_eps = [partial_t1(env, seed_offset + i) for i in range(n)]
-    partial_avg = sum(e["grader_score"] for e in partial_eps) / n
-    print(f"  Partial avg: {partial_avg:.3f}")
-
-    print("\n▶ Wrong (always 'constructor' → 0.0):")
-    wrong_eps = [wrong_t1(env, seed_offset + i) for i in range(n)]
-    wrong_avg = sum(e["grader_score"] for e in wrong_eps) / n
-    print(f"  Wrong avg: {wrong_avg:.3f}")
-
+    partial_avg = _avg(partial_eps)
+    print(f"  Partial avg: {partial_avg:.3f}")
+
+    # Random
+    print("\n▶ Random (random fn from list + random vuln type):")
+    random_eps = [random_t1(env, seed_offset + i) for i in range(n)]
+    random_avg = _avg(random_eps)
+    submitted = [(e.get("submitted_fn", "?"), e.get("submitted_vuln", "?")) for e in random_eps]
+    print(f"  Random avg: {random_avg:.3f}  submissions: {submitted}")
+
+    # Floor
+    print("\n▶ Floor (always 'constructor' → 0.0):")
+    floor_eps = [floor_t1(env, seed_offset + i) for i in range(n)]
+    floor_avg = _avg(floor_eps)
+    print(f"  Floor avg: {floor_avg:.3f}")
+
+    # Vulnerability type coverage
     vuln_seen: Dict[str, int] = {}
     for ep in oracle_eps:
         v = ep.get("vulnerability", "unknown")
@@ -233,16 +90,19 @@ def run_task1_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
     for v in sorted(vuln_seen):
         print(f"    {vuln_seen[v]:2d}× {v}")
 
-    assert oracle_avg == 1.0
-    assert partial_avg == 0.5
-    assert wrong_avg == 0.0
-    print("\n  ✅ Task 1: oracle(1.0) > partial(0.5) > wrong(0.0)")
+    assert oracle_avg == 1.0, f"Oracle avg {oracle_avg:.3f} should be 1.0"
+    assert partial_avg == 0.5, f"Partial avg {partial_avg:.3f} should be 0.5"
+    assert floor_avg == 0.0, f"Floor avg {floor_avg:.3f} should be 0.0"
+    assert oracle_avg >= random_avg >= floor_avg, \
+        f"Score ordering violated: oracle={oracle_avg}, random={random_avg}, floor={floor_avg}"
+    print(f"\n  ✅ Task 1: oracle(1.0) ≥ partial(0.5) ≥ random({random_avg:.3f}) ≥ floor(0.0)")
 
     return {
         "task_id": "task1_vuln_detection",
-        "oracle": {"avg_score": oracle_avg, "avg_reward": oracle_avg_r, "episodes": oracle_eps},
+        "oracle": {"avg_score": oracle_avg, "episodes": oracle_eps},
         "partial": {"avg_score": partial_avg, "episodes": partial_eps},
-        "wrong": {"avg_score": wrong_avg, "episodes": wrong_eps},
+        "random": {"avg_score": random_avg, "episodes": random_eps},
+        "floor": {"avg_score": floor_avg, "episodes": floor_eps},
         "vuln_coverage": vuln_seen,
     }
 
@@ -255,7 +115,8 @@ def run_task2_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
     print(f"  Dataset: {len(get_all_property_entries(contracts))} property entries\n")
     env = Task2Environment()
 
-    print("▶ Oracle (submits ground-truth natural language):")
+    # Oracle
+    print("▶ Oracle (submits ground-truth natural language):")
     oracle_eps = []
     for i in range(n):
         ep = oracle_t2(env, seed_offset + i, verbose)
@@ -263,31 +124,40 @@ def run_task2_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
         icon = "✅" if ep["grader_score"] >= 0.65 else "⚠️ "
         print(f"  {icon} seed={ep['seed']:3d}  {ep['contract']:12s}.{ep['function']:18s}"
               f"  score={ep['grader_score']:.3f}  reward={ep['cumulative_reward']:+.2f}")
-    oracle_avg = sum(e["grader_score"] for e in oracle_eps) / n
-    oracle_avg_r = sum(e["cumulative_reward"] for e in oracle_eps) / n
-    print(f"\n  Oracle avg: {oracle_avg:.3f}  reward: {oracle_avg_r:+.2f}")
+    oracle_avg = _avg(oracle_eps)
+    print(f"\n  Oracle avg: {oracle_avg:.3f}")
 
+    # Partial
     print("\n▶ Partial (submits NatSpec comment):")
-    partial_eps = [partial_t2(env, seed_offset + i) for i in range(n)]
-    partial_avg = sum(e["grader_score"] for e in partial_eps) / n
-    partial_avg_r = sum(e["cumulative_reward"] for e in partial_eps) / n
-    print(f"  Partial avg: {partial_avg:.3f}  reward: {partial_avg_r:+.2f}")
-
-    print("\n▶ Empty (submits nothing → 0.0):")
-    empty_eps = [empty_t2(env, seed_offset + i) for i in range(n)]
-    empty_avg = sum(e["grader_score"] for e in empty_eps) / n
-    print(f"  Empty avg: {empty_avg:.3f}")
-
-    assert oracle_avg > 0.60
-    assert oracle_avg > partial_avg
-    assert empty_avg == 0.0
-    print(f"\n  ✅ Task 2: oracle({oracle_avg:.3f}) > partial({partial_avg:.3f}) > empty(0.0)")
+    partial_eps = [partial_t2(env, seed_offset + i) for i in range(n)]
+    partial_avg = _avg(partial_eps)
+    print(f"  Partial avg: {partial_avg:.3f}")
+
+    # Random
+    print("\n▶ Random (random browse + generic property template):")
+    random_eps = [random_t2(env, seed_offset + i) for i in range(n)]
+    random_avg = _avg(random_eps)
+    print(f"  Random avg: {random_avg:.3f}")
+
+    # Floor
+    print("\n▶ Floor (submits empty string → 0.0):")
+    floor_eps = [floor_t2(env, seed_offset + i) for i in range(n)]
+    floor_avg = _avg(floor_eps)
+    print(f"  Floor avg: {floor_avg:.3f}")
+
+    assert oracle_avg > 0.60, f"Oracle avg {oracle_avg:.3f} should be > 0.60"
+    assert oracle_avg > partial_avg >= floor_avg, \
+        "Score ordering violated: oracle > partial >= floor"
+    assert floor_avg == 0.0, f"Floor avg {floor_avg:.3f} should be 0.0"
+    print(f"\n  ✅ Task 2: oracle({oracle_avg:.3f}) > partial({partial_avg:.3f})"
+          f" ≥ random({random_avg:.3f}) ≥ floor(0.0)")
 
     return {
         "task_id": "task2_property_discovery",
-        "oracle": {"avg_score": oracle_avg, "avg_reward": oracle_avg_r, "episodes": oracle_eps},
-        "partial": {"avg_score": partial_avg, "avg_reward": partial_avg_r, "episodes": partial_eps},
-        "empty": {"avg_score": empty_avg, "episodes": empty_eps},
+        "oracle": {"avg_score": oracle_avg, "episodes": oracle_eps},
+        "partial": {"avg_score": partial_avg, "episodes": partial_eps},
+        "random": {"avg_score": random_avg, "episodes": random_eps},
+        "floor": {"avg_score": floor_avg, "episodes": floor_eps},
    }
 
 
@@ -299,40 +169,50 @@ def run_task3_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
     print(f"  Dataset: {len(get_all_task3_entries(contracts))} rule-check episodes\n")
     env = Task3Environment()
 
-    print("▶ Oracle (submits exact target function → 1.0):")
+    # Oracle
+    print("▶ Oracle (exact target function → 1.0):")
     oracle_eps = []
     for i in range(n):
         ep = oracle_t3(env, seed_offset + i, verbose)
         oracle_eps.append(ep)
         print(f"    seed={ep['seed']:3d}  {ep['contract']:12s}.{ep['target_function']:18s}"
               f"  score={ep['grader_score']:.1f}  reward={ep['cumulative_reward']:+.2f}")
-    oracle_avg = sum(e["grader_score"] for e in oracle_eps) / n
-    oracle_avg_r = sum(e["cumulative_reward"] for e in oracle_eps) / n
-    print(f"\n  Oracle avg: {oracle_avg:.3f}  reward: {oracle_avg_r:+.2f}")
-
-    print("\n▶ Subfunction (partial-credit callee or fallback to wrong):")
-    sub_eps = [subfunction_t3(env, seed_offset + i) for i in range(n)]
-    sub_avg = sum(e["grader_score"] for e in sub_eps) / n
-    sub_avg_r = sum(e["cumulative_reward"] for e in sub_eps) / n
-    submitted = list({e.get("submitted", "?") for e in sub_eps})
-    print(f"  Subfunction avg: {sub_avg:.3f}  reward: {sub_avg_r:+.2f}  "
-          f"submitted fns: {submitted}")
-
-    print("\n▶ Wrong (always 'constructor' 0.0):")
-    wrong_eps = [wrong_t3(env, seed_offset + i) for i in range(n)]
-    wrong_avg = sum(e["grader_score"] for e in wrong_eps) / n
-    print(f"  Wrong avg: {wrong_avg:.3f}")
-
-    assert oracle_avg == 1.0
-    assert 0.0 <= sub_avg <= oracle_avg
-    assert wrong_avg == 0.0
-    print(f"\n  ✅ Task 3: oracle(1.0) ≥ subfunction({sub_avg:.3f}) > wrong(0.0)")
+    oracle_avg = _avg(oracle_eps)
+    print(f"\n  Oracle avg: {oracle_avg:.3f}")
+
+    # Subfunction (partial credit)
+    print("\n▶ Subfunction (partial-credit callee if exists, else constructor):")
+    sub_eps = [subfunction_t3(env, seed_offset + i) for i in range(n)]
+    sub_avg = _avg(sub_eps)
+    submitted_sub = list({e.get("submitted", "?") for e in sub_eps})
+    print(f"  Subfunction avg: {sub_avg:.3f}  submitted: {submitted_sub}")
+
+    # Random
+    print("\n▶ Random (lists functions, submits uniformly random one):")
+    random_eps = [random_t3(env, seed_offset + i) for i in range(n)]
+    random_avg = _avg(random_eps)
+    submitted_rand = [e.get("submitted", "?") for e in random_eps]
+    print(f"  Random avg: {random_avg:.3f}  submitted: {submitted_rand}")
+
+    # Floor
+    print("\n▶ Floor (always 'constructor' → 0.0):")
+    floor_eps = [floor_t3(env, seed_offset + i) for i in range(n)]
+    floor_avg = _avg(floor_eps)
+    print(f"  Floor avg: {floor_avg:.3f}")
+
+    assert oracle_avg == 1.0, f"Oracle avg {oracle_avg:.3f} should be 1.0"
+    assert floor_avg == 0.0, f"Floor avg {floor_avg:.3f} should be 0.0"
+    assert oracle_avg >= random_avg >= floor_avg, \
+        f"Score ordering violated: oracle={oracle_avg}, random={random_avg}, floor={floor_avg}"
+    print(f"\n  ✅ Task 3: oracle(1.0) ≥ subfunction({sub_avg:.3f})"
+          f" ≥ random({random_avg:.3f}) ≥ floor(0.0)")
 
     return {
         "task_id": "task3_rule_checker",
-        "oracle": {"avg_score": oracle_avg, "avg_reward": oracle_avg_r, "episodes": oracle_eps},
-        "subfunction": {"avg_score": sub_avg, "avg_reward": sub_avg_r, "episodes": sub_eps},
-        "wrong": {"avg_score": wrong_avg, "episodes": wrong_eps},
+        "oracle": {"avg_score": oracle_avg, "episodes": oracle_eps},
+        "subfunction": {"avg_score": sub_avg, "episodes": sub_eps},
+        "random": {"avg_score": random_avg, "episodes": random_eps},
+        "floor": {"avg_score": floor_avg, "episodes": floor_eps},
    }
 
 
@@ -342,13 +222,18 @@ def run_task3_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
 
 def main():
     parser = argparse.ArgumentParser(
-        description="Evaluate Task 1, 2, and/or 3 of the SC Audit RL Environment"
+        description="Evaluate Task 1, 2, and/or 3 oracle / partial / random / floor"
     )
-    parser.add_argument("--episodes", type=int, default=8)
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--task", choices=["1", "2", "3", "all"], default="all")
-    parser.add_argument("--verbose", action="store_true")
-    parser.add_argument("--out", default="eval_results.json")
+    parser.add_argument("--episodes", type=int, default=8,
+                        help="Episodes per agent tier (default: 8)")
+    parser.add_argument("--seed", type=int, default=42,
+                        help="Starting RNG seed (default: 42)")
+    parser.add_argument("--task", choices=["1", "2", "3", "all"], default="all",
+                        help="Which task(s) from [1, 2, 3] to evaluate (default: all)")
+    parser.add_argument("--verbose", action="store_true",
+                        help="Print per-episode target details for oracle agents")
+    parser.add_argument("--out", default="eval_results.json",
+                        help="Output JSON file (default: eval_results.json)")
    args = parser.parse_args()
 
    report: Dict[str, Any] = {"num_episodes": args.episodes, "seed_offset": args.seed}
@@ -360,14 +245,16 @@ def main():
     if args.task in ("3", "all"):
         report["task3"] = run_task3_eval(args.episodes, args.seed, args.verbose)
 
+    # ── Summary ──────────────────────────────────────────────────────────────
     print("\n" + "=" * 64)
     print("EVALUATION COMPLETE")
     print("=" * 64)
-    for label, key, tiers in [
-        ("Task 1", "task1", ["oracle", "partial", "wrong"]),
-        ("Task 2", "task2", ["oracle", "partial", "empty"]),
-        ("Task 3", "task3", ["oracle", "subfunction", "wrong"]),
-    ]:
+    rows = [
+        ("Task 1", "task1", ["oracle", "partial", "random", "floor"]),
+        ("Task 2", "task2", ["oracle", "partial", "random", "floor"]),
+        ("Task 3", "task3", ["oracle", "subfunction", "random", "floor"]),
+    ]
+    for label, key, tiers in rows:
         if key in report:
             scores = " ".join(
                 f"{t}={report[key][t]['avg_score']:.3f}" for t in tiers
inference.py CHANGED
@@ -1,36 +1,40 @@
1
  """
2
  inference.py
3
  ------------
4
- Baseline inference script for the Smart Contract Audit RL Environment.
5
- Implements Task 1 (Vulnerability Detection), Task 2 (Property Discovery),
6
- and Task 3 (Rule Checker).
7
 
8
- Environment variables:
9
- API_BASE_URL – LLM API endpoint (e.g. https://api.openai.com/v1)
10
- MODEL_NAME – model identifier (e.g. gpt-4o-mini)
11
- HF_TOKEN – API key
 
 
 
 
 
 
 
 
12
 
13
  Usage:
14
  python inference.py
15
 
16
  Output:
17
- Per-task scores printed to stdout.
18
- Final baseline scores written to baseline_scores.json.
19
-
20
- Runtime: < 5 minutes on 3 episodes per task with gpt-4o-mini.
21
  """
22
 
 
23
  import json
24
  import os
25
  import sys
26
  import time
27
- from typing import Any, Dict, List
28
 
29
  from openai import OpenAI
30
 
31
- from tasks.task1.environment import Task1Environment
32
- from tasks.task2.environment import Task2Environment
33
- from tasks.task3.environment import Task3Environment
34
  from env.schemas import Action, ActionType
35
  from utils import T1_SYSTEM, T2_SYSTEM, T3_SYSTEM
36
 
@@ -38,83 +42,152 @@ from utils import T1_SYSTEM, T2_SYSTEM, T3_SYSTEM
38
  # Configuration
39
  # ─────────────────────────────────────────────────────────────────────────────
40
 
41
- API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
42
- MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
43
- HF_TOKEN = os.environ.get("HF_TOKEN", "")
44
 
45
  if not HF_TOKEN:
46
- print("WARNING: HF_TOKEN not set. API calls may fail.", file=sys.stderr)
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
- MAX_STEPS_T1 = 15
49
- MAX_STEPS_T2 = 10
50
- NUM_EPISODES = 3
51
- SEED_BASE_T1 = 42
52
- SEED_BASE_T2 = 10
53
 
54
  client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
55
 
 
56
  # ─────────────────────────────────────────────────────────────────────────────
57
- # Task 1 agent
58
  # ─────────────────────────────────────────────────────────────────────────────
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  def _t1_user_msg(obs: Dict[str, Any]) -> str:
62
  return (
63
  f"Contract: {obs['contract_name']}\n"
64
  f"Description: {obs['contract_description']}\n"
65
- f"Step: {obs['step_count']} | Reward: {obs['cumulative_reward']:.2f}\n\n"
66
- f"Last action: {obs['last_action'] or 'None'}\n"
67
- f"Result: {obs['last_action_result'] or 'Episode started.'}"
68
  )
69
 
70
 
71
- def run_t1_episode(env: Task1Environment, seed: int, ep: int) -> Dict[str, Any]:
72
- r = env.reset(seed=seed)
 
73
  obs = r.observation.model_dump()
74
- print(f" ep={ep} seed={seed} contract={obs['contract_name']}")
75
-
76
- messages = [{"role": "system", "content": T1_SYSTEM}]
77
- grader_score = 0.0
78
- cum_reward = 0.0
79
-
80
- for step in range(MAX_STEPS_T1):
81
- messages.append({"role": "user", "content": _t1_user_msg(obs)})
82
- try:
83
- resp = client.chat.completions.create(
84
- model=MODEL_NAME, messages=messages,
85
- max_tokens=200, temperature=0.0,
86
- )
87
- raw = resp.choices[0].message.content.strip()
88
- except Exception as e:
89
- print(f" LLM error: {e}", file=sys.stderr)
90
- break
91
-
92
- try:
93
- parsed = json.loads(raw)
94
- at = ActionType(parsed["action"])
95
- params = parsed.get("params", {})
96
- except Exception:
97
- at, params = ActionType.LIST_FUNCTIONS, {}
98
-
99
- messages.append({"role": "assistant", "content": raw})
100
- result = env.step(Action(action_type=at, params=params))
101
- obs = result.observation.model_dump()
102
- print(f" step {step+1:2d}: {at.value:25s} r={result.reward.value:+.2f}")
103
-
104
- if result.done:
105
- v = result.reward.value
106
- grader_score = 1.0 if v >= 4.9 else (0.5 if v >= 0.9 else 0.0)
107
- cum_reward = obs["cumulative_reward"]
108
- break
109
- time.sleep(0.3)
110
-
111
- print(f" → grader_score={grader_score:.1f} cum_reward={cum_reward:.2f}")
112
- return {"episode": ep, "seed": seed, "contract": obs["contract_name"],
113
- "grader_score": grader_score, "cumulative_reward": cum_reward}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
 
116
  # ──────────────────────────────────────────���──────────────────────────────────
117
- # Task 2 agent
118
  # ─────────────────────────────────────────────────────────────────────────────
119
 
120
 
@@ -124,198 +197,246 @@ def _t2_user_msg(obs: Dict[str, Any]) -> str:
124
  f"Contract : {obs['contract_name']}\n"
125
  f"Function : {extra.get('target_function', '?')} "
126
  f"({extra.get('target_signature', '')})\n"
127
- f"Step: {obs['step_count']} | Reward: {obs['cumulative_reward']:.2f}\n\n"
128
- f"Last action: {obs['last_action'] or 'None'}\n"
129
- f"Result:\n{obs['last_action_result'] or 'Episode started — begin exploring.'}"
130
  )
131
 
132
 
133
- def run_t2_episode(env: Task2Environment, seed: int, ep: int) -> Dict[str, Any]:
134
- r = env.reset(seed=seed)
 
135
  obs = r.observation.model_dump()
136
  fn = obs["extra"].get("target_function", "?")
137
- print(f" ep={ep} seed={seed} {obs['contract_name']}.{fn}()")
138
-
139
- messages = [{"role": "system", "content": T2_SYSTEM}]
140
- grader_score = 0.0
141
- cum_reward = 0.0
142
-
143
- for step in range(MAX_STEPS_T2):
144
- messages.append({"role": "user", "content": _t2_user_msg(obs)})
145
- try:
146
- resp = client.chat.completions.create(
147
- model=MODEL_NAME, messages=messages,
148
- max_tokens=400, temperature=0.0,
149
- )
150
- raw = resp.choices[0].message.content.strip()
151
- except Exception as e:
152
- print(f" LLM error: {e}", file=sys.stderr)
153
- break
154
-
155
- try:
156
- parsed = json.loads(raw)
157
- at = ActionType(parsed["action"])
158
- params = parsed.get("params", {})
159
- except Exception:
160
- at, params = ActionType.GET_FUNCTION_CODE, {}
161
-
162
- messages.append({"role": "assistant", "content": raw})
163
- result = env.step(Action(action_type=at, params=params))
164
- obs = result.observation.model_dump()
165
- r_val = result.reward.value
166
- print(f" step {step+1:2d}: {at.value:25s} r={r_val:+.2f}")
167
-
168
- if result.done:
169
- grader_score = round(r_val / 5.0, 3) if r_val > 0 else 0.0
170
- cum_reward = obs["cumulative_reward"]
171
- break
172
- time.sleep(0.3)
173
-
174
- print(f" → grader_score={grader_score:.3f} cum_reward={cum_reward:.2f}")
175
- return {"episode": ep, "seed": seed,
176
- "contract": obs["contract_name"], "function": fn,
177
- "grader_score": grader_score, "cumulative_reward": cum_reward}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
 
180
  # ─────────────────────────────────────────────────────────────────────────────
181
- # Task runners
182
  # ─────────────────────────────────────────────────────────────────────────────
183
 
184
- def run_task1(n: int = NUM_EPISODES) -> Dict[str, Any]:
185
- print("\n" + "="*60)
186
- print("TASK 1: Targeted Vulnerability Detection")
187
- print("="*60)
188
- env = Task1Environment()
189
- episodes = [run_t1_episode(env, SEED_BASE_T1 + i, i+1) for i in range(n)]
190
- avg_s = sum(e["grader_score"] for e in episodes) / n
191
- avg_r = sum(e["cumulative_reward"] for e in episodes) / n
192
- print(f"\n Avg grader score : {avg_s:.3f}")
193
- print(f" Avg cum reward : {avg_r:.2f}")
194
- return {"task_id": "task1_vuln_detection", "name": "Targeted Vulnerability Detection",
195
- "status": "active", "num_episodes": n, "episodes": episodes,
196
- "avg_grader_score": avg_s, "avg_cumulative_reward": avg_r}
197
-
198
-
199
- def run_task2(n: int = NUM_EPISODES) -> Dict[str, Any]:
200
- print("\n" + "="*60)
201
- print("TASK 2: Property Discovery")
202
- print("="*60)
203
- env = Task2Environment()
204
- episodes = [run_t2_episode(env, SEED_BASE_T2 + i, i+1) for i in range(n)]
205
- avg_s = sum(e["grader_score"] for e in episodes) / n
206
- avg_r = sum(e["cumulative_reward"] for e in episodes) / n
207
- print(f"\n Avg grader score : {avg_s:.3f}")
208
- print(f" Avg cum reward : {avg_r:.2f}")
209
- return {"task_id": "task2_property_discovery", "name": "Property Discovery",
210
- "status": "active", "num_episodes": n, "episodes": episodes,
211
- "avg_grader_score": avg_s, "avg_cumulative_reward": avg_r}
212
-
213
 
214
  def _t3_user_msg(obs: Dict[str, Any]) -> str:
215
  extra = obs.get("extra", {})
216
  return (
217
  f"Contract : {obs['contract_name']}\n"
218
- f"Property : {extra.get('property_english', '(no property)')}\n"
219
- f"Step: {obs['step_count']} | Reward: {obs['cumulative_reward']:.2f}\n\n"
220
- f"Last action: {obs['last_action'] or 'None'}\n"
221
- f"Result:\n{obs['last_action_result'] or 'Episode started.'}"
222
  )
223
 
224
 
225
- def run_t3_episode(env: Task3Environment, seed: int, ep: int) -> Dict[str, Any]:
 
226
  r = env.reset(seed=seed)
227
  obs = r.observation.model_dump()
228
- prop_preview = obs['extra'].get('property_english', '')[:55]
229
- print(f" ep={ep} seed={seed} {obs['contract_name']} \"{prop_preview}...\"")
230
-
231
- messages = [{"role": "system", "content": T3_SYSTEM}]
232
- grader_score = 0.0
233
- cum_reward = 0.0
234
-
235
- for step in range(15):
236
- messages.append({"role": "user", "content": _t3_user_msg(obs)})
237
- try:
238
- resp = client.chat.completions.create(
239
- model=MODEL_NAME, messages=messages,
240
- max_tokens=200, temperature=0.0,
241
- )
242
- raw = resp.choices[0].message.content.strip()
243
- except Exception as e:
244
- print(f" LLM error: {e}", file=sys.stderr)
245
- break
246
-
247
- try:
248
- parsed = json.loads(raw)
249
- at = ActionType(parsed["action"])
250
- params = parsed.get("params", {})
251
- except Exception:
252
- at, params = ActionType.LIST_FUNCTIONS, {}
253
-
254
- messages.append({"role": "assistant", "content": raw})
255
- result = env.step(Action(action_type=at, params=params))
256
- obs = result.observation.model_dump()
257
- print(f" step {step+1:2d}: {at.value:28s} r={result.reward.value:+.2f}")
258
-
259
- if result.done:
260
- v = result.reward.value
261
- grader_score = 1.0 if v >= 4.9 else (0.3 if v >= 1.0 else 0.0)
262
- cum_reward = obs["cumulative_reward"]
263
- break
264
- time.sleep(0.3)
265
-
266
- print(f" → grader_score={grader_score:.1f} cum_reward={cum_reward:.2f}")
267
- return {"episode": ep, "seed": seed, "contract": obs["contract_name"],
268
- "grader_score": grader_score, "cumulative_reward": cum_reward}
 
269
 
270
 
271
  def run_task3(n: int = NUM_EPISODES) -> Dict[str, Any]:
272
- print("\n" + "="*60)
273
- print("TASK 3: Rule Checker")
274
- print("="*60)
275
- env = Task3Environment()
276
- episodes = [run_t3_episode(env, 42 + i, i + 1) for i in range(n)]
277
- avg_s = sum(e["grader_score"] for e in episodes) / n
278
- avg_r = sum(e["cumulative_reward"] for e in episodes) / n
279
- print(f"\n Avg grader score : {avg_s:.3f}")
280
- print(f" Avg cum reward : {avg_r:.2f}")
281
- return {"task_id": "task3_rule_checker", "name": "Rule Checker",
282
- "status": "active", "num_episodes": n, "episodes": episodes,
283
- "avg_grader_score": avg_s, "avg_cumulative_reward": avg_r}
 
 
284
 
285
 
286
  # ─────────────────────────────────────────────────────────────────────────────
287
  # Main
288
  # ─────────────────────────────────────────────────────────────────────────────
289
 
290
- def main():
291
- print("Smart Contract Audit RL Environment Baseline Inference")
292
- print(f"Model: {MODEL_NAME} | Base URL: {API_BASE_URL}")
 
293
 
294
  t1 = run_task1(NUM_EPISODES)
295
  t2 = run_task2(NUM_EPISODES)
296
  t3 = run_task3(NUM_EPISODES)
297
 
298
  results = {
299
- "model": MODEL_NAME, "base_url": API_BASE_URL,
300
- "tasks": [t1, t2, t3],
 
301
  }
302
-
303
- active = results["tasks"]
304
- overall = sum(t["avg_grader_score"] for t in active) / len(active)
305
  results["overall_avg_score"] = overall
306
 
307
- print("\n" + "="*60)
308
- print("BASELINE SUMMARY")
309
- print("="*60)
310
  for t in results["tasks"]:
311
- icon = "✅" if t["status"] == "active" else "⏳"
312
- print(f" {icon} {t['name']:40s}: {t['avg_grader_score']:.3f}")
313
- print(f"\n Overall (active tasks): {overall:.3f}")
314
 
315
  with open("baseline_scores.json", "w") as f:
316
  json.dump(results, f, indent=2)
317
- print("\n Scores written to baseline_scores.json")
318
 
319
 
320
  if __name__ == "__main__":
321
- main()
 
1
  """
2
  inference.py
3
  ------------
4
+ Baseline inference script for the Smart Contract Audit RL Environment.
 
 
5
 
6
+ Implements agents for all three tasks using the OpenAI-compatible client.
7
+ Emits mandatory structured stdout in the OpenEnv format.
8
+
9
+ MANDATORY ENV VARS:
10
+ API_BASE_URL LLM API endpoint (default: https://api.openai.com/v1)
11
+ MODEL_NAME Model identifier (default: gpt-4o-mini)
12
+ HF_TOKEN API key / HF token
13
+
14
+ MANDATORY STDOUT FORMAT (per episode):
15
+ [START] task=<id> env=smart-contract-audit model=<model>
16
+ [STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<str|null>
17
+ [END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...>
18
 
19
  Usage:
20
  python inference.py
21
 
22
  Output:
23
+ Structured stdout per episode, plus baseline_scores.json summary.
 
24
  """
25
 
26
+ import asyncio
27
  import json
28
  import os
29
  import sys
30
  import time
31
+ from typing import Any, Dict, List, Optional
32
 
33
  from openai import OpenAI
34
 
35
+ from tasks.task1 import Task1Environment
36
+ from tasks.task2 import Task2Environment
37
+ from tasks.task3 import Task3Environment
38
  from env.schemas import Action, ActionType
39
  from utils import T1_SYSTEM, T2_SYSTEM, T3_SYSTEM
40
 
 
42
  # Configuration
43
  # ─────────────────────────────────────────────────────────────────────────────
44
 
45
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
46
+ MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
47
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
48
 
49
  if not HF_TOKEN:
50
+ print("[WARN] HF_TOKEN not set API calls may fail.", file=sys.stderr)
51
+
52
+ # Benchmark / environment identifier (constant for this env)
53
+ ENV_BENCHMARK = "smart-contract-audit"
54
+
55
+ # Episodes per task
56
+ NUM_EPISODES = 3
57
+ SEED_BASE = 42
58
+
59
+ # Max steps per task
60
+ MAX_STEPS_T1 = 15
61
+ MAX_STEPS_T2 = 10
62
+ MAX_STEPS_T3 = 12
63
 
64
+ # A grader_score >= this is considered a "success" for the [END] line
65
+ SUCCESS_SCORE_THRESHOLD = 0.5
 
66
 
67
  client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
68
 
69
+
70
  # ─────────────────────────────────────────────────────────────────────────────
71
+ # Mandatory stdout helpers
72
  # ─────────────────────────────────────────────────────────────────────────────
73
 
74
+ def log_start(task: str, env: str, model: str) -> None:
75
+ """Emit the [START] line — one per episode."""
76
+ print(f"[START] task={task} env={env} model={model}", flush=True)
77
+
78
+
79
+ def log_step(
80
+ step: int,
81
+ action: str,
82
+ reward: float,
83
+ done: bool,
84
+ error: Optional[str] = None,
85
+ ) -> None:
86
+ """Emit a [STEP] line — one per env.step() call."""
87
+ error_val = error if error else "null"
88
+ print(
89
+ f"[STEP] step={step} action={action} "
90
+ f"reward={reward:.2f} done={str(done).lower()} error={error_val}",
91
+ flush=True,
92
+ )
93
+
94
+
95
+ def log_end(
96
+ success: bool,
97
+ steps: int,
98
+ score: float,
99
+ rewards: List[float],
100
+ ) -> None:
101
+ """Emit the [END] line — one per episode, always emitted."""
102
+ rewards_str = ",".join(f"{r:.2f}" for r in rewards)
103
+ print(
104
+ f"[END] success={str(success).lower()} steps={steps} "
105
+ f"score={score:.3f} rewards={rewards_str}",
106
+ flush=True,
107
+ )
108
+
109
+
110
+ # ─────────────────────────────────────────────────────────────────────────────
111
+ # Task 1 — Targeted Vulnerability Detection
112
+ # ─────────────────────────────────────────────────────────────────────────────
113
 
114
  def _t1_user_msg(obs: Dict[str, Any]) -> str:
115
  return (
116
  f"Contract: {obs['contract_name']}\n"
117
  f"Description: {obs['contract_description']}\n"
118
+ f"Step: {obs['step_count']} | Reward so far: {obs['cumulative_reward']:.2f}\n\n"
119
+ f"Last action : {obs['last_action'] or 'None'}\n"
120
+ f"Last result : {obs['last_action_result'] or 'Episode just started.'}"
121
  )
122
 
123
 
124
+ def _run_t1_episode(env: Task1Environment, seed: int, ep_num: int) -> Dict[str, Any]:
125
+ """Run one Task 1 episode; emit [START]/[STEP]/[END]."""
126
+ r = env.reset(seed=seed)
127
  obs = r.observation.model_dump()
128
+
129
+ log_start(task="task1_vuln_detection", env=ENV_BENCHMARK, model=MODEL_NAME)
130
+
131
+ messages = [{"role": "system", "content": T1_SYSTEM}]
132
+ step_rewards: List[float] = []
133
+ grader_score = 0.0
134
+ steps_taken = 0
135
+ error_msg: Optional[str] = None
136
+
137
+ try:
138
+ for step in range(1, MAX_STEPS_T1 + 1):
139
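+ # One user turn per observation; the assistant must reply with a JSON action.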
+ messages.append({"role": "user", "content": _t1_user_msg(obs)})
140
+ try:
141
+ resp = client.chat.completions.create(
142
+ model=MODEL_NAME, messages=messages,
143
+ max_tokens=200, temperature=0.0,
144
+ )
145
+ raw = resp.choices[0].message.content.strip() # type: ignore
146
+ error_msg = None
147
+ except Exception as e:
148
+ raw = ""
149
+ error_msg = str(e)[:80]
150
+ print(f"[DEBUG] T1 LLM error ep={ep_num} step={step}: {e}", file=sys.stderr)
151
+
152
+ try:
153
+ parsed = json.loads(raw)
154
+ at = ActionType(parsed["action"])
155
+ params = parsed.get("params", {})
156
+ except Exception:
157
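+ # Reply was not valid JSON: fall back to a cheap read-only action.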
+ at, params = ActionType.LIST_FUNCTIONS, {}
158
+
159
+ messages.append({"role": "assistant", "content": raw})
160
+ result = env.step(Action(action_type=at, params=params))
161
+ obs = result.observation.model_dump()
162
+ r_val = result.reward.value
163
+ done = result.done
164
+
165
+ step_rewards.append(r_val)
166
+ steps_taken = step
167
+ log_step(step=step, action=at.value, reward=r_val, done=done, error=error_msg)
168
+
169
+ if done:
170
+ v = r_val
171
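+ # Terminal reward ≈ +5 → full credit (1.0), ≈ +1 → partial (0.5); see Task1Grader.reward_for_score.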
+ grader_score = 1.0 if v >= 4.9 else (0.5 if v >= 0.9 else 0.0)
172
+ break
173
+
174
+ time.sleep(0.3)
175
+
176
+ finally:
177
+ success = grader_score >= SUCCESS_SCORE_THRESHOLD
178
+ log_end(success=success, steps=steps_taken, score=grader_score, rewards=step_rewards)
179
+
180
+ return {
181
+ "episode": ep_num,
182
+ "seed": seed,
183
+ "contract": obs["contract_name"],
184
+ "grader_score": grader_score,
185
+ "cumulative_reward": obs["cumulative_reward"],
186
+ }
187
 
188
 
189
  # ─────────────────────────────────────────────────────────────────────────────
190
+ # Task 2 — Property Discovery
191
  # ─────────────────────────────────────────────────────────────────────────────
192
 
193
 
 
197
  f"Contract : {obs['contract_name']}\n"
198
  f"Function : {extra.get('target_function', '?')} "
199
  f"({extra.get('target_signature', '')})\n"
200
+ f"Step: {obs['step_count']} | Reward so far: {obs['cumulative_reward']:.2f}\n\n"
201
+ f"Last action : {obs['last_action'] or 'None'}\n"
202
+ f"Last result :\n{obs['last_action_result'] or 'Episode just started.'}"
203
  )
204
 
205
 
206
+ def _run_t2_episode(env: Task2Environment, seed: int, ep_num: int) -> Dict[str, Any]:
207
+ """Run one Task 2 episode; emit [START]/[STEP]/[END]."""
208
+ r = env.reset(seed=seed)
209
  obs = r.observation.model_dump()
210
  fn = obs["extra"].get("target_function", "?")
211
+
212
+ log_start(task="task2_property_discovery", env=ENV_BENCHMARK, model=MODEL_NAME)
213
+
214
+ messages = [{"role": "system", "content": T2_SYSTEM}]
215
+ step_rewards: List[float] = []
216
+ grader_score = 0.0
217
+ steps_taken = 0
218
+ error_msg: Optional[str] = None
219
+
220
+ try:
221
+ for step in range(1, MAX_STEPS_T2 + 1):
222
+ messages.append({"role": "user", "content": _t2_user_msg(obs)})
223
+ try:
224
+ resp = client.chat.completions.create(
225
+ model=MODEL_NAME, messages=messages,
226
+ max_tokens=400, temperature=0.0,
227
+ )
228
+ raw = resp.choices[0].message.content.strip() # type: ignore
229
+ error_msg = None
230
+ except Exception as e:
231
+ raw = ""
232
+ error_msg = str(e)[:80]
233
+ print(f"[DEBUG] T2 LLM error ep={ep_num} step={step}: {e}", file=sys.stderr)
234
+
235
+ try:
236
+ parsed = json.loads(raw)
237
+ at = ActionType(parsed["action"])
238
+ params = parsed.get("params", {})
239
+ except Exception:
240
+ at, params = ActionType.GET_FUNCTION_CODE, {}
241
+
242
+ messages.append({"role": "assistant", "content": raw})
243
+ result = env.step(Action(action_type=at, params=params))
244
+ obs = result.observation.model_dump()
245
+ r_val = result.reward.value
246
+ done = result.done
247
+
248
+ step_rewards.append(r_val)
249
+ steps_taken = step
250
+ log_step(step=step, action=at.value, reward=r_val, done=done, error=error_msg)
251
+
252
+ if done:
253
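+ # submit_property pays score * 5.0, so dividing by 5 recovers the 0-1 grader score.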
+ grader_score = round(r_val / 5.0, 3) if r_val > 0 else 0.0
254
+ break
255
+
256
+ time.sleep(0.3)
257
+
258
+ finally:
259
+ success = grader_score >= SUCCESS_SCORE_THRESHOLD
260
+ log_end(success=success, steps=steps_taken, score=grader_score, rewards=step_rewards)
261
+
262
+ return {
263
+ "episode": ep_num,
264
+ "seed": seed,
265
+ "contract": obs["contract_name"],
266
+ "function": fn,
267
+ "grader_score": grader_score,
268
+ "cumulative_reward": obs["cumulative_reward"],
269
+ }
270
 
271
 
272
  # ─────────────────────────────────────────────────────────────────────────────
273
+ # Task 3 — Rule Checker
274
  # ─────────────────────────────────────────────────────────────────────────────
275
 
 
276
 
277
  def _t3_user_msg(obs: Dict[str, Any]) -> str:
278
  extra = obs.get("extra", {})
279
  return (
280
  f"Contract : {obs['contract_name']}\n"
281
+ f"Property : {extra.get('property_english', '(none)')}\n"
282
+ f"Step: {obs['step_count']} | Reward so far: {obs['cumulative_reward']:.2f}\n\n"
283
+ f"Last action : {obs['last_action'] or 'None'}\n"
284
+ f"Last result :\n{obs['last_action_result'] or 'Episode just started.'}"
285
  )
286
 
287
 
288
+ def _run_t3_episode(env: Task3Environment, seed: int, ep_num: int) -> Dict[str, Any]:
289
+ """Run one Task 3 episode; emit [START]/[STEP]/[END]."""
290
  r = env.reset(seed=seed)
291
  obs = r.observation.model_dump()
292
+
293
+ log_start(task="task3_rule_checker", env=ENV_BENCHMARK, model=MODEL_NAME)
294
+
295
+ messages = [{"role": "system", "content": T3_SYSTEM}]
296
+ step_rewards: List[float] = []
297
+ grader_score = 0.0
298
+ steps_taken = 0
299
+ error_msg: Optional[str] = None
300
+
301
+ try:
302
+ for step in range(1, MAX_STEPS_T3 + 1):
303
+ messages.append({"role": "user", "content": _t3_user_msg(obs)})
304
+ try:
305
+ resp = client.chat.completions.create(
306
+ model=MODEL_NAME, messages=messages,
307
+ max_tokens=200, temperature=0.0,
308
+ )
309
+ raw = resp.choices[0].message.content.strip() # type: ignore
310
+ error_msg = None
311
+ except Exception as e:
312
+ raw = ""
313
+ error_msg = str(e)[:80]
314
+ print(f"[DEBUG] T3 LLM error ep={ep_num} step={step}: {e}", file=sys.stderr)
315
+
316
+ try:
317
+ parsed = json.loads(raw)
318
+ at = ActionType(parsed["action"])
319
+ params = parsed.get("params", {})
320
+ except Exception:
321
+ at, params = ActionType.LIST_FUNCTIONS, {}
322
+
323
+ messages.append({"role": "assistant", "content": raw})
324
+ result = env.step(Action(action_type=at, params=params))
325
+ obs = result.observation.model_dump()
326
+ r_val = result.reward.value
327
+ done = result.done
328
+
329
+ step_rewards.append(r_val)
330
+ steps_taken = step
331
+ log_step(step=step, action=at.value, reward=r_val, done=done, error=error_msg)
332
+
333
+ if done:
334
+ v = r_val
335
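+ # submit_function pays +5.0 / +1.5 / -1.5 (see openenv.yaml), hence the 1.0 / 0.3 / 0.0 buckets.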
+ grader_score = 1.0 if v >= 4.9 else (0.3 if v >= 1.0 else 0.0)
336
+ break
337
+
338
+ time.sleep(0.3)
339
+
340
+ finally:
341
+ success = grader_score >= SUCCESS_SCORE_THRESHOLD
342
+ log_end(success=success, steps=steps_taken, score=grader_score, rewards=step_rewards)
343
+
344
+ return {
345
+ "episode": ep_num,
346
+ "seed": seed,
347
+ "contract": obs["contract_name"],
348
+ "grader_score": grader_score,
349
+ "cumulative_reward": obs["cumulative_reward"],
350
+ }
351
+
352
+
353
+ # ─────────────────────────────────────────────────────────────────────────────
354
+ # Task runners
355
+ # ─────────────────────────────────────────────────────────────────────────────
356
+
357
+ def run_task1(n: int = NUM_EPISODES) -> Dict[str, Any]:
358
+ print("\n" + "="*60, flush=True)
359
+ print("TASK 1: Targeted Vulnerability Detection", flush=True)
360
+ print("="*60, flush=True)
361
+ env = Task1Environment()
362
+ episodes = [_run_t1_episode(env, SEED_BASE + i, i + 1) for i in range(n)]
363
+ avg_s = sum(e["grader_score"] for e in episodes) / n
364
+ avg_r = sum(e["cumulative_reward"] for e in episodes) / n
365
+ print(f"\n Avg grader score : {avg_s:.3f}", flush=True)
366
+ print(f" Avg cum reward : {avg_r:.2f}", flush=True)
367
+ return {
368
+ "task_id": "task1_vuln_detection", "name": "Targeted Vulnerability Detection",
369
+ "status": "active", "num_episodes": n, "episodes": episodes,
370
+ "avg_grader_score": avg_s, "avg_cumulative_reward": avg_r,
371
+ }
372
+
373
+
374
+ def run_task2(n: int = NUM_EPISODES) -> Dict[str, Any]:
375
+ print("\n" + "="*60, flush=True)
376
+ print("TASK 2: Property Discovery", flush=True)
377
+ print("="*60, flush=True)
378
+ env = Task2Environment()
379
+ episodes = [_run_t2_episode(env, SEED_BASE + i, i + 1) for i in range(n)]
380
+ avg_s = sum(e["grader_score"] for e in episodes) / n
381
+ avg_r = sum(e["cumulative_reward"] for e in episodes) / n
382
+ print(f"\n Avg grader score : {avg_s:.3f}", flush=True)
383
+ print(f" Avg cum reward : {avg_r:.2f}", flush=True)
384
+ return {
385
+ "task_id": "task2_property_discovery", "name": "Property Discovery",
386
+ "status": "active", "num_episodes": n, "episodes": episodes,
387
+ "avg_grader_score": avg_s, "avg_cumulative_reward": avg_r,
388
+ }
389
 
390
 
391
  def run_task3(n: int = NUM_EPISODES) -> Dict[str, Any]:
392
+ print("\n" + "="*60, flush=True)
393
+ print("TASK 3: Rule Checker", flush=True)
394
+ print("="*60, flush=True)
395
+ env = Task3Environment()
396
+ episodes = [_run_t3_episode(env, SEED_BASE + i, i + 1) for i in range(n)]
397
+ avg_s = sum(e["grader_score"] for e in episodes) / n
398
+ avg_r = sum(e["cumulative_reward"] for e in episodes) / n
399
+ print(f"\n Avg grader score : {avg_s:.3f}", flush=True)
400
+ print(f" Avg cum reward : {avg_r:.2f}", flush=True)
401
+ return {
402
+ "task_id": "task3_rule_checker", "name": "Rule Checker",
403
+ "status": "active", "num_episodes": n, "episodes": episodes,
404
+ "avg_grader_score": avg_s, "avg_cumulative_reward": avg_r,
405
+ }
406
 
407
 
408
  # ─────────────────────────────────────────────────────────────────────────────
409
  # Main
410
  # ─────────────────────────────────────────────────────────────────────────────
411
 
412
+ async def main() -> None:
413
+ """Async entry point (wraps sync env calls; asyncio.run() expected by caller)."""
414
+ print("Smart Contract Audit RL Environment — Baseline Inference", flush=True)
415
+ print(f"Model: {MODEL_NAME} | Base URL: {API_BASE_URL}", flush=True)
416
 
417
  t1 = run_task1(NUM_EPISODES)
418
  t2 = run_task2(NUM_EPISODES)
419
  t3 = run_task3(NUM_EPISODES)
420
 
421
  results = {
422
+ "model": MODEL_NAME,
423
+ "base_url": API_BASE_URL,
424
+ "tasks": [t1, t2, t3],
425
  }
426
+ overall = sum(t["avg_grader_score"] for t in results["tasks"]) / len(results["tasks"])
 
 
427
  results["overall_avg_score"] = overall
428
 
429
+ print("\n" + "="*60, flush=True)
430
+ print("BASELINE SUMMARY", flush=True)
431
+ print("="*60, flush=True)
432
  for t in results["tasks"]:
433
+ print(f" {t['name']:40s}: {t['avg_grader_score']:.3f}", flush=True)
434
+ print(f"\n Overall avg grader score: {overall:.3f}", flush=True)
 
435
 
436
  with open("baseline_scores.json", "w") as f:
437
  json.dump(results, f, indent=2)
438
+ print("\n Scores written to baseline_scores.json", flush=True)
439
 
440
 
441
  if __name__ == "__main__":
442
+ asyncio.run(main())
openenv.yaml CHANGED
@@ -5,7 +5,7 @@ description: >
5
  Agents interact with real-world Solidity contract data from Certora-audited
6
  projects, practising three real audit tasks: vulnerability detection,
7
  property discovery, and rule checking.
8
- author: "SmartAudit Team"
9
  license: MIT
10
 
11
  tasks:
@@ -73,7 +73,7 @@ action_space:
73
  get_function_natspec: {params: {}, reward: -0.08}
74
  get_file_natspec: {params: {}, reward: -0.03}
75
  get_related_functions: {params: {}, reward: -0.06}
76
- get_signature: {params: {}, reward: -0.04}
77
  get_similar_rule: {params: {}, reward: -0.20}
78
  submit_property: {params: {property: string}, reward: "0.0-5.0 keyword-weighted, one attempt"}
79
  task3:
@@ -82,7 +82,7 @@ action_space:
82
  get_function_code: {params: {function_name: string}, reward: -0.10}
83
  get_state_variable: {params: {variable_name: "string opt"}, reward: -0.05}
84
  get_call_graph: {params: {}, reward: -0.08}
85
- get_property_specification: {params: {}, reward: -0.03}
86
  submit_function: {params: {function_name: string}, reward: "+5.0 / +1.5 / -1.5, one attempt"}
87
 
88
  reward:
@@ -135,4 +135,4 @@ interface:
135
  python:
136
  reset: "env.reset(seed=None) -> ResetResult"
137
  step: "env.step(action) -> StepResult"
138
- state: "env.state() -> StateResult"
 
5
  Agents interact with real-world Solidity contract data from Certora-audited
6
  projects, practising three real audit tasks: vulnerability detection,
7
  property discovery, and rule checking.
8
+ author: "Codex47"
9
  license: MIT
10
 
11
  tasks:
 
73
  get_function_natspec: {params: {}, reward: -0.08}
74
  get_file_natspec: {params: {}, reward: -0.03}
75
  get_related_functions: {params: {}, reward: -0.06}
76
+ get_io: {params: {}, reward: -0.04}
77
  get_similar_rule: {params: {}, reward: -0.20}
78
  submit_property: {params: {property: string}, reward: "0.0-5.0 keyword-weighted, one attempt"}
79
  task3:
 
82
  get_function_code: {params: {function_name: string}, reward: -0.10}
83
  get_state_variable: {params: {variable_name: "string opt"}, reward: -0.05}
84
  get_call_graph: {params: {}, reward: -0.08}
85
+ get_formalized_property: {params: {}, reward: -0.03}
86
  submit_function: {params: {function_name: string}, reward: "+5.0 / +1.5 / -1.5, one attempt"}
87
 
88
  reward:
 
135
  python:
136
  reset: "env.reset(seed=None) -> ResetResult"
137
  step: "env.step(action) -> StepResult"
138
+ state: "env.state() -> StateResult"
tasks/task1/environment.py CHANGED
@@ -27,14 +27,7 @@ from __future__ import annotations
27
  import random
28
  from typing import Any, Dict, List, Optional, Set
29
 
30
- from data.data_loader import (
31
- load_contracts,
32
- sample_episode,
33
- get_function_by_name,
34
- get_state_variable_by_name,
35
- list_function_names,
36
- list_state_variable_names,
37
- )
38
  from env.base_env import BaseEnv
39
  from env.schemas import (
40
  Action,
@@ -138,7 +131,7 @@ class Task1Environment(BaseEnv):
138
  return StateResult(
139
  task_id=TASK_ID,
140
  contract_name=self._contract.get("contract_name", ""),
141
- target_function=self._target_fn.get("name"),
142
  step_count=self._step_count,
143
  cumulative_reward=self._cumulative_reward,
144
  done=self._done,
 
27
  import random
28
  from typing import Any, Dict, List, Optional, Set
29
 
30
+ from data.data_loader import load_contracts, sample_episode
 
31
  from env.base_env import BaseEnv
32
  from env.schemas import (
33
  Action,
 
131
  return StateResult(
132
  task_id=TASK_ID,
133
  contract_name=self._contract.get("contract_name", ""),
134
+ target_function=self._target_fn.get("name", ""),
135
  step_count=self._step_count,
136
  cumulative_reward=self._cumulative_reward,
137
  done=self._done,
tasks/task1/grader.py CHANGED
@@ -10,14 +10,6 @@ Deterministic grader. Score range: 0.0 – 1.0
10
  from __future__ import annotations
11
  from typing import Dict
12
  from utils import SemanticMatcher
13
- from data.data_loader import load_vulnerabilities
14
-
15
- def match_vuln_keywords(submitted: str, expected: str) -> bool:
16
- """Checks if the submitted vulnerability type matches the expected one using keyword matching."""
17
- for types in load_vulnerabilities():
18
- if types["vulnerability"] == expected:
19
- return SemanticMatcher().match(types["terms"], submitted)
20
- return False
21
 
22
  class Task1Grader:
23
  def __init__(self, target_function: str, vulnerability_issue: str) -> None:
@@ -27,7 +19,7 @@ class Task1Grader:
27
  def grade_submission(self, submitted_function: str, submitted_vuln_type: str) -> float:
28
  if submitted_function.strip().lower() != self.target_function:
29
  return 0.0
30
- return 1.0 if match_vuln_keywords(submitted_vuln_type, self.vulnerability_issue) else 0.5
31
 
32
  def reward_for_score(self, score: float) -> float:
33
  if score == 1.0: return 5.0
 
10
  from __future__ import annotations
11
  from typing import Dict
12
  from utils import SemanticMatcher
 
13
 
14
  class Task1Grader:
15
  def __init__(self, target_function: str, vulnerability_issue: str) -> None:
 
19
  def grade_submission(self, submitted_function: str, submitted_vuln_type: str) -> float:
20
  if submitted_function.strip().lower() != self.target_function:
21
  return 0.0
22
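+ # Function name must match exactly; the vuln description only decides full vs. partial credit.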
+ return 1.0 if SemanticMatcher().match(self.vulnerability_issue, submitted_vuln_type) else 0.5
23
 
24
  def reward_for_score(self, score: float) -> float:
25
  if score == 1.0: return 5.0
tasks/task2/actions.py CHANGED
@@ -124,7 +124,7 @@ def submit_property(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
124
  score, confidence = ctx._grader.grade(submitted_text)
125
  reward = round(score * 5.0, 4)
126
 
127
- msg = f'Score: {score:.2f}/1.00 → Confidence: {confidence:.2f}\n'
128
  return msg, Reward(
129
  value=reward,
130
  reason=f"Property submission score={score:.3f}",
 
124
  score, confidence = ctx._grader.grade(submitted_text)
125
  reward = round(score * 5.0, 4)
126
 
127
+ msg = f'Score: {score:.2f}/1.00 → Confidence: {confidence}\n'
128
  return msg, Reward(
129
  value=reward,
130
  reason=f"Property submission score={score:.3f}",
tasks/task2/environment.py CHANGED
@@ -134,7 +134,7 @@ class Task2Environment(BaseEnv):
134
  return StateResult(
135
  task_id=TASK_ID,
136
  contract_name=self._contract.get("contract_name", ""),
137
- target_function=self._target_fn.get("name"),
138
  step_count=self._step_count,
139
  cumulative_reward=self._cum_reward,
140
  done=self._done,
 
134
  return StateResult(
135
  task_id=TASK_ID,
136
  contract_name=self._contract.get("contract_name", ""),
137
+ target_function=self._target_fn.get("name", ""),
138
  step_count=self._step_count,
139
  cumulative_reward=self._cum_reward,
140
  done=self._done,
tasks/task3/environment.py CHANGED
@@ -35,7 +35,7 @@ from __future__ import annotations
35
 
36
  import random
37
  from typing import Any, Dict, List, Optional, Set
38
- import actions
39
 
40
  from data.data_loader import load_contracts, sample_task3_episode
41
  from env.base_env import BaseEnv
@@ -83,6 +83,8 @@ class Task3Environment(BaseEnv):
83
  self._seen: Set[str] = set()
84
 
85
  # ── OpenEnv interface ─────────────────────────────────────────────────────
 
 
86
 
87
  def reset(self, seed: Optional[int] = None) -> ResetResult:
88
  if seed is not None:
@@ -94,8 +96,8 @@ class Task3Environment(BaseEnv):
94
  t3 = self._target_fn["task3"]
95
  self._grader = Task3Grader(
96
  target_function=self._target_fn["name"],
97
- partial_credit_functions=t3.get("partial_credit_functions", []),
98
- property_english=t3.get("property_english", ""),
99
  )
100
  self._step_count = 0
101
  self._cum_reward = 0.0
@@ -142,7 +144,7 @@ class Task3Environment(BaseEnv):
142
  return StateResult(
143
  task_id=TASK_ID,
144
  contract_name=self._contract.get("contract_name", ""),
145
- target_function=self._target_fn.get("name"),
146
  step_count=self._step_count,
147
  cumulative_reward=self._cum_reward,
148
  done=self._done,
 
35
 
36
  import random
37
  from typing import Any, Dict, List, Optional, Set
38
+ from tasks.task3 import actions
39
 
40
  from data.data_loader import load_contracts, sample_task3_episode
41
  from env.base_env import BaseEnv
 
83
  self._seen: Set[str] = set()
84
 
85
  # ── OpenEnv interface ─────────────────────────────────────────────────────
86
+
87
+ # ! Need to change a lot here
88
 
89
  def reset(self, seed: Optional[int] = None) -> ResetResult:
90
  if seed is not None:
 
96
  t3 = self._target_fn["task3"]
97
  self._grader = Task3Grader(
98
  target_function=self._target_fn["name"],
99
+ partial_credit_functions=t3.get("partial_credit_functions", []), # ! doesn't exist
100
+ property_english=t3.get("property_english", ""), # ! doesn't exist
101
  )
102
  self._step_count = 0
103
  self._cum_reward = 0.0
 
144
  return StateResult(
145
  task_id=TASK_ID,
146
  contract_name=self._contract.get("contract_name", ""),
147
+ target_function=self._target_fn.get("name", ""),
148
  step_count=self._step_count,
149
  cumulative_reward=self._cum_reward,
150
  done=self._done,
utils/prompts.py CHANGED
@@ -18,7 +18,16 @@ Given a contract, identify the ONE vulnerable function and its vulnerability typ
18
  timestamp dependence, denial of service, unchecked return value
19
  4. Submit when confident
20
 
21
- Respond ONLY with valid JSON. No explanation, no markdown."""
 
22
 
23
  T2_SYSTEM = """You are a formal methods engineer specialising in Solidity smart contracts.
24
 
@@ -36,31 +45,6 @@ A good property covers:
36
  {"action": "get_function_code", "params": {}}
37
  {"action": "get_function_natspec", "params": {}}
38
  {"action": "get_file_natspec", "params": {}}
39
-
40
- def _t3_user_msg(obs: Dict[str, Any]) -> str:
41
- extra = obs.get("extra", {})
42
- return (
43
- f"Contract : {obs['contract_name']}\n"
44
- f"Property : {extra.get('property_english', '(no property)')}\n"
45
- f"Step: {obs['step_count']} | Reward: {obs['cumulative_reward']:.2f}\n\n"
46
- f"Last action: {obs['last_action'] or 'None'}\n"
47
- f"Result:\n{obs['last_action_result'] or 'Episode started.'}"
48
- )
49
-
50
-
51
- def run_t3_episode(env: Task3Environment, seed: int, ep: int) -> Dict[str, Any]:
52
- r = env.reset(seed=seed)
53
- obs = r.observation.model_dump()
54
- prop_preview = obs['extra'].get('property_english', '')[:55]
55
- print(f" ep={ep} seed={seed} {obs['contract_name']} \"{prop_preview}...\"")
56
-
57
- messages = [{"role": "system", "content": T3_SYSTEM}]
58
- grader_score = 0.0
59
- cum_reward = 0.0
60
-
61
- for step in range(15):
62
- messages.append({"role": "user", "content": _t3_user_msg(obs)})
63
-
64
  {"action": "get_related_functions", "params": {}}
65
  {"action": "get_io", "params": {}}
66
  {"action": "get_similar_rule", "params": {}}
@@ -73,7 +57,14 @@ def run_t3_episode(env: Task3Environment, seed: int, ep: int) -> Dict[str, Any]:
73
  - Write 2–4 sentences. Be specific about variable names and amounts.
74
  - Do NOT guess — read the code first.
75
 
76
- Respond ONLY with valid JSON. No markdown, no explanation."""
 
77
 
78
 
79
  T3_SYSTEM = """You are a smart contract security auditor checking rule compliance.
 
18
  timestamp dependence, denial of service, unchecked return value
19
  4. Submit when confident
20
 
21
+ Respond ONLY with valid JSON. No explanation, no markdown.
22
+
23
+ ## Evaluation Strategy:
24
+ Your output vulnerability_type will be compared to ground truth using a deterministic semantic matcher with
25
+ three weighted components:
26
+ - Lexical Jaccard (20%) - overlap of lemmatized, stopword-removed tokens.
27
+ - Synonym Jaccard (25%) - overlap after expanding each word with WordNet synonyms.
28
+ - Semantic cosine (55%) - sentence-embedding similarity (all-MiniLM-L6-v2).
29
+ Match Threshold: score ≥ 0.72 → "match" (partial credit); score ≥ 0.88 → "strong match" (full credit).
30
+ """
31
 
32
  T2_SYSTEM = """You are a formal methods engineer specialising in Solidity smart contracts.
33
 
 
45
  {"action": "get_function_code", "params": {}}
46
  {"action": "get_function_natspec", "params": {}}
47
  {"action": "get_file_natspec", "params": {}}
 
48
  {"action": "get_related_functions", "params": {}}
49
  {"action": "get_io", "params": {}}
50
  {"action": "get_similar_rule", "params": {}}
 
57
  - Write 2–4 sentences. Be specific about variable names and amounts.
58
  - Do NOT guess — read the code first.
59
 
60
+ Respond ONLY with valid JSON. No markdown, no explanation.
61
+
62
+ ## Evaluation Strategy:
63
+ Your submitted property will be compared to ground truth using a deterministic semantic matcher with three weighted components:
64
+ - Lexical Jaccard (20%) - overlap of lemmatized, stopword-removed tokens.
65
+ - Synonym Jaccard (25%) - overlap after expanding each word with WordNet synonyms.
66
+ - Semantic cosine (55%) - sentence-embedding similarity (all-MiniLM-L6-v2).
67
+ Match Threshold: score ≥ 0.72 → "match" (partial credit); score ≥ 0.88 → "strong match" (full credit)."""
68
 
69
 
70
  T3_SYSTEM = """You are a smart contract security auditor checking rule compliance.
utils/propertyretriever.py CHANGED
@@ -10,10 +10,8 @@ import pandas as pd
10
  import numpy as np
11
  from sentence_transformers import SentenceTransformer
12
  from sklearn.preprocessing import normalize
13
- import os
14
 
15
- DATA_DIR = os.path.join(os.path.dirname(__file__))
16
- DEFAULT_CSV_PATH = os.path.join(DATA_DIR, "properties.csv")
17
  SIMILARITY_THRESHOLD = 0.8 # Adjust as needed based on validation
18
 
19
  # -------------------------------------------------------------------
 
10
  import numpy as np
11
  from sentence_transformers import SentenceTransformer
12
  from sklearn.preprocessing import normalize
13
+ from data.data_loader import DEFAULT_CSV_PATH
14
 
15
  SIMILARITY_THRESHOLD = 0.8 # Adjust as needed based on validation
16
 
17
  # -------------------------------------------------------------------
utils/semanticmatcher.py CHANGED
@@ -200,6 +200,7 @@ class SemanticMatcher:
200
  """
201
  # Fast-path: normalized exact match
202
  if normalize(text_a) == normalize(text_b):
 
203
  return True
204
 
205
  tokens_a = tokenize_and_lemmatize(text_a)
 
200
  """
201
  # Fast-path: normalized exact match
202
  if normalize(text_a) == normalize(text_b):
203
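+ # Record the confidence label for callers before the fast-path return.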
+ self.confidence_level = "strong"
204
  return True
205
 
206
  tokens_a = tokenize_and_lemmatize(text_a)
validate.py CHANGED
@@ -77,7 +77,7 @@ def check_t2_env():
77
  assert r.observation.task_id == "task2_property_discovery"
78
  assert "target_function" in r.observation.extra
79
  for at in [ActionType.GET_FUNCTION_CODE, ActionType.GET_FUNCTION_NATSPEC,
80
- ActionType.GET_FILE_NATSPEC, ActionType.GET_IO,
81
  ActionType.GET_RELATED_FUNCTIONS, ActionType.GET_SIMILAR_RULE]:
82
  env.step(Action(action_type=at))
83
 
@@ -180,7 +180,7 @@ def check_t2_grader():
180
  from data.data_loader import load_contracts, get_all_property_entries
181
  for c, fn in get_all_property_entries(load_contracts()):
182
  g = Task2Grader(fn["name"], fn["property"])
183
- assert g.grade(fn["property"]["natural_language"]) >= 0.65
184
  assert g.grade("") == 0.0
185
  s = g.grade("test"); assert s == g.grade("test") # deterministic
186
 
@@ -299,4 +299,4 @@ def main():
299
  sys.exit(0)
300
 
301
  if __name__ == "__main__":
302
- main()
 
77
  assert r.observation.task_id == "task2_property_discovery"
78
  assert "target_function" in r.observation.extra
79
  for at in [ActionType.GET_FUNCTION_CODE, ActionType.GET_FUNCTION_NATSPEC,
80
+ ActionType.GET_FILE_NATSPEC, ActionType.GET_SIGNATURE,
81
  ActionType.GET_RELATED_FUNCTIONS, ActionType.GET_SIMILAR_RULE]:
82
  env.step(Action(action_type=at))
83
 
 
180
  from data.data_loader import load_contracts, get_all_property_entries
181
  for c, fn in get_all_property_entries(load_contracts()):
182
  g = Task2Grader(fn["name"], fn["property"])
183
+ assert g.grade(fn["property"])[0] >= 0.65
184
  assert g.grade("") == 0.0
185
  s = g.grade("test"); assert s == g.grade("test") # deterministic
186
 
 
299
  sys.exit(0)
300
 
301
  if __name__ == "__main__":
302
+ main()