Spaces:
Running
Running
ajaxwin committed on
Commit ·
41a051f
1
Parent(s): 48661cd
refactor: Reward clamping in graders
Browse files
fixed schemas.py bug
improved prompts
- env/schemas.py +0 -4
- server/tasks/task1/grader.py +5 -1
- server/tasks/task2/grader.py +4 -1
- server/tasks/task3/actions.py +7 -7
- server/tasks/task3/environment.py +4 -4
- server/tasks/task3/grader.py +4 -1
- utils/prompts.py +110 -104
env/schemas.py
CHANGED
|
@@ -50,10 +50,6 @@ class ActionType(str, Enum):
|
|
| 50 |
GET_PROPERTY_SPECIFICATION = ("get_property_specification", 0.02)
|
| 51 |
GET_FUNCTION_METADATA = ("get_function_metadata", 0.04)
|
| 52 |
SUBMIT_FUNCTION = ("submit_function", 0.0)
|
| 53 |
-
GET_FUNCTION_CODE3 = ("get_function_code", 0.05)
|
| 54 |
-
GET_STATE_VARIABLE3 = ("get_state_variable", 0.04)
|
| 55 |
-
GET_CALL_GRAPH3 = ("get_call_graph", 0.08)
|
| 56 |
-
LIST_FUNCTIONS3 = ("get_list_function", 0.02)
|
| 57 |
|
| 58 |
# ─────── General Actions ─────────────────────────────────────────────────
|
| 59 |
UNKNOWN = ("unknown", 0.0)
|
|
|
|
| 50 |
GET_PROPERTY_SPECIFICATION = ("get_property_specification", 0.02)
|
| 51 |
GET_FUNCTION_METADATA = ("get_function_metadata", 0.04)
|
| 52 |
SUBMIT_FUNCTION = ("submit_function", 0.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
# ─────── General Actions ─────────────────────────────────────────────────
|
| 55 |
UNKNOWN = ("unknown", 0.0)
|
server/tasks/task1/grader.py
CHANGED
|
@@ -16,6 +16,9 @@ class Task1Grader:
|
|
| 16 |
# Log of No. of functions (n) is a heuristic used to decide the size of contract code
|
| 17 |
self.n = n
|
| 18 |
self._decay = 0.75
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
def grade(self, submitted_function: str, submitted_vuln_type: str, steps: int, cummulative_cost: int) -> float:
|
| 21 |
"""Returns grade strictly in (0, 1)."""
|
|
@@ -24,7 +27,8 @@ class Task1Grader:
|
|
| 24 |
|
| 25 |
# Score formula
|
| 26 |
free_budget = (cummulative_cost / steps) * (self.n + 2)
|
| 27 |
-
|
|
|
|
| 28 |
|
| 29 |
def get_canonical_answer(self) -> Dict[str, str]:
|
| 30 |
return {"function": self.target_function, "vulnerability": self.vulnerability_issue}
|
|
|
|
| 16 |
# Log of No. of functions (n) is a heuristic used to decide the size of contract code
|
| 17 |
self.n = n
|
| 18 |
self._decay = 0.75
|
| 19 |
+
|
| 20 |
+
def _clamp(self, reward: float) -> float:
|
| 21 |
+
return max(0.001, min(0.999, reward))
|
| 22 |
|
| 23 |
def grade(self, submitted_function: str, submitted_vuln_type: str, steps: int, cummulative_cost: int) -> float:
|
| 24 |
"""Returns grade strictly in (0, 1)."""
|
|
|
|
| 27 |
|
| 28 |
# Score formula
|
| 29 |
free_budget = (cummulative_cost / steps) * (self.n + 2)
|
| 30 |
+
reward = func_match * issue_match * (self._decay ** max(0, cummulative_cost - free_budget))
|
| 31 |
+
return self._clamp(reward)
|
| 32 |
|
| 33 |
def get_canonical_answer(self) -> Dict[str, str]:
|
| 34 |
return {"function": self.target_function, "vulnerability": self.vulnerability_issue}
|
server/tasks/task2/grader.py
CHANGED
|
@@ -24,6 +24,9 @@ class Task2Grader:
|
|
| 24 |
self.property = property
|
| 25 |
self.n = n
|
| 26 |
self._decay = 0.75
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
def grade(self, submitted: str, steps: int, cummulative_cost: int) -> Tuple[float, str]:
|
| 29 |
"""Deterministic grade strictly in (0, 1)."""
|
|
@@ -35,4 +38,4 @@ class Task2Grader:
|
|
| 35 |
free_budget = (cummulative_cost / steps) * (self.n + 2)
|
| 36 |
final_score = match_score * (self._decay ** max(0, cummulative_cost - free_budget))
|
| 37 |
|
| 38 |
-
return final_score, matcher.confidence()
|
|
|
|
| 24 |
self.property = property
|
| 25 |
self.n = n
|
| 26 |
self._decay = 0.75
|
| 27 |
+
|
| 28 |
+
def _clamp(self, reward: float) -> float:
|
| 29 |
+
return max(0.001, min(0.999, reward))
|
| 30 |
|
| 31 |
def grade(self, submitted: str, steps: int, cummulative_cost: int) -> Tuple[float, str]:
|
| 32 |
"""Deterministic grade strictly in (0, 1)."""
|
|
|
|
| 38 |
free_budget = (cummulative_cost / steps) * (self.n + 2)
|
| 39 |
final_score = match_score * (self._decay ** max(0, cummulative_cost - free_budget))
|
| 40 |
|
| 41 |
+
return self._clamp(final_score), matcher.confidence()
|
server/tasks/task3/actions.py
CHANGED
|
@@ -19,7 +19,7 @@ def list_functions(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
|
| 19 |
names = list_function_names(ctx._contract)
|
| 20 |
return (
|
| 21 |
f"Functions in {ctx._contract['contract_name']}: {', '.join(names)}",
|
| 22 |
-
Reward(value=ActionType.
|
| 23 |
)
|
| 24 |
|
| 25 |
|
|
@@ -64,13 +64,13 @@ def get_function_code(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
|
| 64 |
return (
|
| 65 |
f"Function '{fn_name}' not found. "
|
| 66 |
f"Available: {list_function_names(ctx._contract)}",
|
| 67 |
-
Reward(value=ActionType.
|
| 68 |
)
|
| 69 |
|
| 70 |
code = fn.get("code", "// no code available")
|
| 71 |
return (
|
| 72 |
f"// {fn_name}\n{code}",
|
| 73 |
-
Reward(value=ActionType.
|
| 74 |
)
|
| 75 |
|
| 76 |
|
|
@@ -84,18 +84,18 @@ def get_state_variable(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
|
| 84 |
names = list_state_variable_names(ctx._contract)
|
| 85 |
return (
|
| 86 |
f"State variables: {', '.join(names)}",
|
| 87 |
-
Reward(value=ActionType.
|
| 88 |
)
|
| 89 |
|
| 90 |
sv = get_state_variable_by_name(ctx._contract, var_name)
|
| 91 |
if sv is None:
|
| 92 |
return (
|
| 93 |
f"Variable '{var_name}' not found.",
|
| 94 |
-
Reward(value=ActionType.
|
| 95 |
)
|
| 96 |
return (
|
| 97 |
f"{sv['type']} {sv['visibility']} {sv['name']}: {sv.get('description','')}",
|
| 98 |
-
Reward(value=ActionType.
|
| 99 |
)
|
| 100 |
|
| 101 |
|
|
@@ -109,7 +109,7 @@ def get_call_graph(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
|
| 109 |
)
|
| 110 |
return (
|
| 111 |
f"Call graph: {cg_str}",
|
| 112 |
-
Reward(value=ActionType.
|
| 113 |
)
|
| 114 |
|
| 115 |
def get_property_specification(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
|
|
|
| 19 |
names = list_function_names(ctx._contract)
|
| 20 |
return (
|
| 21 |
f"Functions in {ctx._contract['contract_name']}: {', '.join(names)}",
|
| 22 |
+
Reward(value=ActionType.LIST_FUNCTIONS.cost, reason="list_functions cost"),
|
| 23 |
)
|
| 24 |
|
| 25 |
|
|
|
|
| 64 |
return (
|
| 65 |
f"Function '{fn_name}' not found. "
|
| 66 |
f"Available: {list_function_names(ctx._contract)}",
|
| 67 |
+
Reward(value=ActionType.GET_FUNCTION_CODE.cost, reason="Unknown function — extra penalty"),
|
| 68 |
)
|
| 69 |
|
| 70 |
code = fn.get("code", "// no code available")
|
| 71 |
return (
|
| 72 |
f"// {fn_name}\n{code}",
|
| 73 |
+
Reward(value=ActionType.GET_FUNCTION_CODE.cost, reason="get_function_code cost"),
|
| 74 |
)
|
| 75 |
|
| 76 |
|
|
|
|
| 84 |
names = list_state_variable_names(ctx._contract)
|
| 85 |
return (
|
| 86 |
f"State variables: {', '.join(names)}",
|
| 87 |
+
Reward(value=ActionType.GET_STATE_VARIABLE.cost, reason="Listed state variables"),
|
| 88 |
)
|
| 89 |
|
| 90 |
sv = get_state_variable_by_name(ctx._contract, var_name)
|
| 91 |
if sv is None:
|
| 92 |
return (
|
| 93 |
f"Variable '{var_name}' not found.",
|
| 94 |
+
Reward(value=ActionType.GET_STATE_VARIABLE.cost, reason="Unknown state variable"),
|
| 95 |
)
|
| 96 |
return (
|
| 97 |
f"{sv['type']} {sv['visibility']} {sv['name']}: {sv.get('description','')}",
|
| 98 |
+
Reward(value=ActionType.GET_STATE_VARIABLE.cost, reason="get_state_variable cost"),
|
| 99 |
)
|
| 100 |
|
| 101 |
|
|
|
|
| 109 |
)
|
| 110 |
return (
|
| 111 |
f"Call graph: {cg_str}",
|
| 112 |
+
Reward(value=ActionType.GET_CALL_GRAPH.cost, reason="get_call_graph cost"),
|
| 113 |
)
|
| 114 |
|
| 115 |
def get_property_specification(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
server/tasks/task3/environment.py
CHANGED
|
@@ -184,11 +184,11 @@ class Task3Environment(BaseEnv):
|
|
| 184 |
|
| 185 |
# Mapping from ActionType to handler function
|
| 186 |
handlers = {
|
| 187 |
-
ActionType.
|
| 188 |
ActionType.GET_FUNCTION_METADATA: actions.get_function_metadata,
|
| 189 |
-
ActionType.
|
| 190 |
-
ActionType.
|
| 191 |
-
ActionType.
|
| 192 |
ActionType.GET_PROPERTY_SPECIFICATION: actions.get_property_specification,
|
| 193 |
ActionType.SUBMIT_FUNCTION: actions.submit_function,
|
| 194 |
}
|
|
|
|
| 184 |
|
| 185 |
# Mapping from ActionType to handler function
|
| 186 |
handlers = {
|
| 187 |
+
ActionType.LIST_FUNCTIONS: actions.list_functions,
|
| 188 |
ActionType.GET_FUNCTION_METADATA: actions.get_function_metadata,
|
| 189 |
+
ActionType.GET_FUNCTION_CODE: actions.get_function_code,
|
| 190 |
+
ActionType.GET_STATE_VARIABLE: actions.get_state_variable,
|
| 191 |
+
ActionType.GET_CALL_GRAPH: actions.get_call_graph,
|
| 192 |
ActionType.GET_PROPERTY_SPECIFICATION: actions.get_property_specification,
|
| 193 |
ActionType.SUBMIT_FUNCTION: actions.submit_function,
|
| 194 |
}
|
server/tasks/task3/grader.py
CHANGED
|
@@ -34,6 +34,9 @@ class Task3Grader:
|
|
| 34 |
self.property_specification = property_specification
|
| 35 |
self.max_steps = max_steps
|
| 36 |
self._decay = 0.01
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
def grade(self, submitted_function: str, steps: int, cummulative_cost: int) -> float:
|
| 39 |
"""Returns deterministic grade strictly in (0, 1)."""
|
|
@@ -46,7 +49,7 @@ class Task3Grader:
|
|
| 46 |
reward = self.REWARD_PARTIAL
|
| 47 |
|
| 48 |
penalty = self._decay ** (-(steps * cummulative_cost) / self.max_steps)
|
| 49 |
-
return reward * penalty
|
| 50 |
|
| 51 |
def get_canonical_answer(self) -> Dict[str, Dict | str]:
|
| 52 |
"""For debugging / logging only — do not expose to the agent."""
|
|
|
|
| 34 |
self.property_specification = property_specification
|
| 35 |
self.max_steps = max_steps
|
| 36 |
self._decay = 0.01
|
| 37 |
+
|
| 38 |
+
def _clamp(self, reward: float) -> float:
|
| 39 |
+
return max(0.001, min(0.999, reward))
|
| 40 |
|
| 41 |
def grade(self, submitted_function: str, steps: int, cummulative_cost: int) -> float:
|
| 42 |
"""Returns deterministic grade strictly in (0, 1)."""
|
|
|
|
| 49 |
reward = self.REWARD_PARTIAL
|
| 50 |
|
| 51 |
penalty = self._decay ** (-(steps * cummulative_cost) / self.max_steps)
|
| 52 |
+
return self._clamp(reward * penalty)
|
| 53 |
|
| 54 |
def get_canonical_answer(self) -> Dict[str, Dict | str]:
|
| 55 |
"""For debugging / logging only — do not expose to the agent."""
|
utils/prompts.py
CHANGED
|
@@ -1,106 +1,112 @@
|
|
| 1 |
-
T1_SYSTEM = """You are
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
{"action":
|
| 11 |
-
{"action":
|
| 12 |
-
{"action":
|
| 13 |
-
{"action":
|
| 14 |
-
{"action":
|
| 15 |
-
{"action":
|
| 16 |
-
{"action":
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
1.
|
| 20 |
-
2.
|
| 21 |
-
3.
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
-
|
| 31 |
-
-
|
| 32 |
-
-
|
| 33 |
-
|
|
|
|
|
|
|
| 34 |
"""
|
| 35 |
|
| 36 |
-
T2_SYSTEM = """You are a formal methods engineer
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
-
|
| 63 |
-
-
|
| 64 |
-
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
{"action":
|
| 89 |
-
{"action":
|
| 90 |
-
{"action":
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
T1_SYSTEM = """You are a Solidity security auditor.
|
| 2 |
+
|
| 3 |
+
Goal: Identify exactly ONE vulnerable function and its vulnerability type.
|
| 4 |
+
|
| 5 |
+
Constraints:
|
| 6 |
+
- Each action has a cost → minimize steps.
|
| 7 |
+
- Prefer high-signal queries.
|
| 8 |
+
|
| 9 |
+
Available actions (ONE per turn, JSON only):
|
| 10 |
+
{"action":"list_functions","params":{}}
|
| 11 |
+
{"action":"get_function_code","params":{"function_name":"<name>"}}
|
| 12 |
+
{"action":"get_function_summary","params":{"function_name":"<name>"}}
|
| 13 |
+
{"action":"get_file_metadata","params":{}}
|
| 14 |
+
{"action":"get_state_variable","params":{"variable_name":"<name>"}}
|
| 15 |
+
{"action":"get_call_graph","params":{}}
|
| 16 |
+
{"action":"submit","params":{"function_name":"<name>","vulnerability_type":"<2-3 words>"}}
|
| 17 |
+
|
| 18 |
+
Heuristic:
|
| 19 |
+
1. Start: list_functions
|
| 20 |
+
2. Prioritize critical functions: withdraw, transfer, claim, stake, buy, bid, finalize, set*
|
| 21 |
+
3. Use summaries first; fetch full code only if needed
|
| 22 |
+
4. Inspect state/call graph only if hypothesis requires it
|
| 23 |
+
|
| 24 |
+
Common vulnerabilities in contracts:
|
| 25 |
+
- reentrancy
|
| 26 |
+
- access control
|
| 27 |
+
- integer overflow/underflow
|
| 28 |
+
- unchecked external call
|
| 29 |
+
- tx.origin misuse
|
| 30 |
+
- front-running
|
| 31 |
+
- timestamp dependence
|
| 32 |
+
- denial of service
|
| 33 |
+
|
| 34 |
+
Submit immediately once confident.
|
| 35 |
+
Output: JSON only. No text.
|
| 36 |
"""
|
| 37 |
|
| 38 |
+
T2_SYSTEM = """You are a Solidity formal methods engineer.
|
| 39 |
+
|
| 40 |
+
Goal: Write ONE precise natural-language property (postcondition/invariant) for the given function.
|
| 41 |
+
|
| 42 |
+
Constraints:
|
| 43 |
+
- Actions have cost → minimize steps.
|
| 44 |
+
- ONE submit attempt only.
|
| 45 |
+
|
| 46 |
+
Actions (ONE per turn, JSON only):
|
| 47 |
+
{"action":"get_function_code","params":{}}
|
| 48 |
+
{"action":"get_function_natspec","params":{}}
|
| 49 |
+
{"action":"get_file_natspec","params":{}}
|
| 50 |
+
{"action":"get_related_functions","params":{}}
|
| 51 |
+
{"action":"get_signature","params":{}}
|
| 52 |
+
{"action":"get_similar_rule","params":{}}
|
| 53 |
+
{"action":"submit_property","params":{"property":"<text>"}}
|
| 54 |
+
|
| 55 |
+
Strategy:
|
| 56 |
+
1. Start with get_signature + get_function_natspec
|
| 57 |
+
2. Fetch code if behavior unclear
|
| 58 |
+
3. Use related/state context only if needed
|
| 59 |
+
4. Use similar_rule sparingly (high cost)
|
| 60 |
+
|
| 61 |
+
Example Property requirements:
|
| 62 |
+
- Describe exact state changes (variables, balances, mappings)
|
| 63 |
+
- Specify asset transfers (ETH/tokens/NFTs) with amounts
|
| 64 |
+
- Include return values (if any)
|
| 65 |
+
- State revert conditions (if relevant)
|
| 66 |
+
- Use concrete variable names (no vague terms)
|
| 67 |
+
|
| 68 |
+
Format:
|
| 69 |
+
- 2–4 sentences
|
| 70 |
+
- Deterministic, testable, no speculation
|
| 71 |
+
|
| 72 |
+
Submit immediately once confident.
|
| 73 |
+
|
| 74 |
+
Output: JSON only.
|
| 75 |
+
"""
|
| 76 |
+
|
| 77 |
+
T3_SYSTEM = """You are a Solidity security auditor.
|
| 78 |
+
|
| 79 |
+
Goal: Identify ONE function that violates the given property.
|
| 80 |
+
|
| 81 |
+
Constraints:
|
| 82 |
+
- Actions have cost → minimize steps
|
| 83 |
+
- ONE submit attempt only
|
| 84 |
+
|
| 85 |
+
Actions (ONE per turn, JSON only):
|
| 86 |
+
{"action":"list_functions","params":{}}
|
| 87 |
+
{"action":"get_property_specification","params":{}}
|
| 88 |
+
{"action":"get_function_metadata","params":{"function_name":"<n>"}}
|
| 89 |
+
{"action":"get_function_code","params":{"function_name":"<n>"}}
|
| 90 |
+
{"action":"get_state_variable","params":{"variable_name":"<n>"}}
|
| 91 |
+
{"action":"get_call_graph","params":{}}
|
| 92 |
+
{"action":"submit_function","params":{"function_name":"<n>"}}
|
| 93 |
+
|
| 94 |
+
Strategy:
|
| 95 |
+
1. Read property → extract required guarantees (state, access, ordering)
|
| 96 |
+
2. list_functions to identify candidates
|
| 97 |
+
3. Use property_specification for precise constraints (cheap)
|
| 98 |
+
4. Inspect 1–2 likely violators via metadata → code
|
| 99 |
+
5. Use state/call graph only if violation depends on context
|
| 100 |
+
|
| 101 |
+
Example Violation heuristics:
|
| 102 |
+
- Missing/incorrect require conditions
|
| 103 |
+
- Access control mismatch
|
| 104 |
+
- Incorrect state updates or ordering
|
| 105 |
+
- Unsafe external calls (reentrancy)
|
| 106 |
+
- Violated invariants (balances, totals, limits)
|
| 107 |
+
|
| 108 |
+
Select the function that clearly breaks the property.
|
| 109 |
+
Submit immediately once confident.
|
| 110 |
+
|
| 111 |
+
Output: JSON only.
|
| 112 |
+
"""
|