Spaces:
Running
Running
ajaxwin committed on
Commit ·
f78cba2
1
Parent(s): 7f7bcc6
refactor: Improved grading logic for task 2
Browse files- agents/task2.py +5 -13
- data/contracts.json +2 -2
- env/schemas.py +8 -8
- eval.py +5 -5
- server/tasks/task1/environment.py +0 -2
- server/tasks/task2/actions.py +22 -47
- server/tasks/task2/environment.py +6 -8
- server/tasks/task2/grader.py +10 -11
agents/task2.py
CHANGED
|
@@ -30,11 +30,8 @@ def oracle_t2(env: Task2Environment, seed: int, verbose: bool = False) -> Dict[s
|
|
| 30 |
env.step(Action(action_type=ActionType.GET_FUNCTION_CODE))
|
| 31 |
result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
|
| 32 |
params={"property": gt_text}))
|
| 33 |
-
r_val = result.reward.value
|
| 34 |
-
score = round(r_val / 5.0, 4) if r_val > 0 else 0.0
|
| 35 |
return {"seed": seed, "contract": contract, "function": fn_name,
|
| 36 |
-
"grader_score":
|
| 37 |
-
"cumulative_reward": result.observation.cumulative_reward}
|
| 38 |
|
| 39 |
|
| 40 |
def partial_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
|
|
@@ -51,9 +48,7 @@ def partial_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
|
|
| 51 |
break
|
| 52 |
result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
|
| 53 |
params={"property": comment}))
|
| 54 |
-
|
| 55 |
-
return {"seed": seed, "grader_score": round(r_val / 5.0, 4) if r_val > 0 else 0.0,
|
| 56 |
-
"cumulative_reward": result.observation.cumulative_reward}
|
| 57 |
|
| 58 |
|
| 59 |
def random_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
|
|
@@ -90,10 +85,8 @@ def random_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
|
|
| 90 |
prop = rng.choice(templates)
|
| 91 |
result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
|
| 92 |
params={"property": prop}))
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
"submitted": prop[:60],
|
| 96 |
-
"cumulative_reward": result.observation.cumulative_reward}
|
| 97 |
|
| 98 |
|
| 99 |
def floor_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
|
|
@@ -101,5 +94,4 @@ def floor_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
|
|
| 101 |
env.reset(seed=seed)
|
| 102 |
result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
|
| 103 |
params={"property": ""}))
|
| 104 |
-
return {"seed": seed, "grader_score": 0.
|
| 105 |
-
"cumulative_reward": result.observation.cumulative_reward}
|
|
|
|
| 30 |
env.step(Action(action_type=ActionType.GET_FUNCTION_CODE))
|
| 31 |
result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
|
| 32 |
params={"property": gt_text}))
|
|
|
|
|
|
|
| 33 |
return {"seed": seed, "contract": contract, "function": fn_name,
|
| 34 |
+
"grader_score": result.reward.value }
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
def partial_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
|
|
|
|
| 48 |
break
|
| 49 |
result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
|
| 50 |
params={"property": comment}))
|
| 51 |
+
return {"seed": seed, "grader_score": result.reward.value}
|
|
|
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
def random_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
|
|
|
|
| 85 |
prop = rng.choice(templates)
|
| 86 |
result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
|
| 87 |
params={"property": prop}))
|
| 88 |
+
return {"seed": seed, "grader_score": result.reward.value,
|
| 89 |
+
"submitted": prop[:60]}
|
|
|
|
|
|
|
| 90 |
|
| 91 |
|
| 92 |
def floor_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
|
|
|
|
| 94 |
env.reset(seed=seed)
|
| 95 |
result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
|
| 96 |
params={"property": ""}))
|
| 97 |
+
return {"seed": seed, "grader_score": 0.001}
|
|
|
data/contracts.json
CHANGED
|
@@ -227,7 +227,7 @@
|
|
| 227 |
"property_specification": {
|
| 228 |
"precondition": "User has AToken balance B",
|
| 229 |
"operation": "burn(user, receiver, amount, index)",
|
| 230 |
-
"
|
| 231 |
"actual": "When amount.rayDiv(index) rounds down to 0, the burn operation transfers amount underlying tokens but burns 0 ATokens, resulting in user AToken balance unchanged = B, violating the postcondition where the balance should be B - amount."
|
| 232 |
}
|
| 233 |
},
|
|
@@ -934,7 +934,7 @@
|
|
| 934 |
"property_specification": {
|
| 935 |
"precondition": "User has debt balance B",
|
| 936 |
"operation": "mint(user, onBehalfOf, amount, rate)",
|
| 937 |
-
"
|
| 938 |
"actual": "When amount conversion rounds down to 0 in intermediate calculations, the mint operation may mint zero debt tokens while still transferring underlying tokens (or vice versa), resulting in user debt balance unchanged = B, violating the postcondition where the balance should be B + amount."
|
| 939 |
}
|
| 940 |
},
|
|
|
|
| 227 |
"property_specification": {
|
| 228 |
"precondition": "User has AToken balance B",
|
| 229 |
"operation": "burn(user, receiver, amount, index)",
|
| 230 |
+
"postcondition": "User's AToken balance = B - amount (within rounding tolerance ε)",
|
| 231 |
"actual": "When amount.rayDiv(index) rounds down to 0, the burn operation transfers amount underlying tokens but burns 0 ATokens, resulting in user AToken balance unchanged = B, violating the postcondition where the balance should be B - amount."
|
| 232 |
}
|
| 233 |
},
|
|
|
|
| 934 |
"property_specification": {
|
| 935 |
"precondition": "User has debt balance B",
|
| 936 |
"operation": "mint(user, onBehalfOf, amount, rate)",
|
| 937 |
+
"postcondition": "User's debt balance = B + amount (within rounding tolerance ε)",
|
| 938 |
"actual": "When amount conversion rounds down to 0 in intermediate calculations, the mint operation may mint zero debt tokens while still transferring underlying tokens (or vice versa), resulting in user debt balance unchanged = B, violating the postcondition where the balance should be B + amount."
|
| 939 |
}
|
| 940 |
},
|
env/schemas.py
CHANGED
|
@@ -39,17 +39,17 @@ class ActionType(str, Enum):
|
|
| 39 |
SUBMIT = ("submit", 0.0)
|
| 40 |
|
| 41 |
# ── Task 2 – Property Discovery ─────────────────────────────────────────
|
| 42 |
-
GET_SIMILAR_RULE = ("get_similar_rule", 0.
|
| 43 |
-
GET_FILE_NATSPEC = ("get_file_natspec", 0.
|
| 44 |
-
GET_FUNCTION_NATSPEC = ("get_function_natspec", 0.
|
| 45 |
-
GET_RELATED_FUNCTIONS = ("get_related_functions", 0.
|
| 46 |
-
GET_SIGNATURE = ("get_signature", 0.
|
| 47 |
SUBMIT_PROPERTY = ("submit_property", 0.0)
|
| 48 |
|
| 49 |
# ── Task 3 – Rule Checker ────────────────────────────────────────────────
|
| 50 |
-
GET_PROPERTY_SPECIFICATION
|
| 51 |
-
GET_FUNCTION_METADATA
|
| 52 |
-
SUBMIT_FUNCTION
|
| 53 |
|
| 54 |
# ─────── General Actions ─────────────────────────────────────────────────
|
| 55 |
UNKNOWN = ("unknown", 0.0)
|
|
|
|
| 39 |
SUBMIT = ("submit", 0.0)
|
| 40 |
|
| 41 |
# ── Task 2 – Property Discovery ─────────────────────────────────────────
|
| 42 |
+
GET_SIMILAR_RULE = ("get_similar_rule", 0.15)
|
| 43 |
+
GET_FILE_NATSPEC = ("get_file_natspec", 0.05)
|
| 44 |
+
GET_FUNCTION_NATSPEC = ("get_function_natspec", -0.08)
|
| 45 |
+
GET_RELATED_FUNCTIONS = ("get_related_functions", 0.07)
|
| 46 |
+
GET_SIGNATURE = ("get_signature", 0.04)
|
| 47 |
SUBMIT_PROPERTY = ("submit_property", 0.0)
|
| 48 |
|
| 49 |
# ── Task 3 – Rule Checker ────────────────────────────────────────────────
|
| 50 |
+
GET_PROPERTY_SPECIFICATION = ("get_property_specification", 0.0)
|
| 51 |
+
GET_FUNCTION_METADATA = ("get_function_metadata", 0.0)
|
| 52 |
+
SUBMIT_FUNCTION = ("submit_function", 0.0)
|
| 53 |
|
| 54 |
# ─────── General Actions ─────────────────────────────────────────────────
|
| 55 |
UNKNOWN = ("unknown", 0.0)
|
eval.py
CHANGED
|
@@ -121,7 +121,7 @@ def run_task2_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
|
|
| 121 |
oracle_eps.append(ep)
|
| 122 |
icon = "✅" if ep["grader_score"] >= 0.65 else "⚠️ "
|
| 123 |
print(f" {icon} seed={ep['seed']:3d} {ep['contract']:12s}.{ep['function']:18s}"
|
| 124 |
-
f" score={ep['grader_score']:.3f}
|
| 125 |
oracle_avg = _avg(oracle_eps)
|
| 126 |
print(f"\n Oracle avg: {oracle_avg:.3f}")
|
| 127 |
|
|
@@ -143,10 +143,10 @@ def run_task2_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
|
|
| 143 |
floor_avg = _avg(floor_eps)
|
| 144 |
print(f" Floor avg: {floor_avg:.3f}")
|
| 145 |
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
print(f"\n ✅ Task 2: oracle({oracle_avg:.3f}) > partial({partial_avg:.3f})"
|
| 151 |
f" ≥ random({random_avg:.3f}) ≥ floor(0.0)")
|
| 152 |
|
|
|
|
| 121 |
oracle_eps.append(ep)
|
| 122 |
icon = "✅" if ep["grader_score"] >= 0.65 else "⚠️ "
|
| 123 |
print(f" {icon} seed={ep['seed']:3d} {ep['contract']:12s}.{ep['function']:18s}"
|
| 124 |
+
f" score={ep['grader_score']:.3f}")
|
| 125 |
oracle_avg = _avg(oracle_eps)
|
| 126 |
print(f"\n Oracle avg: {oracle_avg:.3f}")
|
| 127 |
|
|
|
|
| 143 |
floor_avg = _avg(floor_eps)
|
| 144 |
print(f" Floor avg: {floor_avg:.3f}")
|
| 145 |
|
| 146 |
+
assert oracle_avg > 0.60, f"Oracle avg {oracle_avg:.3f} should be > 0.60"
|
| 147 |
+
assert oracle_avg > partial_avg >= floor_avg, \
|
| 148 |
+
"Score ordering violated: oracle > partial >= floor"
|
| 149 |
+
assert floor_avg == 0.001, f"Floor avg {floor_avg:.3f} should be 0.001"
|
| 150 |
print(f"\n ✅ Task 2: oracle({oracle_avg:.3f}) > partial({partial_avg:.3f})"
|
| 151 |
f" ≥ random({random_avg:.3f}) ≥ floor(0.0)")
|
| 152 |
|
server/tasks/task1/environment.py
CHANGED
|
@@ -61,8 +61,6 @@ class Task1Environment(BaseEnv):
|
|
| 61 |
self._done: bool = False
|
| 62 |
self._query_history: List[str] = []
|
| 63 |
self._seen_queries: Set[str] = set()
|
| 64 |
-
self._cost_free_steps: int = 0
|
| 65 |
-
self._decay: float = 0.0
|
| 66 |
|
| 67 |
# ------------------------------------------------------------------
|
| 68 |
# OpenEnv interface
|
|
|
|
| 61 |
self._done: bool = False
|
| 62 |
self._query_history: List[str] = []
|
| 63 |
self._seen_queries: Set[str] = set()
|
|
|
|
|
|
|
| 64 |
|
| 65 |
# ------------------------------------------------------------------
|
| 66 |
# OpenEnv interface
|
server/tasks/task2/actions.py
CHANGED
|
@@ -19,13 +19,13 @@ PropertyRetrieverInstance = PropertyRetriever() # Load once at module level
|
|
| 19 |
def get_function_code(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
| 20 |
"""Handle GET_FUNCTION_CODE action."""
|
| 21 |
if ctx._is_repeated(qkey):
|
| 22 |
-
return "Repeated query.", Reward(value=
|
| 23 |
|
| 24 |
fn = ctx._target_fn
|
| 25 |
code = fn.get("code", "// no code available")
|
| 26 |
return (
|
| 27 |
code,
|
| 28 |
-
Reward(value=
|
| 29 |
)
|
| 30 |
|
| 31 |
# TODO: Can separate comment and output_property(output_comment)
|
|
@@ -33,7 +33,7 @@ def get_function_code(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
|
| 33 |
def get_function_natspec(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
| 34 |
"""Handle GET_FUNCTION_NATSPEC action."""
|
| 35 |
if ctx._is_repeated(qkey):
|
| 36 |
-
return "Repeated query.", Reward(value=
|
| 37 |
|
| 38 |
fn = ctx._target_fn
|
| 39 |
name = fn["name"]
|
|
@@ -42,26 +42,26 @@ def get_function_natspec(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward
|
|
| 42 |
result = f"NatSpec for '{name}':\n{natspec}"
|
| 43 |
if out_prop:
|
| 44 |
result += f"\n\nExpected output: {out_prop}"
|
| 45 |
-
return result, Reward(value=
|
| 46 |
|
| 47 |
|
| 48 |
def get_file_natspec(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
| 49 |
"""Handle GET_FILE_NATSPEC action."""
|
| 50 |
if ctx._is_repeated(qkey):
|
| 51 |
-
return "Repeated query.", Reward(value=
|
| 52 |
|
| 53 |
meta = ctx._contract.get("metadata", {})
|
| 54 |
natspec = meta.get("natspec") or meta.get("description", "No file NatSpec available.")
|
| 55 |
return (
|
| 56 |
f"File NatSpec for {ctx._contract['contract_name']}:\n{natspec}",
|
| 57 |
-
Reward(value=
|
| 58 |
)
|
| 59 |
|
| 60 |
|
| 61 |
def get_related_functions_action(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
| 62 |
"""Handle GET_RELATED_FUNCTIONS action."""
|
| 63 |
if ctx._is_repeated(qkey):
|
| 64 |
-
return "Repeated query.", Reward(value=
|
| 65 |
|
| 66 |
name = ctx._target_fn["name"]
|
| 67 |
related = get_related_functions(ctx._contract, name)
|
|
@@ -76,23 +76,23 @@ def get_related_functions_action(ctx: Any, qkey: str, params: Dict) -> Tuple[str
|
|
| 76 |
comment = rfn.get("comment", "")
|
| 77 |
summaries.append(f" • {sig} — {comment}")
|
| 78 |
text = f"Related functions for '{name}':\n" + "\n".join(summaries)
|
| 79 |
-
return text, Reward(value=
|
| 80 |
|
| 81 |
|
| 82 |
def get_signature(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
| 83 |
"""Handle GET_SIGNATURE action."""
|
| 84 |
if ctx._is_repeated(qkey):
|
| 85 |
-
return "Repeated query.", Reward(value=
|
| 86 |
|
| 87 |
fn = ctx._target_fn
|
| 88 |
sig = fn.get("signature")
|
| 89 |
-
return sig, Reward(value=
|
| 90 |
|
| 91 |
|
| 92 |
def get_similar_rule_action(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
| 93 |
"""Handle GET_SIMILAR_RULE action."""
|
| 94 |
if ctx._is_repeated(qkey):
|
| 95 |
-
return "Repeated query.", Reward(value=
|
| 96 |
|
| 97 |
PropertyRetrieverInstance.load_model() # Ensure model is loaded before querying
|
| 98 |
similar_rule = PropertyRetrieverInstance.get_similar_property(ctx._target_fn["code"])
|
|
@@ -101,8 +101,7 @@ def get_similar_rule_action(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Rew
|
|
| 101 |
"No similar rule available for this function.",
|
| 102 |
Reward(value=-0.20, reason="get_similar_rule cost (not found)"),
|
| 103 |
)
|
| 104 |
-
return similar_rule, Reward(value=
|
| 105 |
-
|
| 106 |
|
| 107 |
def submit_property(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
| 108 |
"""Handle SUBMIT_PROPERTY action for Task 2.
|
|
@@ -111,46 +110,19 @@ def submit_property(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
|
| 111 |
---------------
|
| 112 |
property : str – natural-language property describing the function's behaviour
|
| 113 |
"""
|
| 114 |
-
|
| 115 |
-
return (
|
| 116 |
-
"❌ You have already submitted for this episode. "
|
| 117 |
-
"Only ONE submission is allowed.",
|
| 118 |
-
Reward(value=0.0, reason="Second submit_property attempt", partial=False),
|
| 119 |
-
)
|
| 120 |
-
|
| 121 |
submitted_property = params.get("property", "").strip()
|
| 122 |
-
|
| 123 |
if not submitted_property:
|
| 124 |
return (
|
| 125 |
"submit_property requires a non-empty 'property' string in params.",
|
| 126 |
-
Reward(value=
|
| 127 |
)
|
| 128 |
|
| 129 |
-
ctx._submitted = True
|
| 130 |
ctx._done = True
|
|
|
|
| 131 |
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
reward_val = float(score) # reward == grade for Task 2
|
| 135 |
-
|
| 136 |
-
if confidence == "strong":
|
| 137 |
-
msg = (
|
| 138 |
-
f"✅ STRONG MATCH. Your property closely matches the target. "
|
| 139 |
-
f"Score: {score:.3f} → Reward: {reward_val:.3f}"
|
| 140 |
-
)
|
| 141 |
-
elif confidence == "moderate":
|
| 142 |
-
msg = (
|
| 143 |
-
f"🟡 MODERATE MATCH. Your property partially captures the target behaviour. "
|
| 144 |
-
f"Score: {score:.3f} → Reward: {reward_val:.3f}"
|
| 145 |
-
)
|
| 146 |
-
else:
|
| 147 |
-
msg = (
|
| 148 |
-
f"❌ LOW MATCH. Your property does not sufficiently match the target. "
|
| 149 |
-
f"Score: {score:.3f} → Reward: {reward_val:.3f}"
|
| 150 |
-
)
|
| 151 |
-
|
| 152 |
-
return msg, Reward(
|
| 153 |
-
value=reward_val,
|
| 154 |
reason=f"submit_property confidence={confidence} score={score:.3f}",
|
| 155 |
partial=False,
|
| 156 |
)
|
|
@@ -158,7 +130,10 @@ def submit_property(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
|
| 158 |
|
| 159 |
def unknown_action(ctx: Any, qkey: str, params: Dict, action_type: str) -> Tuple[str, Reward]:
|
| 160 |
"""Fallback for unknown actions."""
|
|
|
|
|
|
|
| 161 |
return (
|
| 162 |
-
f"Unknown action type: '{action_type}'. Valid: {[a.value for a in ActionType]}
|
| 163 |
-
|
|
|
|
| 164 |
)
|
|
|
|
| 19 |
def get_function_code(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
| 20 |
"""Handle GET_FUNCTION_CODE action."""
|
| 21 |
if ctx._is_repeated(qkey):
|
| 22 |
+
return "Repeated query.", Reward(value=ActionType.REPEATED.cost, reason="Repeated query")
|
| 23 |
|
| 24 |
fn = ctx._target_fn
|
| 25 |
code = fn.get("code", "// no code available")
|
| 26 |
return (
|
| 27 |
code,
|
| 28 |
+
Reward(value=ActionType.GET_FUNCTION_CODE.cost, reason="get_function_code cost"),
|
| 29 |
)
|
| 30 |
|
| 31 |
# TODO: Can separate comment and output_property(output_comment)
|
|
|
|
| 33 |
def get_function_natspec(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
| 34 |
"""Handle GET_FUNCTION_NATSPEC action."""
|
| 35 |
if ctx._is_repeated(qkey):
|
| 36 |
+
return "Repeated query.", Reward(value=ActionType.REPEATED.cost, reason="Repeated query")
|
| 37 |
|
| 38 |
fn = ctx._target_fn
|
| 39 |
name = fn["name"]
|
|
|
|
| 42 |
result = f"NatSpec for '{name}':\n{natspec}"
|
| 43 |
if out_prop:
|
| 44 |
result += f"\n\nExpected output: {out_prop}"
|
| 45 |
+
return result, Reward(value=ActionType.GET_FUNCTION_NATSPEC.cost, reason="get_function_natspec cost")
|
| 46 |
|
| 47 |
|
| 48 |
def get_file_natspec(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
| 49 |
"""Handle GET_FILE_NATSPEC action."""
|
| 50 |
if ctx._is_repeated(qkey):
|
| 51 |
+
return "Repeated query.", Reward(value=ActionType.REPEATED.cost, reason="Repeated query")
|
| 52 |
|
| 53 |
meta = ctx._contract.get("metadata", {})
|
| 54 |
natspec = meta.get("natspec") or meta.get("description", "No file NatSpec available.")
|
| 55 |
return (
|
| 56 |
f"File NatSpec for {ctx._contract['contract_name']}:\n{natspec}",
|
| 57 |
+
Reward(value=ActionType.GET_FILE_NATSPEC.cost, reason="get_file_natspec cost"),
|
| 58 |
)
|
| 59 |
|
| 60 |
|
| 61 |
def get_related_functions_action(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
| 62 |
"""Handle GET_RELATED_FUNCTIONS action."""
|
| 63 |
if ctx._is_repeated(qkey):
|
| 64 |
+
return "Repeated query.", Reward(value=ActionType.REPEATED.cost, reason="Repeated query")
|
| 65 |
|
| 66 |
name = ctx._target_fn["name"]
|
| 67 |
related = get_related_functions(ctx._contract, name)
|
|
|
|
| 76 |
comment = rfn.get("comment", "")
|
| 77 |
summaries.append(f" • {sig} — {comment}")
|
| 78 |
text = f"Related functions for '{name}':\n" + "\n".join(summaries)
|
| 79 |
+
return text, Reward(value=ActionType.GET_RELATED_FUNCTIONS.cost, reason="get_related_functions cost")
|
| 80 |
|
| 81 |
|
| 82 |
def get_signature(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
| 83 |
"""Handle GET_SIGNATURE action."""
|
| 84 |
if ctx._is_repeated(qkey):
|
| 85 |
+
return "Repeated query.", Reward(value=ActionType.REPEATED.cost, reason="Repeated query")
|
| 86 |
|
| 87 |
fn = ctx._target_fn
|
| 88 |
sig = fn.get("signature")
|
| 89 |
+
return sig, Reward(value=ActionType.GET_SIGNATURE.cost, reason="get_signature cost")
|
| 90 |
|
| 91 |
|
| 92 |
def get_similar_rule_action(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
| 93 |
"""Handle GET_SIMILAR_RULE action."""
|
| 94 |
if ctx._is_repeated(qkey):
|
| 95 |
+
return "Repeated query.", Reward(value=ActionType.REPEATED.cost, reason="Repeated query")
|
| 96 |
|
| 97 |
PropertyRetrieverInstance.load_model() # Ensure model is loaded before querying
|
| 98 |
similar_rule = PropertyRetrieverInstance.get_similar_property(ctx._target_fn["code"])
|
|
|
|
| 101 |
"No similar rule available for this function.",
|
| 102 |
Reward(value=-0.20, reason="get_similar_rule cost (not found)"),
|
| 103 |
)
|
| 104 |
+
return similar_rule, Reward(value=ActionType.GET_SIMILAR_RULE.cost, reason="get_similar_rule cost")
|
|
|
|
| 105 |
|
| 106 |
def submit_property(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
|
| 107 |
"""Handle SUBMIT_PROPERTY action for Task 2.
|
|
|
|
| 110 |
---------------
|
| 111 |
property : str – natural-language property describing the function's behaviour
|
| 112 |
"""
|
| 113 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
submitted_property = params.get("property", "").strip()
|
|
|
|
| 115 |
if not submitted_property:
|
| 116 |
return (
|
| 117 |
"submit_property requires a non-empty 'property' string in params.",
|
| 118 |
+
Reward(value=ActionType.RESUBMIT.cost, reason="Malformed submission", partial=False),
|
| 119 |
)
|
| 120 |
|
|
|
|
| 121 |
ctx._done = True
|
| 122 |
+
score, confidence = ctx._grader.grade(submitted_property, ctx._step_count, ctx._cum_reward)
|
| 123 |
|
| 124 |
+
return "", Reward(
|
| 125 |
+
value=score,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
reason=f"submit_property confidence={confidence} score={score:.3f}",
|
| 127 |
partial=False,
|
| 128 |
)
|
|
|
|
| 130 |
|
| 131 |
def unknown_action(ctx: Any, qkey: str, params: Dict, action_type: str) -> Tuple[str, Reward]:
|
| 132 |
"""Fallback for unknown actions."""
|
| 133 |
+
|
| 134 |
+
ctx._done = True
|
| 135 |
return (
|
| 136 |
+
f"Unknown action type: '{action_type}'. Valid: {[a.value for a in ActionType]}, \
|
| 137 |
+
Reset environment to start again.",
|
| 138 |
+
Reward(value=ActionType.UNKNOWN.cost, reason="Unknown action"),
|
| 139 |
)
|
server/tasks/task2/environment.py
CHANGED
|
@@ -27,6 +27,7 @@ from __future__ import annotations
|
|
| 27 |
|
| 28 |
import random
|
| 29 |
from typing import Any, Dict, List, Optional, Set
|
|
|
|
| 30 |
|
| 31 |
from data.data_loader import load_contracts, sample_property_episode
|
| 32 |
from env.base_env import BaseEnv
|
|
@@ -43,7 +44,6 @@ from .grader import Task2Grader
|
|
| 43 |
from server.tasks.task2 import actions
|
| 44 |
|
| 45 |
TASK_ID = "task2_property_discovery"
|
| 46 |
-
MAX_STEPS = 15
|
| 47 |
|
| 48 |
AVAILABLE_ACTIONS = [
|
| 49 |
ActionType.GET_FUNCTION_CODE,
|
|
@@ -62,6 +62,7 @@ class Task2Environment(BaseEnv):
|
|
| 62 |
def __init__(self, contracts_path: Optional[str] = None) -> None:
|
| 63 |
self._contracts = load_contracts(contracts_path) if contracts_path else load_contracts()
|
| 64 |
self._rng = random.Random()
|
|
|
|
| 65 |
|
| 66 |
# Episode state – initialised by reset()
|
| 67 |
self._contract: Dict[str, Any] = {}
|
|
@@ -70,7 +71,6 @@ class Task2Environment(BaseEnv):
|
|
| 70 |
self._step_count: int = 0
|
| 71 |
self._cum_reward: float = 0.0
|
| 72 |
self._done: bool = False
|
| 73 |
-
self._submitted: bool = False # only one submit_property allowed
|
| 74 |
self._query_hist: List[str] = []
|
| 75 |
self._seen: Set[str] = set()
|
| 76 |
|
|
@@ -86,6 +86,7 @@ class Task2Environment(BaseEnv):
|
|
| 86 |
self._grader = Task2Grader(
|
| 87 |
function_name=self._target_fn["name"],
|
| 88 |
property=self._target_fn["property"],
|
|
|
|
| 89 |
)
|
| 90 |
self._step_count = 0
|
| 91 |
self._cum_reward = 0.0
|
|
@@ -110,6 +111,9 @@ class Task2Environment(BaseEnv):
|
|
| 110 |
def step(self, action: Action) -> StepResult:
|
| 111 |
if self._done:
|
| 112 |
raise RuntimeError("Episode is done. Call reset() to start a new episode.")
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
self._step_count += 1
|
| 115 |
result_text, reward = self._dispatch(action)
|
|
@@ -147,12 +151,8 @@ class Task2Environment(BaseEnv):
|
|
| 147 |
return Observation(
|
| 148 |
task_id=TASK_ID,
|
| 149 |
contract_name=self._contract.get("contract_name", ""),
|
| 150 |
-
contract_description=self._contract.get("metadata", {}).get("description", ""),
|
| 151 |
-
available_actions=[a.value for a in AVAILABLE_ACTIONS],
|
| 152 |
last_action=last_action,
|
| 153 |
last_action_result=last_result,
|
| 154 |
-
step_count=self._step_count,
|
| 155 |
-
cumulative_reward=self._cum_reward,
|
| 156 |
done=self._done,
|
| 157 |
extra={
|
| 158 |
"target_function": self._target_fn.get("name", ""),
|
|
@@ -181,8 +181,6 @@ class Task2Environment(BaseEnv):
|
|
| 181 |
params = action.params
|
| 182 |
qkey = self._qkey(at, params)
|
| 183 |
|
| 184 |
-
# Mapping from ActionType to handler function
|
| 185 |
-
# Each handler expects (ctx, qkey, params) and returns (str, Reward)
|
| 186 |
handlers = {
|
| 187 |
ActionType.GET_FUNCTION_CODE: actions.get_function_code,
|
| 188 |
ActionType.GET_FUNCTION_NATSPEC: actions.get_function_natspec,
|
|
|
|
| 27 |
|
| 28 |
import random
|
| 29 |
from typing import Any, Dict, List, Optional, Set
|
| 30 |
+
from math import log2, floor
|
| 31 |
|
| 32 |
from data.data_loader import load_contracts, sample_property_episode
|
| 33 |
from env.base_env import BaseEnv
|
|
|
|
| 44 |
from server.tasks.task2 import actions
|
| 45 |
|
| 46 |
TASK_ID = "task2_property_discovery"
|
|
|
|
| 47 |
|
| 48 |
AVAILABLE_ACTIONS = [
|
| 49 |
ActionType.GET_FUNCTION_CODE,
|
|
|
|
| 62 |
def __init__(self, contracts_path: Optional[str] = None) -> None:
|
| 63 |
self._contracts = load_contracts(contracts_path) if contracts_path else load_contracts()
|
| 64 |
self._rng = random.Random()
|
| 65 |
+
self._max_steps: int = 40
|
| 66 |
|
| 67 |
# Episode state – initialised by reset()
|
| 68 |
self._contract: Dict[str, Any] = {}
|
|
|
|
| 71 |
self._step_count: int = 0
|
| 72 |
self._cum_reward: float = 0.0
|
| 73 |
self._done: bool = False
|
|
|
|
| 74 |
self._query_hist: List[str] = []
|
| 75 |
self._seen: Set[str] = set()
|
| 76 |
|
|
|
|
| 86 |
self._grader = Task2Grader(
|
| 87 |
function_name=self._target_fn["name"],
|
| 88 |
property=self._target_fn["property"],
|
| 89 |
+
n = floor(log2(len(self._contract["functions"])))
|
| 90 |
)
|
| 91 |
self._step_count = 0
|
| 92 |
self._cum_reward = 0.0
|
|
|
|
| 111 |
def step(self, action: Action) -> StepResult:
|
| 112 |
if self._done:
|
| 113 |
raise RuntimeError("Episode is done. Call reset() to start a new episode.")
|
| 114 |
+
|
| 115 |
+
if self._step_count >= self._max_steps:
|
| 116 |
+
raise RuntimeError("Exceeded maximum number of steps allowed. Call reset() to start a new episode.")
|
| 117 |
|
| 118 |
self._step_count += 1
|
| 119 |
result_text, reward = self._dispatch(action)
|
|
|
|
| 151 |
return Observation(
|
| 152 |
task_id=TASK_ID,
|
| 153 |
contract_name=self._contract.get("contract_name", ""),
|
|
|
|
|
|
|
| 154 |
last_action=last_action,
|
| 155 |
last_action_result=last_result,
|
|
|
|
|
|
|
| 156 |
done=self._done,
|
| 157 |
extra={
|
| 158 |
"target_function": self._target_fn.get("name", ""),
|
|
|
|
| 181 |
params = action.params
|
| 182 |
qkey = self._qkey(at, params)
|
| 183 |
|
|
|
|
|
|
|
| 184 |
handlers = {
|
| 185 |
ActionType.GET_FUNCTION_CODE: actions.get_function_code,
|
| 186 |
ActionType.GET_FUNCTION_NATSPEC: actions.get_function_natspec,
|
server/tasks/task2/grader.py
CHANGED
|
@@ -9,12 +9,6 @@ Grade range: 0.0 – 1.0 (matchscore output, already normalised).
|
|
| 9 |
from typing import Tuple
|
| 10 |
from utils import SemanticMatcher
|
| 11 |
|
| 12 |
-
_SCORE_MIN = 0.001 # grades are strictly (0, 1)
|
| 13 |
-
_SCORE_MAX = 0.999
|
| 14 |
-
|
| 15 |
-
def _clamp(v: float) -> float:
|
| 16 |
-
return max(_SCORE_MIN, min(_SCORE_MAX, v))
|
| 17 |
-
|
| 18 |
class Task2Grader:
|
| 19 |
"""
|
| 20 |
Grades a Task 2 property submission.
|
|
@@ -25,15 +19,20 @@ class Task2Grader:
|
|
| 25 |
property : the 'property' field from the target function's data
|
| 26 |
"""
|
| 27 |
|
| 28 |
-
def __init__(self, function_name: str, property: str) -> None:
|
| 29 |
self.function_name = function_name
|
| 30 |
self.property = property
|
|
|
|
|
|
|
| 31 |
|
| 32 |
-
def grade(self, submitted: str) -> Tuple[float, str]:
|
| 33 |
"""Deterministic grade strictly in (0, 1)."""
|
| 34 |
if not submitted or not submitted.strip():
|
| 35 |
-
return
|
| 36 |
|
| 37 |
matcher = SemanticMatcher()
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
from typing import Tuple
|
| 10 |
from utils import SemanticMatcher
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
class Task2Grader:
|
| 13 |
"""
|
| 14 |
Grades a Task 2 property submission.
|
|
|
|
| 19 |
property : the 'property' field from the target function's data
|
| 20 |
"""
|
| 21 |
|
| 22 |
+
def __init__(self, function_name: str, property: str, n: int) -> None:
|
| 23 |
self.function_name = function_name
|
| 24 |
self.property = property
|
| 25 |
+
self.n = n
|
| 26 |
+
self._decay = 0.75
|
| 27 |
|
| 28 |
+
def grade(self, submitted: str, steps: int, cummulative_cost: int) -> Tuple[float, str]:
|
| 29 |
"""Deterministic grade strictly in (0, 1)."""
|
| 30 |
if not submitted or not submitted.strip():
|
| 31 |
+
return 0.001, "no_match"
|
| 32 |
|
| 33 |
matcher = SemanticMatcher()
|
| 34 |
+
match_score = matcher.matchscore(self.property, submitted)
|
| 35 |
+
free_budget = (cummulative_cost / steps) * (self.n + 2)
|
| 36 |
+
final_score = match_score * (self._decay ** max(0, cummulative_cost - free_budget))
|
| 37 |
+
|
| 38 |
+
return final_score, matcher.confidence()
|