ajaxwin committed on
Commit
f78cba2
·
1 Parent(s): 7f7bcc6

refactor: Improved grading logic for task 2

Browse files
agents/task2.py CHANGED
@@ -30,11 +30,8 @@ def oracle_t2(env: Task2Environment, seed: int, verbose: bool = False) -> Dict[s
30
  env.step(Action(action_type=ActionType.GET_FUNCTION_CODE))
31
  result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
32
  params={"property": gt_text}))
33
- r_val = result.reward.value
34
- score = round(r_val / 5.0, 4) if r_val > 0 else 0.0
35
  return {"seed": seed, "contract": contract, "function": fn_name,
36
- "grader_score": score,
37
- "cumulative_reward": result.observation.cumulative_reward}
38
 
39
 
40
  def partial_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
@@ -51,9 +48,7 @@ def partial_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
51
  break
52
  result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
53
  params={"property": comment}))
54
- r_val = result.reward.value
55
- return {"seed": seed, "grader_score": round(r_val / 5.0, 4) if r_val > 0 else 0.0,
56
- "cumulative_reward": result.observation.cumulative_reward}
57
 
58
 
59
  def random_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
@@ -90,10 +85,8 @@ def random_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
90
  prop = rng.choice(templates)
91
  result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
92
  params={"property": prop}))
93
- r_val = result.reward.value
94
- return {"seed": seed, "grader_score": round(r_val / 5.0, 4) if r_val > 0 else 0.0,
95
- "submitted": prop[:60],
96
- "cumulative_reward": result.observation.cumulative_reward}
97
 
98
 
99
  def floor_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
@@ -101,5 +94,4 @@ def floor_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
101
  env.reset(seed=seed)
102
  result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
103
  params={"property": ""}))
104
- return {"seed": seed, "grader_score": 0.0,
105
- "cumulative_reward": result.observation.cumulative_reward}
 
30
  env.step(Action(action_type=ActionType.GET_FUNCTION_CODE))
31
  result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
32
  params={"property": gt_text}))
 
 
33
  return {"seed": seed, "contract": contract, "function": fn_name,
34
+ "grader_score": result.reward.value }
 
35
 
36
 
37
  def partial_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
 
48
  break
49
  result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
50
  params={"property": comment}))
51
+ return {"seed": seed, "grader_score": result.reward.value}
 
 
52
 
53
 
54
  def random_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
 
85
  prop = rng.choice(templates)
86
  result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
87
  params={"property": prop}))
88
+ return {"seed": seed, "grader_score": result.reward.value,
89
+ "submitted": prop[:60]}
 
 
90
 
91
 
92
  def floor_t2(env: Task2Environment, seed: int) -> Dict[str, Any]:
 
94
  env.reset(seed=seed)
95
  result = env.step(Action(action_type=ActionType.SUBMIT_PROPERTY,
96
  params={"property": ""}))
97
+ return {"seed": seed, "grader_score": 0.001}
 
data/contracts.json CHANGED
@@ -227,7 +227,7 @@
227
  "property_specification": {
228
  "precondition": "User has AToken balance B",
229
  "operation": "burn(user, receiver, amount, index)",
230
- "expected_postcondition": "User's AToken balance = B - amount (within rounding tolerance ε)",
231
  "actual": "When amount.rayDiv(index) rounds down to 0, the burn operation transfers amount underlying tokens but burns 0 ATokens, resulting in user AToken balance unchanged = B, violating the postcondition where the balance should be B - amount."
232
  }
233
  },
@@ -934,7 +934,7 @@
934
  "property_specification": {
935
  "precondition": "User has debt balance B",
936
  "operation": "mint(user, onBehalfOf, amount, rate)",
937
- "expected_postcondition": "User's debt balance = B + amount (within rounding tolerance ε)",
938
  "actual": "When amount conversion rounds down to 0 in intermediate calculations, the mint operation may mint zero debt tokens while still transferring underlying tokens (or vice versa), resulting in user debt balance unchanged = B, violating the postcondition where the balance should be B + amount."
939
  }
940
  },
 
227
  "property_specification": {
228
  "precondition": "User has AToken balance B",
229
  "operation": "burn(user, receiver, amount, index)",
230
+ "postcondition": "User's AToken balance = B - amount (within rounding tolerance ε)",
231
  "actual": "When amount.rayDiv(index) rounds down to 0, the burn operation transfers amount underlying tokens but burns 0 ATokens, resulting in user AToken balance unchanged = B, violating the postcondition where the balance should be B - amount."
232
  }
233
  },
 
934
  "property_specification": {
935
  "precondition": "User has debt balance B",
936
  "operation": "mint(user, onBehalfOf, amount, rate)",
937
+ "postcondition": "User's debt balance = B + amount (within rounding tolerance ε)",
938
  "actual": "When amount conversion rounds down to 0 in intermediate calculations, the mint operation may mint zero debt tokens while still transferring underlying tokens (or vice versa), resulting in user debt balance unchanged = B, violating the postcondition where the balance should be B + amount."
939
  }
940
  },
env/schemas.py CHANGED
@@ -39,17 +39,17 @@ class ActionType(str, Enum):
39
  SUBMIT = ("submit", 0.0)
40
 
41
  # ── Task 2 – Property Discovery ─────────────────────────────────────────
42
- GET_SIMILAR_RULE = ("get_similar_rule", 0.0)
43
- GET_FILE_NATSPEC = ("get_file_natspec", 0.0)
44
- GET_FUNCTION_NATSPEC = ("get_function_natspec", 0.0)
45
- GET_RELATED_FUNCTIONS = ("get_related_functions", 0.0)
46
- GET_SIGNATURE = ("get_signature", 0.0)
47
  SUBMIT_PROPERTY = ("submit_property", 0.0)
48
 
49
  # ── Task 3 – Rule Checker ────────────────────────────────────────────────
50
- GET_PROPERTY_SPECIFICATION = ("get_property_specification", 0.0)
51
- GET_FUNCTION_METADATA = ("get_function_metadata", 0.0)
52
- SUBMIT_FUNCTION = ("submit_function", 0.0)
53
 
54
  # ─────── General Actions ─────────────────────────────────────────────────
55
  UNKNOWN = ("unknown", 0.0)
 
39
  SUBMIT = ("submit", 0.0)
40
 
41
  # ── Task 2 – Property Discovery ─────────────────────────────────────────
42
+ GET_SIMILAR_RULE = ("get_similar_rule", 0.15)
43
+ GET_FILE_NATSPEC = ("get_file_natspec", 0.05)
44
+ GET_FUNCTION_NATSPEC = ("get_function_natspec", -0.08)
45
+ GET_RELATED_FUNCTIONS = ("get_related_functions", 0.07)
46
+ GET_SIGNATURE = ("get_signature", 0.04)
47
  SUBMIT_PROPERTY = ("submit_property", 0.0)
48
 
49
  # ── Task 3 – Rule Checker ────────────────────────────────────────────────
50
+ GET_PROPERTY_SPECIFICATION = ("get_property_specification", 0.0)
51
+ GET_FUNCTION_METADATA = ("get_function_metadata", 0.0)
52
+ SUBMIT_FUNCTION = ("submit_function", 0.0)
53
 
54
  # ─────── General Actions ─────────────────────────────────────────────────
55
  UNKNOWN = ("unknown", 0.0)
eval.py CHANGED
@@ -121,7 +121,7 @@ def run_task2_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
121
  oracle_eps.append(ep)
122
  icon = "✅" if ep["grader_score"] >= 0.65 else "⚠️ "
123
  print(f" {icon} seed={ep['seed']:3d} {ep['contract']:12s}.{ep['function']:18s}"
124
- f" score={ep['grader_score']:.3f} reward={ep['cumulative_reward']:+.2f}")
125
  oracle_avg = _avg(oracle_eps)
126
  print(f"\n Oracle avg: {oracle_avg:.3f}")
127
 
@@ -143,10 +143,10 @@ def run_task2_eval(n: int, seed_offset: int, verbose: bool) -> Dict[str, Any]:
143
  floor_avg = _avg(floor_eps)
144
  print(f" Floor avg: {floor_avg:.3f}")
145
 
146
- # assert oracle_avg > 0.60, f"Oracle avg {oracle_avg:.3f} should be > 0.60"
147
- # assert oracle_avg > partial_avg >= floor_avg, \
148
- # "Score ordering violated: oracle > partial >= floor"
149
- # assert floor_avg == 0.0, f"Floor avg {floor_avg:.3f} should be 0.0"
150
  print(f"\n ✅ Task 2: oracle({oracle_avg:.3f}) > partial({partial_avg:.3f})"
151
  f" ≥ random({random_avg:.3f}) ≥ floor(0.0)")
152
 
 
121
  oracle_eps.append(ep)
122
  icon = "✅" if ep["grader_score"] >= 0.65 else "⚠️ "
123
  print(f" {icon} seed={ep['seed']:3d} {ep['contract']:12s}.{ep['function']:18s}"
124
+ f" score={ep['grader_score']:.3f}")
125
  oracle_avg = _avg(oracle_eps)
126
  print(f"\n Oracle avg: {oracle_avg:.3f}")
127
 
 
143
  floor_avg = _avg(floor_eps)
144
  print(f" Floor avg: {floor_avg:.3f}")
145
 
146
+ assert oracle_avg > 0.60, f"Oracle avg {oracle_avg:.3f} should be > 0.60"
147
+ assert oracle_avg > partial_avg >= floor_avg, \
148
+ "Score ordering violated: oracle > partial >= floor"
149
+ assert floor_avg == 0.001, f"Floor avg {floor_avg:.3f} should be 0.0"
150
  print(f"\n ✅ Task 2: oracle({oracle_avg:.3f}) > partial({partial_avg:.3f})"
151
  f" ≥ random({random_avg:.3f}) ≥ floor(0.0)")
152
 
server/tasks/task1/environment.py CHANGED
@@ -61,8 +61,6 @@ class Task1Environment(BaseEnv):
61
  self._done: bool = False
62
  self._query_history: List[str] = []
63
  self._seen_queries: Set[str] = set()
64
- self._cost_free_steps: int = 0
65
- self._decay: float = 0.0
66
 
67
  # ------------------------------------------------------------------
68
  # OpenEnv interface
 
61
  self._done: bool = False
62
  self._query_history: List[str] = []
63
  self._seen_queries: Set[str] = set()
 
 
64
 
65
  # ------------------------------------------------------------------
66
  # OpenEnv interface
server/tasks/task2/actions.py CHANGED
@@ -19,13 +19,13 @@ PropertyRetrieverInstance = PropertyRetriever() # Load once at module level
19
  def get_function_code(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
20
  """Handle GET_FUNCTION_CODE action."""
21
  if ctx._is_repeated(qkey):
22
- return "Repeated query.", Reward(value=-0.40, reason="Repeated query")
23
 
24
  fn = ctx._target_fn
25
  code = fn.get("code", "// no code available")
26
  return (
27
  code,
28
- Reward(value=-0.06, reason="get_function_code cost"),
29
  )
30
 
31
  # TODO: Can separate comment and output_property(output_comment)
@@ -33,7 +33,7 @@ def get_function_code(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
33
  def get_function_natspec(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
34
  """Handle GET_FUNCTION_NATSPEC action."""
35
  if ctx._is_repeated(qkey):
36
- return "Repeated query.", Reward(value=-0.40, reason="Repeated query")
37
 
38
  fn = ctx._target_fn
39
  name = fn["name"]
@@ -42,26 +42,26 @@ def get_function_natspec(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward
42
  result = f"NatSpec for '{name}':\n{natspec}"
43
  if out_prop:
44
  result += f"\n\nExpected output: {out_prop}"
45
- return result, Reward(value=-0.08, reason="get_function_natspec cost")
46
 
47
 
48
  def get_file_natspec(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
49
  """Handle GET_FILE_NATSPEC action."""
50
  if ctx._is_repeated(qkey):
51
- return "Repeated query.", Reward(value=-0.40, reason="Repeated query")
52
 
53
  meta = ctx._contract.get("metadata", {})
54
  natspec = meta.get("natspec") or meta.get("description", "No file NatSpec available.")
55
  return (
56
  f"File NatSpec for {ctx._contract['contract_name']}:\n{natspec}",
57
- Reward(value=-0.03, reason="get_file_natspec cost"),
58
  )
59
 
60
 
61
  def get_related_functions_action(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
62
  """Handle GET_RELATED_FUNCTIONS action."""
63
  if ctx._is_repeated(qkey):
64
- return "Repeated query.", Reward(value=-0.40, reason="Repeated query")
65
 
66
  name = ctx._target_fn["name"]
67
  related = get_related_functions(ctx._contract, name)
@@ -76,23 +76,23 @@ def get_related_functions_action(ctx: Any, qkey: str, params: Dict) -> Tuple[str
76
  comment = rfn.get("comment", "")
77
  summaries.append(f" • {sig} — {comment}")
78
  text = f"Related functions for '{name}':\n" + "\n".join(summaries)
79
- return text, Reward(value=-0.06, reason="get_related_functions cost")
80
 
81
 
82
  def get_signature(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
83
  """Handle GET_SIGNATURE action."""
84
  if ctx._is_repeated(qkey):
85
- return "Repeated query.", Reward(value=-0.40, reason="Repeated query")
86
 
87
  fn = ctx._target_fn
88
  sig = fn.get("signature")
89
- return sig, Reward(value=-0.04, reason="get_signature cost")
90
 
91
 
92
  def get_similar_rule_action(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
93
  """Handle GET_SIMILAR_RULE action."""
94
  if ctx._is_repeated(qkey):
95
- return "Repeated query.", Reward(value=-0.40, reason="Repeated query")
96
 
97
  PropertyRetrieverInstance.load_model() # Ensure model is loaded before querying
98
  similar_rule = PropertyRetrieverInstance.get_similar_property(ctx._target_fn["code"])
@@ -101,8 +101,7 @@ def get_similar_rule_action(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Rew
101
  "No similar rule available for this function.",
102
  Reward(value=-0.20, reason="get_similar_rule cost (not found)"),
103
  )
104
- return similar_rule, Reward(value=-0.20, reason="get_similar_rule cost")
105
-
106
 
107
  def submit_property(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
108
  """Handle SUBMIT_PROPERTY action for Task 2.
@@ -111,46 +110,19 @@ def submit_property(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
111
  ---------------
112
  property : str – natural-language property describing the function's behaviour
113
  """
114
- if ctx._submitted:
115
- return (
116
- "❌ You have already submitted for this episode. "
117
- "Only ONE submission is allowed.",
118
- Reward(value=0.0, reason="Second submit_property attempt", partial=False),
119
- )
120
-
121
  submitted_property = params.get("property", "").strip()
122
-
123
  if not submitted_property:
124
  return (
125
  "submit_property requires a non-empty 'property' string in params.",
126
- Reward(value=0.0, reason="Malformed submission", partial=False),
127
  )
128
 
129
- ctx._submitted = True
130
  ctx._done = True
 
131
 
132
- # grade() returns (float score in [0,1], confidence str)
133
- score, confidence = ctx._grader.grade(submitted_property) # score already in [0.0, 1.0]
134
- reward_val = float(score) # reward == grade for Task 2
135
-
136
- if confidence == "strong":
137
- msg = (
138
- f"✅ STRONG MATCH. Your property closely matches the target. "
139
- f"Score: {score:.3f} → Reward: {reward_val:.3f}"
140
- )
141
- elif confidence == "moderate":
142
- msg = (
143
- f"🟡 MODERATE MATCH. Your property partially captures the target behaviour. "
144
- f"Score: {score:.3f} → Reward: {reward_val:.3f}"
145
- )
146
- else:
147
- msg = (
148
- f"❌ LOW MATCH. Your property does not sufficiently match the target. "
149
- f"Score: {score:.3f} → Reward: {reward_val:.3f}"
150
- )
151
-
152
- return msg, Reward(
153
- value=reward_val,
154
  reason=f"submit_property confidence={confidence} score={score:.3f}",
155
  partial=False,
156
  )
@@ -158,7 +130,10 @@ def submit_property(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
158
 
159
  def unknown_action(ctx: Any, qkey: str, params: Dict, action_type: str) -> Tuple[str, Reward]:
160
  """Fallback for unknown actions."""
 
 
161
  return (
162
- f"Unknown action type: '{action_type}'. Valid: {[a.value for a in ActionType]}",
163
- Reward(value=-0.10, reason="Unknown action"),
 
164
  )
 
19
  def get_function_code(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
20
  """Handle GET_FUNCTION_CODE action."""
21
  if ctx._is_repeated(qkey):
22
+ return "Repeated query.", Reward(value=ActionType.REPEATED.cost, reason="Repeated query")
23
 
24
  fn = ctx._target_fn
25
  code = fn.get("code", "// no code available")
26
  return (
27
  code,
28
+ Reward(value=ActionType.GET_FUNCTION_CODE.cost, reason="get_function_code cost"),
29
  )
30
 
31
  # TODO: Can separate comment and output_property(output_comment)
 
33
  def get_function_natspec(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
34
  """Handle GET_FUNCTION_NATSPEC action."""
35
  if ctx._is_repeated(qkey):
36
+ return "Repeated query.", Reward(value=ActionType.REPEATED.cost, reason="Repeated query")
37
 
38
  fn = ctx._target_fn
39
  name = fn["name"]
 
42
  result = f"NatSpec for '{name}':\n{natspec}"
43
  if out_prop:
44
  result += f"\n\nExpected output: {out_prop}"
45
+ return result, Reward(value=ActionType.GET_FILE_NATSPEC.cost, reason="get_function_natspec cost")
46
 
47
 
48
  def get_file_natspec(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
49
  """Handle GET_FILE_NATSPEC action."""
50
  if ctx._is_repeated(qkey):
51
+ return "Repeated query.", Reward(value=ActionType.REPEATED.cost, reason="Repeated query")
52
 
53
  meta = ctx._contract.get("metadata", {})
54
  natspec = meta.get("natspec") or meta.get("description", "No file NatSpec available.")
55
  return (
56
  f"File NatSpec for {ctx._contract['contract_name']}:\n{natspec}",
57
+ Reward(value=ActionType.GET_FILE_NATSPEC.cost, reason="get_file_natspec cost"),
58
  )
59
 
60
 
61
  def get_related_functions_action(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
62
  """Handle GET_RELATED_FUNCTIONS action."""
63
  if ctx._is_repeated(qkey):
64
+ return "Repeated query.", Reward(value=ActionType.REPEATED.cost, reason="Repeated query")
65
 
66
  name = ctx._target_fn["name"]
67
  related = get_related_functions(ctx._contract, name)
 
76
  comment = rfn.get("comment", "")
77
  summaries.append(f" • {sig} — {comment}")
78
  text = f"Related functions for '{name}':\n" + "\n".join(summaries)
79
+ return text, Reward(value=ActionType.GET_RELATED_FUNCTIONS.cost, reason="get_related_functions cost")
80
 
81
 
82
  def get_signature(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
83
  """Handle GET_SIGNATURE action."""
84
  if ctx._is_repeated(qkey):
85
+ return "Repeated query.", Reward(value=ActionType.REPEATED.cost, reason="Repeated query")
86
 
87
  fn = ctx._target_fn
88
  sig = fn.get("signature")
89
+ return sig, Reward(value=ActionType.GET_SIGNATURE.cost, reason="get_signature cost")
90
 
91
 
92
  def get_similar_rule_action(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
93
  """Handle GET_SIMILAR_RULE action."""
94
  if ctx._is_repeated(qkey):
95
+ return "Repeated query.", Reward(value=ActionType.REPEATED.cost, reason="Repeated query")
96
 
97
  PropertyRetrieverInstance.load_model() # Ensure model is loaded before querying
98
  similar_rule = PropertyRetrieverInstance.get_similar_property(ctx._target_fn["code"])
 
101
  "No similar rule available for this function.",
102
  Reward(value=-0.20, reason="get_similar_rule cost (not found)"),
103
  )
104
+ return similar_rule, Reward(value=ActionType.GET_SIMILAR_RULE.cost, reason="get_similar_rule cost")
 
105
 
106
  def submit_property(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
107
  """Handle SUBMIT_PROPERTY action for Task 2.
 
110
  ---------------
111
  property : str – natural-language property describing the function's behaviour
112
  """
113
+
 
 
 
 
 
 
114
  submitted_property = params.get("property", "").strip()
 
115
  if not submitted_property:
116
  return (
117
  "submit_property requires a non-empty 'property' string in params.",
118
+ Reward(value=ActionType.RESUBMIT.cost, reason="Malformed submission", partial=False),
119
  )
120
 
 
121
  ctx._done = True
122
+ score, confidence = ctx._grader.grade(submitted_property, ctx._step_count, ctx._cum_reward)
123
 
124
+ return "", Reward(
125
+ value=score,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  reason=f"submit_property confidence={confidence} score={score:.3f}",
127
  partial=False,
128
  )
 
130
 
131
  def unknown_action(ctx: Any, qkey: str, params: Dict, action_type: str) -> Tuple[str, Reward]:
132
  """Fallback for unknown actions."""
133
+
134
+ ctx._done = True
135
  return (
136
+ f"Unknown action type: '{action_type}'. Valid: {[a.value for a in ActionType]}, \
137
+ Reset environment to start again.",
138
+ Reward(value=ActionType.UNKNOWN.cost, reason="Unknown action"),
139
  )
server/tasks/task2/environment.py CHANGED
@@ -27,6 +27,7 @@ from __future__ import annotations
27
 
28
  import random
29
  from typing import Any, Dict, List, Optional, Set
 
30
 
31
  from data.data_loader import load_contracts, sample_property_episode
32
  from env.base_env import BaseEnv
@@ -43,7 +44,6 @@ from .grader import Task2Grader
43
  from server.tasks.task2 import actions
44
 
45
  TASK_ID = "task2_property_discovery"
46
- MAX_STEPS = 15
47
 
48
  AVAILABLE_ACTIONS = [
49
  ActionType.GET_FUNCTION_CODE,
@@ -62,6 +62,7 @@ class Task2Environment(BaseEnv):
62
  def __init__(self, contracts_path: Optional[str] = None) -> None:
63
  self._contracts = load_contracts(contracts_path) if contracts_path else load_contracts()
64
  self._rng = random.Random()
 
65
 
66
  # Episode state – initialised by reset()
67
  self._contract: Dict[str, Any] = {}
@@ -70,7 +71,6 @@ class Task2Environment(BaseEnv):
70
  self._step_count: int = 0
71
  self._cum_reward: float = 0.0
72
  self._done: bool = False
73
- self._submitted: bool = False # only one submit_property allowed
74
  self._query_hist: List[str] = []
75
  self._seen: Set[str] = set()
76
 
@@ -86,6 +86,7 @@ class Task2Environment(BaseEnv):
86
  self._grader = Task2Grader(
87
  function_name=self._target_fn["name"],
88
  property=self._target_fn["property"],
 
89
  )
90
  self._step_count = 0
91
  self._cum_reward = 0.0
@@ -110,6 +111,9 @@ class Task2Environment(BaseEnv):
110
  def step(self, action: Action) -> StepResult:
111
  if self._done:
112
  raise RuntimeError("Episode is done. Call reset() to start a new episode.")
 
 
 
113
 
114
  self._step_count += 1
115
  result_text, reward = self._dispatch(action)
@@ -147,12 +151,8 @@ class Task2Environment(BaseEnv):
147
  return Observation(
148
  task_id=TASK_ID,
149
  contract_name=self._contract.get("contract_name", ""),
150
- contract_description=self._contract.get("metadata", {}).get("description", ""),
151
- available_actions=[a.value for a in AVAILABLE_ACTIONS],
152
  last_action=last_action,
153
  last_action_result=last_result,
154
- step_count=self._step_count,
155
- cumulative_reward=self._cum_reward,
156
  done=self._done,
157
  extra={
158
  "target_function": self._target_fn.get("name", ""),
@@ -181,8 +181,6 @@ class Task2Environment(BaseEnv):
181
  params = action.params
182
  qkey = self._qkey(at, params)
183
 
184
- # Mapping from ActionType to handler function
185
- # Each handler expects (ctx, qkey, params) and returns (str, Reward)
186
  handlers = {
187
  ActionType.GET_FUNCTION_CODE: actions.get_function_code,
188
  ActionType.GET_FUNCTION_NATSPEC: actions.get_function_natspec,
 
27
 
28
  import random
29
  from typing import Any, Dict, List, Optional, Set
30
+ from math import log2, floor
31
 
32
  from data.data_loader import load_contracts, sample_property_episode
33
  from env.base_env import BaseEnv
 
44
  from server.tasks.task2 import actions
45
 
46
  TASK_ID = "task2_property_discovery"
 
47
 
48
  AVAILABLE_ACTIONS = [
49
  ActionType.GET_FUNCTION_CODE,
 
62
  def __init__(self, contracts_path: Optional[str] = None) -> None:
63
  self._contracts = load_contracts(contracts_path) if contracts_path else load_contracts()
64
  self._rng = random.Random()
65
+ self._max_steps: int = 40
66
 
67
  # Episode state – initialised by reset()
68
  self._contract: Dict[str, Any] = {}
 
71
  self._step_count: int = 0
72
  self._cum_reward: float = 0.0
73
  self._done: bool = False
 
74
  self._query_hist: List[str] = []
75
  self._seen: Set[str] = set()
76
 
 
86
  self._grader = Task2Grader(
87
  function_name=self._target_fn["name"],
88
  property=self._target_fn["property"],
89
+ n = floor(log2(len(self._contract["functions"])))
90
  )
91
  self._step_count = 0
92
  self._cum_reward = 0.0
 
111
  def step(self, action: Action) -> StepResult:
112
  if self._done:
113
  raise RuntimeError("Episode is done. Call reset() to start a new episode.")
114
+
115
+ if self._step_count > self._max_steps:
116
+ raise RuntimeError("Exceeded maximum number of steps allowed. Call reset() to start a new episode.")
117
 
118
  self._step_count += 1
119
  result_text, reward = self._dispatch(action)
 
151
  return Observation(
152
  task_id=TASK_ID,
153
  contract_name=self._contract.get("contract_name", ""),
 
 
154
  last_action=last_action,
155
  last_action_result=last_result,
 
 
156
  done=self._done,
157
  extra={
158
  "target_function": self._target_fn.get("name", ""),
 
181
  params = action.params
182
  qkey = self._qkey(at, params)
183
 
 
 
184
  handlers = {
185
  ActionType.GET_FUNCTION_CODE: actions.get_function_code,
186
  ActionType.GET_FUNCTION_NATSPEC: actions.get_function_natspec,
server/tasks/task2/grader.py CHANGED
@@ -9,12 +9,6 @@ Grade range: 0.0 – 1.0 (matchscore output, already normalised).
9
  from typing import Tuple
10
  from utils import SemanticMatcher
11
 
12
- _SCORE_MIN = 0.001 # grades are strictly (0, 1)
13
- _SCORE_MAX = 0.999
14
-
15
- def _clamp(v: float) -> float:
16
- return max(_SCORE_MIN, min(_SCORE_MAX, v))
17
-
18
  class Task2Grader:
19
  """
20
  Grades a Task 2 property submission.
@@ -25,15 +19,20 @@ class Task2Grader:
25
  property : the 'property' field from the target function's data
26
  """
27
 
28
- def __init__(self, function_name: str, property: str) -> None:
29
  self.function_name = function_name
30
  self.property = property
 
 
31
 
32
- def grade(self, submitted: str) -> Tuple[float, str]:
33
  """Deterministic grade strictly in (0, 1)."""
34
  if not submitted or not submitted.strip():
35
- return _clamp(0.0), "no_match" # → 0.001
36
 
37
  matcher = SemanticMatcher()
38
- score = matcher.matchscore(self.property, submitted) # already clamped by SemanticMatcher
39
- return _clamp(score), matcher.confidence()
 
 
 
 
9
  from typing import Tuple
10
  from utils import SemanticMatcher
11
 
 
 
 
 
 
 
12
  class Task2Grader:
13
  """
14
  Grades a Task 2 property submission.
 
19
  property : the 'property' field from the target function's data
20
  """
21
 
22
+ def __init__(self, function_name: str, property: str, n: int) -> None:
23
  self.function_name = function_name
24
  self.property = property
25
+ self.n = n
26
+ self._decay = 0.75
27
 
28
+ def grade(self, submitted: str, steps: int, cummulative_cost: int) -> Tuple[float, str]:
29
  """Deterministic grade strictly in (0, 1)."""
30
  if not submitted or not submitted.strip():
31
+ return 0.001, "no_match"
32
 
33
  matcher = SemanticMatcher()
34
+ match_score = matcher.matchscore(self.property, submitted)
35
+ free_budget = (cummulative_cost / steps) * (self.n + 2)
36
+ final_score = match_score * (self._decay ** max(0, cummulative_cost - free_budget))
37
+
38
+ return final_score, matcher.confidence()