ar9avg committed
Commit 263261a · 1 Parent(s): ba69b5f

Defensive score clamping at all emission points


Every reward/score emitted by inference.py is now clamped to [0.05, 0.95]
via _clamp_score() before logging. This closes multiple paths that could
leak exact 0.0 or 1.0:

- Initial score default was 0.0 (would emit on early reset failure)
- Exception path logged reward=0.0 and appended 0.0 to rewards list
- env.reset() failure wasn't wrapped in its own try/except

Also bumped grade_response epsilon from 0.01 to 0.05 and aligned
openenv.yaml reward.range to [0.0, 1.0].
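
In concrete terms (a minimal sketch, not repository code; the boundary inputs are illustrative), the helper maps each of those leak values back inside the logged range:

# Sketch of the clamp described above, using the constants from this commit.
_SCORE_EPS = 0.05


def _clamp_score(x: float) -> float:
    if x != x:  # NaN guard: a malformed reward falls back to a neutral mid-point
        return 0.5
    return max(_SCORE_EPS, min(1.0 - _SCORE_EPS, x))


assert _clamp_score(0.0) == 0.05          # old initial-score default / exception-path reward
assert _clamp_score(1.0) == 0.95          # a perfect grader score no longer logs as exactly 1.0
assert _clamp_score(float("nan")) == 0.5  # defensive fallback for malformed rewards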

Files changed (3)
  1. backend/env/tasks.py +1 -1
  2. inference.py +28 -11
  3. openenv.yaml +4 -5
backend/env/tasks.py CHANGED
@@ -330,7 +330,7 @@ def get_all_tasks() -> list[Task]:
     return list(TASKS.values())


-_EPS = 0.01  # wide enough that f"{x:.3f}" never rounds to 0.000 or 1.000
+_EPS = 0.05  # wide margin so :.2f/:.3f never rounds to 0.00 or 1.00


 def grade_response(
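
For context on the wider margin, a standalone illustration (not part of the commit) of the rounding hazard the comment refers to: raw values near the endpoints can still format as 0.00 or 1.00 at fixed precision, while anything held at the 0.05 margin cannot.

# Hypothetical check, run separately from the repo.
assert f"{0.004:.2f}" == "0.00"   # an unclamped near-zero reward would log as 0.00
assert f"{0.9999:.2f}" == "1.00"  # an unclamped near-one reward would log as 1.00
assert f"{0.05:.2f}" == "0.05"    # clamped floor stays visibly above zero
assert f"{0.95:.3f}" == "0.950"   # clamped ceiling stays visibly below 1.000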
inference.py CHANGED
@@ -152,6 +152,16 @@ def pick_action(

 # ── Single-episode runner ─────────────────────────────────────────────────────

+_SCORE_EPS = 0.05  # strict (0, 1) with generous margin for :.2f/:.3f rounding
+
+
+def _clamp_score(x: float) -> float:
+    """Clamp to strictly (0, 1). Uses 0.05 margin so :.2f/:.3f formatting stays safe."""
+    if x != x:  # NaN
+        return 0.5
+    return max(_SCORE_EPS, min(1.0 - _SCORE_EPS, x))
+
+
 async def run_episode(
     env: SQLAgentEnv,
     client: OpenAI,
@@ -162,12 +172,18 @@ async def run_episode(

     rewards: List[float] = []
     steps_taken = 0
-    score = 0.0
+    score = _SCORE_EPS
     success = False
     last_error: Optional[str] = None

     try:
-        obs = env.reset(task_id)
+        try:
+            obs = env.reset(task_id)
+        except Exception as exc:
+            log_step(step=1, action="reset", reward=_SCORE_EPS, done=True, error=str(exc))
+            rewards.append(_SCORE_EPS)
+            steps_taken = 1
+            return

         for step in range(1, MAX_STEPS + 1):
             action_name = pick_action(client, obs, step)
@@ -175,13 +191,14 @@ async def run_episode(

             try:
                 obs, reward_info = await env.step(action)
-            except RuntimeError as exc:
-                log_step(step=step, action=action_name, reward=0.0, done=True, error=str(exc))
-                rewards.append(0.0)
+            except Exception as exc:
+                log_step(step=step, action=action_name, reward=_SCORE_EPS, done=True, error=str(exc))
+                rewards.append(_SCORE_EPS)
                 steps_taken = step
                 break

-            reward = reward_info.value
+            raw_reward = reward_info.value if reward_info.value is not None else _SCORE_EPS
+            reward = _clamp_score(raw_reward)
             done = reward_info.done
             last_error = obs.error_message
             success = reward_info.success
@@ -200,14 +217,14 @@ async def run_episode(
             if done:
                 break

-        # Score: average of per-step rewards. Clamp strictly inside (0, 1)
-        # with margin >= 0.005 so f"{score:.3f}" never formats to "0.000" or "1.000".
-        _EPS = 0.01
         denom = max(len(rewards), 1)
-        avg = sum(rewards) / denom if rewards else _EPS
-        score = max(_EPS, min(1.0 - _EPS, avg))
+        avg = sum(rewards) / denom if rewards else _SCORE_EPS
+        score = _clamp_score(avg)

     finally:
+        # Final safety net: score and every reward must be strictly in (0, 1)
+        score = _clamp_score(score)
+        rewards = [_clamp_score(r) for r in rewards]
         log_end(
             success=success,
             steps=steps_taken,
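
One subtlety in the new reset-failure branch: the early return sits inside the outer try, so the finally block, and with it log_end, still runs with the clamped defaults. A stripped-down control-flow sketch, with hypothetical stand-ins for the env and logger:

# Hypothetical stubs, not the real env/loggers: shows that an early `return`
# inside `try` still executes `finally`, so the episode-end log fires with the
# clamped default score even when reset() fails immediately.
_SCORE_EPS = 0.05
emitted = []


def fake_log_end(score: float) -> None:
    emitted.append(score)


def run_episode_sketch() -> None:
    score = _SCORE_EPS
    try:
        try:
            raise RuntimeError("reset failed")  # stand-in for env.reset(task_id)
        except Exception:
            return                              # early exit on reset failure
    finally:
        fake_log_end(score)                     # still runs: finally wins over return


run_episode_sketch()
assert emitted == [_SCORE_EPS]                  # log_end saw the floor, never 0.0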
openenv.yaml CHANGED
@@ -82,12 +82,11 @@ observation_space:

 # ── Reward ───────────────────────────────────────────────────────────────────
 reward:
-  range: [-1.5, 1.5]
+  range: [0.0, 1.0]
   description: >
-    Shaped reward providing partial progress signals throughout the episode.
-    Success on attempt N: 1.0 - 0.1*(N-1).
-    Failure step: -0.1 - 0.05*N + severity_improvement_bonus + error_class_change_bonus.
-    Penalizes infinite loops (consecutive same error) and rewards convergence toward correct SQL.
+    Task score is the grader output clamped strictly inside (0, 1). Graders
+    score partial progress (column correctness, row-count match) and apply
+    attempt penalties for multi-step repair episodes.

 # ── Tasks ────────────────────────────────────────────────────────────────────
 tasks:
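
If desired, the alignment between the clamp bounds and the declared range can be spot-checked with a small script (hypothetical, not part of this commit; assumes PyYAML and the key layout shown in the hunk above):

# Hypothetical consistency check: confirms the clamp bounds used by inference.py
# sit inside the reward range declared in openenv.yaml.
import yaml  # PyYAML assumed available

_SCORE_EPS = 0.05

with open("openenv.yaml") as fh:
    manifest = yaml.safe_load(fh)

lo, hi = manifest["reward"]["range"]  # [0.0, 1.0] after this commit
assert lo <= _SCORE_EPS and 1.0 - _SCORE_EPS <= hi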