Spaces:
Running
Running
Defensive score clamping at all emission points
Every reward/score emitted by inference.py is now clamped to [0.05, 0.95]
via _clamp_score() before logging. This closes multiple paths that could
leak exact 0.0 or 1.0:
- Initial score default was 0.0 (would emit on early reset failure)
- Exception path logged reward=0.0 and appended 0.0 to rewards list
- env.reset() failure wasn't wrapped in its own try/except
Also bumped grade_response epsilon from 0.01 to 0.05 and aligned
openenv.yaml reward.range to [0.0, 1.0].
- backend/env/tasks.py +1 -1
- inference.py +28 -11
- openenv.yaml +4 -5
backend/env/tasks.py
CHANGED
|
@@ -330,7 +330,7 @@ def get_all_tasks() -> list[Task]:
|
|
| 330 |
return list(TASKS.values())
|
| 331 |
|
| 332 |
|
| 333 |
-
_EPS = 0.01
|
| 334 |
|
| 335 |
|
| 336 |
def grade_response(
|
|
|
|
| 330 |
return list(TASKS.values())
|
| 331 |
|
| 332 |
|
| 333 |
+
_EPS = 0.05 # wide margin so :.2f/:.3f never rounds to 0.00 or 1.00
|
| 334 |
|
| 335 |
|
| 336 |
def grade_response(
|
inference.py
CHANGED
|
@@ -152,6 +152,16 @@ def pick_action(
|
|
| 152 |
|
| 153 |
# ββ Single-episode runner βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
async def run_episode(
|
| 156 |
env: SQLAgentEnv,
|
| 157 |
client: OpenAI,
|
|
@@ -162,12 +172,18 @@ async def run_episode(
|
|
| 162 |
|
| 163 |
rewards: List[float] = []
|
| 164 |
steps_taken = 0
|
| 165 |
-
score = 0.0
|
| 166 |
success = False
|
| 167 |
last_error: Optional[str] = None
|
| 168 |
|
| 169 |
try:
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
for step in range(1, MAX_STEPS + 1):
|
| 173 |
action_name = pick_action(client, obs, step)
|
|
@@ -175,13 +191,14 @@ async def run_episode(
|
|
| 175 |
|
| 176 |
try:
|
| 177 |
obs, reward_info = await env.step(action)
|
| 178 |
-
except Exception as exc:
|
| 179 |
-
log_step(step=step, action=action_name, reward=0.0, done=True, error=str(exc))
|
| 180 |
-
rewards.append(0.0)
|
| 181 |
steps_taken = step
|
| 182 |
break
|
| 183 |
|
| 184 |
-
|
|
|
|
| 185 |
done = reward_info.done
|
| 186 |
last_error = obs.error_message
|
| 187 |
success = reward_info.success
|
|
@@ -200,14 +217,14 @@ async def run_episode(
|
|
| 200 |
if done:
|
| 201 |
break
|
| 202 |
|
| 203 |
-
# Score: average of per-step rewards. Clamp strictly inside (0, 1)
|
| 204 |
-
# with margin >= 0.005 so f"{score:.3f}" never formats to "0.000" or "1.000".
|
| 205 |
-
_EPS = 0.01
|
| 206 |
denom = max(len(rewards), 1)
|
| 207 |
-
avg = sum(rewards) / denom if rewards else 0.0
|
| 208 |
-
score = max(_EPS, min(1.0 - _EPS, avg))
|
| 209 |
|
| 210 |
finally:
|
|
|
|
|
|
|
|
|
|
| 211 |
log_end(
|
| 212 |
success=success,
|
| 213 |
steps=steps_taken,
|
|
|
|
| 152 |
|
| 153 |
# ββ Single-episode runner βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 154 |
|
| 155 |
+
_SCORE_EPS = 0.05 # strict (0, 1) with generous margin for :.2f/:.3f rounding
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def _clamp_score(x: float) -> float:
|
| 159 |
+
"""Clamp to strictly (0, 1). Uses 0.05 margin so :.2f/:.3f formatting stays safe."""
|
| 160 |
+
if x != x: # NaN
|
| 161 |
+
return 0.5
|
| 162 |
+
return max(_SCORE_EPS, min(1.0 - _SCORE_EPS, x))
|
| 163 |
+
|
| 164 |
+
|
| 165 |
async def run_episode(
|
| 166 |
env: SQLAgentEnv,
|
| 167 |
client: OpenAI,
|
|
|
|
| 172 |
|
| 173 |
rewards: List[float] = []
|
| 174 |
steps_taken = 0
|
| 175 |
+
score = _SCORE_EPS
|
| 176 |
success = False
|
| 177 |
last_error: Optional[str] = None
|
| 178 |
|
| 179 |
try:
|
| 180 |
+
try:
|
| 181 |
+
obs = env.reset(task_id)
|
| 182 |
+
except Exception as exc:
|
| 183 |
+
log_step(step=1, action="reset", reward=_SCORE_EPS, done=True, error=str(exc))
|
| 184 |
+
rewards.append(_SCORE_EPS)
|
| 185 |
+
steps_taken = 1
|
| 186 |
+
return
|
| 187 |
|
| 188 |
for step in range(1, MAX_STEPS + 1):
|
| 189 |
action_name = pick_action(client, obs, step)
|
|
|
|
| 191 |
|
| 192 |
try:
|
| 193 |
obs, reward_info = await env.step(action)
|
| 194 |
+
except Exception as exc:
|
| 195 |
+
log_step(step=step, action=action_name, reward=_SCORE_EPS, done=True, error=str(exc))
|
| 196 |
+
rewards.append(_SCORE_EPS)
|
| 197 |
steps_taken = step
|
| 198 |
break
|
| 199 |
|
| 200 |
+
raw_reward = reward_info.value if reward_info.value is not None else _SCORE_EPS
|
| 201 |
+
reward = _clamp_score(raw_reward)
|
| 202 |
done = reward_info.done
|
| 203 |
last_error = obs.error_message
|
| 204 |
success = reward_info.success
|
|
|
|
| 217 |
if done:
|
| 218 |
break
|
| 219 |
|
|
|
|
|
|
|
|
|
|
| 220 |
denom = max(len(rewards), 1)
|
| 221 |
+
avg = sum(rewards) / denom if rewards else _SCORE_EPS
|
| 222 |
+
score = _clamp_score(avg)
|
| 223 |
|
| 224 |
finally:
|
| 225 |
+
# Final safety net: score and every reward must be strictly in (0, 1)
|
| 226 |
+
score = _clamp_score(score)
|
| 227 |
+
rewards = [_clamp_score(r) for r in rewards]
|
| 228 |
log_end(
|
| 229 |
success=success,
|
| 230 |
steps=steps_taken,
|
openenv.yaml
CHANGED
|
@@ -82,12 +82,11 @@ observation_space:
|
|
| 82 |
|
| 83 |
# ββ Reward βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 84 |
reward:
|
| 85 |
-
range: [
|
| 86 |
description: >
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
Penalizes infinite loops (consecutive same error) and rewards convergence toward correct SQL.
|
| 91 |
|
| 92 |
# ββ Tasks ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 93 |
tasks:
|
|
|
|
| 82 |
|
| 83 |
# ββ Reward βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 84 |
reward:
|
| 85 |
+
range: [0.0, 1.0]
|
| 86 |
description: >
|
| 87 |
+
Task score is the grader output clamped strictly inside (0, 1). Graders
|
| 88 |
+
score partial progress (column correctness, row-count match) and apply
|
| 89 |
+
attempt penalties for multi-step repair episodes.
|
|
|
|
| 90 |
|
| 91 |
# ββ Tasks ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 92 |
tasks:
|