ar9avg commited on
Commit
ba69b5f
·
1 Parent(s): b86d426

Widen score epsilon to 0.01 so :.3f formatting stays in (0, 1)

Browse files

The validator parses score=X.XXX from the [END] line of inference.py
stdout. With eps=1e-9 and :.3f formatting, scores could round to "0.000"
or "1.000", which the validator reads as exactly 0.0 or 1.0.

Using eps=0.01 guarantees the formatted score always lies in the
inclusive range [0.010, 0.990], so it never formats as "0.000" or "1.000".

Files changed (2) hide show
  1. backend/env/tasks.py +1 -1
  2. inference.py +6 -5
backend/env/tasks.py CHANGED
@@ -330,7 +330,7 @@ def get_all_tasks() -> list[Task]:
330
  return list(TASKS.values())
331
 
332
 
333
- _EPS = 1e-9
334
 
335
 
336
  def grade_response(
 
330
  return list(TASKS.values())
331
 
332
 
333
+ _EPS = 0.01 # wide enough that f"{x:.3f}" never rounds to 0.000 or 1.000
334
 
335
 
336
  def grade_response(
inference.py CHANGED
@@ -200,11 +200,12 @@ async def run_episode(
200
  if done:
201
  break
202
 
203
- # Score: average of per-step rewards (each already in (0,1) from env)
204
- # Clamp to [0, 1] as a safety net
205
- total = sum(rewards)
206
- max_possible = MAX_STEPS * 1.0
207
- score = min(max(total / max_possible if max_possible > 0 else 0.0, 0.0), 1.0)
 
208
 
209
  finally:
210
  log_end(
 
200
  if done:
201
  break
202
 
203
+ # Score: average of per-step rewards. Clamp strictly inside (0, 1)
204
+ # with margin >= 0.005 so f"{score:.3f}" never formats to "0.000" or "1.000".
205
+ _EPS = 0.01
206
+ denom = max(len(rewards), 1)
207
+ avg = sum(rewards) / denom if rewards else _EPS
208
+ score = max(_EPS, min(1.0 - _EPS, avg))
209
 
210
  finally:
211
  log_end(