Spaces:
Sleeping
Sleeping
fix: score condition
Browse files
- inference.py +4 -4
inference.py
CHANGED
|
@@ -210,7 +210,7 @@ async def run_episode(
|
|
| 210 |
print(f"[STEP] step={step} action={action.action_type} reward=0.01 done=true error={e}", flush=True)
|
| 211 |
break
|
| 212 |
obs = result.observation
|
| 213 |
-
reward = round(min(0.99, result.reward or 0.01), 2)
|
| 214 |
done = result.done
|
| 215 |
if action.action_type in ("inspect_logs", "inspect_config", "inspect_gradients"):
|
| 216 |
source = action.action_type.replace("inspect_", "")
|
|
@@ -229,7 +229,7 @@ async def run_episode(
|
|
| 229 |
break
|
| 230 |
|
| 231 |
# WebSocket is closed — safe to call the judge now
|
| 232 |
-
keyword_score = rewards[-1] if rewards else 0.01
|
| 233 |
judge_score: float | None = None
|
| 234 |
if submit_action is not None:
|
| 235 |
judge_score = llm_judge(
|
|
@@ -242,10 +242,10 @@ async def run_episode(
|
|
| 242 |
inspection_order=inspection_order,
|
| 243 |
)
|
| 244 |
if judge_score is None:
|
| 245 |
-
score = round(keyword_score, 2)
|
| 246 |
# print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.2f} reasoning=n/a total={score:.2f}", file=sys.stderr, flush=True)
|
| 247 |
else:
|
| 248 |
-
score = round(0.85 * keyword_score + 0.15 * judge_score, 2)
|
| 249 |
# print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning={judge_score:.3f} total={score:.3f}", file=sys.stderr, flush=True)
|
| 250 |
|
| 251 |
success = score >= SUCCESS_THRESHOLD
|
|
|
|
| 210 |
print(f"[STEP] step={step} action={action.action_type} reward=0.01 done=true error={e}", flush=True)
|
| 211 |
break
|
| 212 |
obs = result.observation
|
| 213 |
+
reward = round(max(0.01, min(0.99, result.reward or 0.01)), 2)
|
| 214 |
done = result.done
|
| 215 |
if action.action_type in ("inspect_logs", "inspect_config", "inspect_gradients"):
|
| 216 |
source = action.action_type.replace("inspect_", "")
|
|
|
|
| 229 |
break
|
| 230 |
|
| 231 |
# WebSocket is closed — safe to call the judge now
|
| 232 |
+
keyword_score = max(0.01, min(0.99, rewards[-1])) if rewards else 0.01
|
| 233 |
judge_score: float | None = None
|
| 234 |
if submit_action is not None:
|
| 235 |
judge_score = llm_judge(
|
|
|
|
| 242 |
inspection_order=inspection_order,
|
| 243 |
)
|
| 244 |
if judge_score is None:
|
| 245 |
+
score = round(max(0.01, min(0.99, keyword_score)), 2)
|
| 246 |
# print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.2f} reasoning=n/a total={score:.2f}", file=sys.stderr, flush=True)
|
| 247 |
else:
|
| 248 |
+
score = round(max(0.01, min(0.99, 0.85 * keyword_score + 0.15 * judge_score)), 2)
|
| 249 |
# print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning={judge_score:.3f} total={score:.3f}", file=sys.stderr, flush=True)
|
| 250 |
|
| 251 |
success = score >= SUCCESS_THRESHOLD
|