Commit f96532b · Parent(s): 699f953
Fix syntax of [END] STDOUT line to perfectly match Hackathon mandatory format with score= parameter
Files changed: inference.py (+5 -5)
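The change itself is small: a score= field is inserted between steps= and rewards= in the per-task [END] summary line. As a rough before/after sketch of that line, using made-up values in place of the run_task() variables (success, step_num, score, rewards_str):

# Hypothetical values, for illustration only; the real ones are produced inside run_task() in inference.py.
success, step_num, score, rewards_str = True, 12, 0.8731, "0.10,0.25,0.52"

# Before this commit (no score= field):
print(f"[END] success={str(success).lower()} steps={step_num} rewards={rewards_str}", flush=True)
# -> [END] success=true steps=12 rewards=0.10,0.25,0.52

# After this commit (score= inserted between steps= and rewards=):
print(f"[END] success={str(success).lower()} steps={step_num} score={score:.4f} rewards={rewards_str}", flush=True)
# -> [END] success=true steps=12 score=0.8731 rewards=0.10,0.25,0.52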
inference.py (CHANGED)
@@ -266,12 +266,12 @@ def run_task(client: OpenAI, task_id: str) -> tuple:
     except Exception as e:
         # Env unreachable — must still emit [START] and [END]
         print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
-        print(f"[END] success=false steps=0 rewards=0.01", flush=True)
+        print(f"[END] success=false steps=0 score=0.01 rewards=0.01", flush=True)
         return 0.01, False
 
     if "error" in data and not data.get("episode_id"):
         print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
-        print(f"[END] success=false steps=0 rewards=0.01", flush=True)
+        print(f"[END] success=false steps=0 score=0.01 rewards=0.01", flush=True)
         return 0.01, False
 
     episode_id = data.get("episode_id", "unknown")
@@ -378,7 +378,7 @@ def run_task(client: OpenAI, task_id: str) -> tuple:
     # spec: success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
     # NO score= field — not in the official spec
     print(
-        f"[END] success={str(success).lower()} steps={step_num} rewards={rewards_str}",
+        f"[END] success={str(success).lower()} steps={step_num} score={score:.4f} rewards={rewards_str}",
         flush=True
     )
 
@@ -420,13 +420,13 @@ def main() -> None:
             if remaining not in scores:
                 scores[remaining] = 0.01
                 print(f"[START] task={remaining} env={BENCHMARK} model={MODEL_NAME}", flush=True)
-                print(f"[END] success=false steps=0 rewards=0.01", flush=True)
+                print(f"[END] success=false steps=0 score=0.01 rewards=0.01", flush=True)
                 break
 
         except Exception as e:
             scores[task_id] = 0.01
             print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
-            print(f"[END] success=false steps=0 rewards=0.01", flush=True)
+            print(f"[END] success=false steps=0 score=0.01 rewards=0.01", flush=True)
 
     avg = round(sum(scores.values()) / max(len(scores), 1), 4)
     print(f"\n✅ All tasks complete! Average: {avg:.4f}", flush=True)
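The last two context lines of the final hunk are untouched by the commit: they average whatever ended up in scores, and the max(len(scores), 1) guard keeps the division defined even if no task ever recorded a score. A tiny worked example under that reading (the dictionary contents are invented):

# Hypothetical scores dict: one floored failure (0.01) and one scored task.
scores = {"task_a": 0.01, "task_b": 0.80}
avg = round(sum(scores.values()) / max(len(scores), 1), 4)
print(f"\n✅ All tasks complete! Average: {avg:.4f}", flush=True)  # -> Average: 0.4050

# With an empty dict the guard avoids ZeroDivisionError and the average is simply 0.0.
empty = {}
print(round(sum(empty.values()) / max(len(empty), 1), 4))  # -> 0.0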
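Since the whole point of the commit is to emit a line a grader can parse, here is one way such a line could be consumed on the other side. This parser is purely illustrative: it assumes only the field order visible in the diff and is not part of this Space or of any official Hackathon tooling.

import re

# Illustrative parser for the [END] line emitted by inference.py; the real grader's logic is unknown.
END_RE = re.compile(
    r"\[END\] success=(?P<success>true|false) steps=(?P<steps>\d+) "
    r"score=(?P<score>[0-9.]+) rewards=(?P<rewards>[0-9.,]+)"
)

def parse_end_line(line: str) -> dict:
    match = END_RE.search(line)
    if match is None:
        raise ValueError(f"not a recognizable [END] line: {line!r}")
    return {
        "success": match.group("success") == "true",
        "steps": int(match.group("steps")),
        "score": float(match.group("score")),
        "rewards": [float(r) for r in match.group("rewards").split(",")],
    }

print(parse_end_line("[END] success=false steps=0 score=0.01 rewards=0.01"))
# -> {'success': False, 'steps': 0, 'score': 0.01, 'rewards': [0.01]}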