Spaces:
Sleeping
Sleeping
eeshwar143 committed on
Commit ·
4c21555
1
Parent(s): dd3c25c
Clamp task scores to open interval
Browse files- inference.py +12 -4
- support_queue_env/grading.py +8 -1
inference.py
CHANGED
|
@@ -49,6 +49,11 @@ ALLOW_DIRECT_OPENAI = os.getenv("ALLOW_DIRECT_OPENAI") == "1"
|
|
| 49 |
BENCHMARK = "support_queue_env"
|
| 50 |
SUCCESS_SCORE_THRESHOLD = 0.80
|
| 51 |
MAX_TOKENS = 250
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
def log_start(task: str, env: str, model: str) -> None:
|
|
@@ -303,7 +308,7 @@ async def run_task(client: Any, env: SupportQueueEnv, task: TaskCard) -> dict[st
|
|
| 303 |
history: List[str] = []
|
| 304 |
rewards: List[float] = []
|
| 305 |
steps_taken = 0
|
| 306 |
-
score = 0.0
|
| 307 |
success = False
|
| 308 |
|
| 309 |
log_start(task=task.task_id, env=BENCHMARK, model=MODEL_NAME)
|
|
@@ -344,7 +349,7 @@ async def run_task(client: Any, env: SupportQueueEnv, task: TaskCard) -> dict[st
|
|
| 344 |
break
|
| 345 |
|
| 346 |
score = sum(rewards) / len(rewards) if rewards else 0.0
|
| 347 |
-
score =
|
| 348 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 349 |
|
| 350 |
except Exception as exc:
|
|
@@ -377,11 +382,11 @@ async def main() -> None:
|
|
| 377 |
print(f"[DEBUG] Environment bootstrap failed: {exc}", flush=True)
|
| 378 |
for task in tasks:
|
| 379 |
log_start(task=task.task_id, env=BENCHMARK, model=MODEL_NAME)
|
| 380 |
-
log_end(success=False, steps=0, score=0.0, rewards=[])
|
| 381 |
results.append(
|
| 382 |
{
|
| 383 |
"task_id": task.task_id,
|
| 384 |
-
"score": 0.0,
|
| 385 |
"steps": 0,
|
| 386 |
"rewards": [],
|
| 387 |
"success": False,
|
|
@@ -409,3 +414,6 @@ if __name__ == "__main__":
|
|
| 409 |
asyncio.run(main())
|
| 410 |
except Exception as exc:
|
| 411 |
print(f"[DEBUG] Fatal inference error: {exc}", flush=True)
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
BENCHMARK = "support_queue_env"
|
| 50 |
SUCCESS_SCORE_THRESHOLD = 0.80
|
| 51 |
MAX_TOKENS = 250
|
| 52 |
+
SCORE_EPSILON = 0.0001


def clamp_task_score(score: float) -> float:
    """Clamp *score* into the open interval (0, 1).

    Values at or below 0.0 become SCORE_EPSILON; values at or above 1.0
    become 1.0 - SCORE_EPSILON. In-range scores pass through unchanged.
    """
    # Same nesting as the canonical clamp: raise the floor first, then cap.
    floored = max(score, SCORE_EPSILON)
    return min(floored, 1.0 - SCORE_EPSILON)
|
| 57 |
|
| 58 |
|
| 59 |
def log_start(task: str, env: str, model: str) -> None:
|
|
|
|
| 308 |
history: List[str] = []
|
| 309 |
rewards: List[float] = []
|
| 310 |
steps_taken = 0
|
| 311 |
+
score = clamp_task_score(0.0)
|
| 312 |
success = False
|
| 313 |
|
| 314 |
log_start(task=task.task_id, env=BENCHMARK, model=MODEL_NAME)
|
|
|
|
| 349 |
break
|
| 350 |
|
| 351 |
score = sum(rewards) / len(rewards) if rewards else 0.0
|
| 352 |
+
score = clamp_task_score(score)
|
| 353 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 354 |
|
| 355 |
except Exception as exc:
|
|
|
|
| 382 |
print(f"[DEBUG] Environment bootstrap failed: {exc}", flush=True)
|
| 383 |
for task in tasks:
|
| 384 |
log_start(task=task.task_id, env=BENCHMARK, model=MODEL_NAME)
|
| 385 |
+
log_end(success=False, steps=0, score=clamp_task_score(0.0), rewards=[])
|
| 386 |
results.append(
|
| 387 |
{
|
| 388 |
"task_id": task.task_id,
|
| 389 |
+
"score": clamp_task_score(0.0),
|
| 390 |
"steps": 0,
|
| 391 |
"rewards": [],
|
| 392 |
"success": False,
|
|
|
|
| 414 |
asyncio.run(main())
|
| 415 |
except Exception as exc:
|
| 416 |
print(f"[DEBUG] Fatal inference error: {exc}", flush=True)
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
|
support_queue_env/grading.py
CHANGED
|
@@ -8,6 +8,11 @@ from support_queue_env.models import GradingBreakdown, SupportQueueAction, Ticke
|
|
| 8 |
from support_queue_env.tasks import TicketSpec
|
| 9 |
|
| 10 |
PRIORITY_ORDER = ["P1", "P2", "P3", "P4"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
def _normalize(text: str) -> str:
|
|
@@ -72,7 +77,7 @@ def grade_ticket(ticket: TicketSpec, action: SupportQueueAction) -> TicketFeedba
|
|
| 72 |
+ breakdown.response_score
|
| 73 |
+ breakdown.penalty
|
| 74 |
)
|
| 75 |
-
breakdown.total =
|
| 76 |
|
| 77 |
matched_summary = summary_hits if ticket.summary_keywords else 0
|
| 78 |
matched_response = response_hits if ticket.response_keywords else 0
|
|
@@ -92,3 +97,5 @@ def grade_ticket(ticket: TicketSpec, action: SupportQueueAction) -> TicketFeedba
|
|
| 92 |
breakdown=breakdown,
|
| 93 |
feedback=feedback,
|
| 94 |
)
|
|
|
|
|
|
|
|
|
| 8 |
from support_queue_env.tasks import TicketSpec
|
| 9 |
|
| 10 |
PRIORITY_ORDER = ["P1", "P2", "P3", "P4"]
|
| 11 |
+
SCORE_EPSILON = 0.001
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _open_unit_interval(score: float) -> float:
|
| 15 |
+
return round(min(max(score, SCORE_EPSILON), 1.0 - SCORE_EPSILON), 4)
|
| 16 |
|
| 17 |
|
| 18 |
def _normalize(text: str) -> str:
|
|
|
|
| 77 |
+ breakdown.response_score
|
| 78 |
+ breakdown.penalty
|
| 79 |
)
|
| 80 |
+
breakdown.total = _open_unit_interval(total)
|
| 81 |
|
| 82 |
matched_summary = summary_hits if ticket.summary_keywords else 0
|
| 83 |
matched_response = response_hits if ticket.response_keywords else 0
|
|
|
|
| 97 |
breakdown=breakdown,
|
| 98 |
feedback=feedback,
|
| 99 |
)
|
| 100 |
+
|
| 101 |
+
|