mathi3046 commited on
Commit
6c591d0
Β·
1 Parent(s): 3932d4b

last 0 and 1 error update

Browse files
Files changed (8) hide show
  1. __init__.py +2 -0
  2. grader.py +35 -81
  3. inference.py +100 -38
  4. models.py +68 -10
  5. pyproject.toml +1 -1
  6. server/app.py +23 -18
  7. server/environment.py +19 -7
  8. validate.py +12 -8
__init__.py CHANGED
@@ -11,6 +11,7 @@ from .models import (
11
  SupportState,
12
  RewardBreakdown,
13
  StepResult,
 
14
  )
15
  from .server.environment import CustomerSupportEnvironment
16
 
@@ -21,6 +22,7 @@ __all__ = [
21
  "SupportState",
22
  "RewardBreakdown",
23
  "StepResult",
 
24
  ]
25
 
26
  __version__ = "1.0.0"
 
11
  SupportState,
12
  RewardBreakdown,
13
  StepResult,
14
+ safe_score,
15
  )
16
  from .server.environment import CustomerSupportEnvironment
17
 
 
22
  "SupportState",
23
  "RewardBreakdown",
24
  "StepResult",
25
+ "safe_score",
26
  ]
27
 
28
  __version__ = "1.0.0"
grader.py CHANGED
@@ -9,43 +9,17 @@ Evaluates agent responses on three axes:
9
  Returns a RewardBreakdown with a total score in (0.0, 1.0) β€” strict open interval.
10
 
11
  IMPORTANT β€” Every numeric score produced by this module is passed through
12
- ``normalize_score`` before it leaves the grader so that the evaluator NEVER
13
  receives a boundary value (0.0 or 1.0).
14
  """
15
 
 
16
  import re
17
  from typing import Any, Dict, List
18
 
19
- from models import RewardBreakdown
20
 
21
-
22
- # ──────────────────────────────────────────────────────────────────
23
- # Central score normaliser β€” THE single source of truth
24
- # ──────────────────────────────────────────────────────────────────
25
-
26
- # Strict open-interval bounds: scores must never be exactly 0.0 or 1.0
27
- _SCORE_FLOOR = 0.0001
28
- _SCORE_CEIL = 0.9999
29
-
30
-
31
- def normalize_score(value: Any) -> float:
32
- """Clamp *value* into the strict open interval (0, 1).
33
-
34
- * ``None`` β†’ 0.5
35
- * anything that cannot be converted to float β†’ 0.5
36
- * values ≀ 0 β†’ ``_SCORE_FLOOR``
37
- * values β‰₯ 1 β†’ ``_SCORE_CEIL``
38
- """
39
- if value is None:
40
- return 0.5
41
- try:
42
- v = float(value)
43
- except (TypeError, ValueError):
44
- return 0.5
45
- # Guard against NaN / Inf
46
- if v != v or v == float('inf') or v == float('-inf'):
47
- return 0.5
48
- return max(_SCORE_FLOOR, min(_SCORE_CEIL, v))
49
 
50
 
51
  def _normalise(text: str) -> str:
@@ -68,18 +42,16 @@ def _score_correctness(
68
  norm = _normalise(response)
69
  criteria = rubric.get("criteria", [])
70
  if not criteria:
71
- # No rubric β†’ return a safe neutral score, never 0.0
72
- return normalize_score(0.1)
73
 
74
  total = 0.0
75
  for criterion in criteria:
76
  kw_group: List[str] = criterion.get("keyword_group", [])
77
  points: float = criterion.get("points", 0.0)
78
- # Award points if ANY keyword in the group is found
79
  if any(kw.lower() in norm for kw in kw_group):
80
  total += points
81
 
82
- return normalize_score(total)
83
 
84
 
85
  # ──────────────────────────────────────────────────────────────────
@@ -102,33 +74,27 @@ def _score_tone(
102
  positive_signals: List[str] = criteria.get("positive_signals", [])
103
  negative_signals: List[str] = criteria.get("negative_signals", [])
104
 
105
- # Count matches
106
  pos_count = sum(1 for sig in positive_signals if sig.lower() in norm)
107
  neg_count = sum(1 for sig in negative_signals if sig.lower() in norm)
108
 
109
- # Base score: 0.5 (neutral)
110
  score = 0.5
111
 
112
- # Each positive signal adds points (diminishing returns)
113
  if positive_signals:
114
  pos_ratio = pos_count / len(positive_signals)
115
- score += pos_ratio * 0.4 # max +0.4 from positives (keeps below 1.0)
116
 
117
- # Each negative signal deducts heavily
118
  if neg_count > 0:
119
- score -= min(neg_count * 0.2, 0.4) # max -0.4 from negatives (keeps above 0.0)
120
 
121
- # Additional length/quality checks
122
  word_count = len(norm.split())
123
  if word_count < 10:
124
- score -= 0.1 # Too terse is often rude
125
 
126
- # Check if response uses ALL CAPS excessively
127
  upper_ratio = sum(1 for c in response if c.isupper()) / max(len(response), 1)
128
  if upper_ratio > 0.4 and len(response) > 20:
129
- score -= 0.05 # Shouting in response
130
 
131
- return normalize_score(score)
132
 
133
 
134
  # ──────────────────────────────────────────────────────────────────
@@ -148,8 +114,7 @@ def _score_completeness(
148
  norm = _normalise(response)
149
  criteria = rubric.get("criteria", [])
150
  if not criteria:
151
- # No rubric β†’ return a safe neutral score, never 0.0
152
- return normalize_score(0.1)
153
 
154
  total = 0.0
155
  for criterion in criteria:
@@ -157,14 +122,12 @@ def _score_completeness(
157
  points = criterion.get("points", 0.0)
158
 
159
  if check == "addresses_question" or check == "addresses_defect":
160
- # Check if response directly addresses the main issue
161
  subject = _normalise(ticket_info.get("subject", ""))
162
  subject_words = [w for w in subject.split() if len(w) > 3]
163
  if any(w in norm for w in subject_words) or len(norm.split()) > 20:
164
  total += points
165
 
166
  elif check == "provides_next_steps":
167
- # Check for actionable next steps
168
  step_indicators = [
169
  "will", "can", "please", "next step", "process",
170
  "we'll", "i'll", "going to", "let me", "i can",
@@ -174,15 +137,13 @@ def _score_completeness(
174
  total += points
175
 
176
  elif check == "references_order":
177
- # Check if the specific order ID is referenced
178
  order_id = ticket_info.get("order_id", "")
179
  if order_id and order_id.lower() in norm:
180
  total += points
181
  elif "order" in norm:
182
- total += points * 0.5 # Partial credit for mentioning order
183
 
184
  elif check == "explains_policy":
185
- # Check if relevant policy details are mentioned
186
  policy_terms = [
187
  "policy", "within", "days", "eligible", "qualify",
188
  "terms", "condition", "guideline",
@@ -191,7 +152,6 @@ def _score_completeness(
191
  total += points
192
 
193
  elif check == "provides_process":
194
- # Check if return/refund process is outlined
195
  process_terms = [
196
  "step", "first", "then", "send", "ship", "return",
197
  "label", "process", "receive", "refund",
@@ -200,13 +160,11 @@ def _score_completeness(
200
  total += points
201
 
202
  elif check == "offers_options":
203
- # Check if multiple options are presented
204
  option_indicators = ["or", "option", "alternative", "either", "choose", "prefer"]
205
  if any(ind in norm for ind in option_indicators):
206
  total += points
207
 
208
  elif check == "acknowledges_all_issues":
209
- # For hard task: must address multiple issues
210
  issues_to_address = ["wrong", "late", "delay", "rude", "staff", "agent"]
211
  addressed = sum(1 for iss in issues_to_address if iss in norm)
212
  if addressed >= 3:
@@ -217,7 +175,6 @@ def _score_completeness(
217
  total += points * 0.3
218
 
219
  elif check == "concrete_resolution":
220
- # Check for concrete actions, not just apologies
221
  concrete_terms = [
222
  "refund", "replacement", "ship", "send", "credit",
223
  "discount", "expedite", "priority", "immediately",
@@ -227,7 +184,6 @@ def _score_completeness(
227
  total += points
228
 
229
  elif check == "timeline":
230
- # Check if specific timelines are given
231
  time_patterns = [
232
  r"\d+\s*(hour|day|week|business day)",
233
  r"within\s+\d+",
@@ -241,17 +197,15 @@ def _score_completeness(
241
  total += points
242
 
243
  elif check == "empathy":
244
- # Check for empathetic language
245
  empathy_terms = [
246
  "understand", "frustrat", "sorry", "apologize",
247
  "inconvenience", "disappoint", "concern",
248
- "appreciate your patience", "I hear you",
249
  ]
250
  if sum(1 for t in empathy_terms if t in norm) >= 2:
251
  total += points
252
 
253
  elif check == "follow_up_plan":
254
- # Check for follow-up commitments
255
  follow_up_terms = [
256
  "follow up", "follow-up", "check back", "update you",
257
  "keep you informed", "contact you", "reach out",
@@ -260,7 +214,7 @@ def _score_completeness(
260
  if any(t in norm for t in follow_up_terms):
261
  total += points
262
 
263
- return normalize_score(total)
264
 
265
 
266
  # ──────────────────────────────────────────────────────────────────
@@ -278,11 +232,9 @@ def _compute_penalties(
278
  norm = _normalise(response)
279
  penalty = 0.0
280
 
281
- # Penalty: empty or near-empty response
282
  if len(norm.split()) < 5:
283
  penalty -= 0.2
284
 
285
- # Penalty: repeated response (copy-paste from previous)
286
  if conversation_history:
287
  prev_agent_msgs = [
288
  _normalise(m.get("content", ""))
@@ -297,7 +249,6 @@ def _compute_penalties(
297
  penalty -= 0.1
298
  break
299
 
300
- # Penalty: harmful/inappropriate content
301
  harmful_patterns = [
302
  "kill", "die", "hate you", "shut up", "idiot",
303
  "moron", "loser", "go away",
@@ -305,7 +256,6 @@ def _compute_penalties(
305
  if any(pat in norm for pat in harmful_patterns):
306
  penalty -= 0.3
307
 
308
- # Penalty: completely irrelevant response
309
  irrelevant_signals = [
310
  "weather", "recipe", "joke", "game score",
311
  "political", "stock market",
@@ -336,18 +286,19 @@ def grade_response(
336
  conversation_history: Previous messages
337
 
338
  Returns:
339
- RewardBreakdown with ALL scores in strict (0.0, 1.0) open interval
 
340
  """
341
- # Score each axis β€” normalize_score guarantees (0, 1)
342
- correctness = normalize_score(_score_correctness(
343
  response,
344
  grading_rubric.get("correctness", {}),
345
  ))
346
- tone = normalize_score(_score_tone(
347
  response,
348
  grading_rubric.get("tone", {}),
349
  ))
350
- completeness = normalize_score(_score_completeness(
351
  response,
352
  grading_rubric.get("completeness", {}),
353
  ticket_info,
@@ -369,16 +320,18 @@ def grade_response(
369
  + completeness * w_completeness
370
  )
371
 
372
- # Apply penalties β€” normalize_score guarantees strict (0, 1)
373
- total = normalize_score(weighted + penalties)
374
 
375
  # The efficiency field re-uses the weighted pre-penalty score
376
- efficiency = normalize_score(weighted)
377
 
378
  # Debug logging
379
- print(f"[DEBUG] correctness={correctness:.4f} tone={tone:.4f} "
380
- f"completeness={completeness:.4f} weighted={weighted:.4f} "
381
- f"penalties={penalties:.4f} total={total:.4f}")
 
 
382
 
383
  # Build explanation
384
  parts = []
@@ -389,12 +342,13 @@ def grade_response(
389
  parts.append(f"Penalties: {penalties:.4f}")
390
  parts.append(f"Total: {total:.4f}")
391
 
 
392
  return RewardBreakdown(
393
- correctness=normalize_score(correctness),
394
- tone=normalize_score(tone),
395
- completeness=normalize_score(completeness),
396
- efficiency=normalize_score(efficiency),
397
  penalties=round(penalties, 4),
398
- total=normalize_score(total),
399
  explanation=" | ".join(parts),
400
  )
 
9
  Returns a RewardBreakdown with a total score in (0.0, 1.0) β€” strict open interval.
10
 
11
  IMPORTANT β€” Every numeric score produced by this module is passed through
12
+ ``safe_score`` before it leaves the grader so that the evaluator NEVER
13
  receives a boundary value (0.0 or 1.0).
14
  """
15
 
16
+ import logging
17
  import re
18
  from typing import Any, Dict, List
19
 
20
+ from models import RewardBreakdown, safe_score
21
 
22
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
 
25
  def _normalise(text: str) -> str:
 
42
  norm = _normalise(response)
43
  criteria = rubric.get("criteria", [])
44
  if not criteria:
45
+ return safe_score(0.1)
 
46
 
47
  total = 0.0
48
  for criterion in criteria:
49
  kw_group: List[str] = criterion.get("keyword_group", [])
50
  points: float = criterion.get("points", 0.0)
 
51
  if any(kw.lower() in norm for kw in kw_group):
52
  total += points
53
 
54
+ return safe_score(total)
55
 
56
 
57
  # ──────────────────────────────────────────────────────────────────
 
74
  positive_signals: List[str] = criteria.get("positive_signals", [])
75
  negative_signals: List[str] = criteria.get("negative_signals", [])
76
 
 
77
  pos_count = sum(1 for sig in positive_signals if sig.lower() in norm)
78
  neg_count = sum(1 for sig in negative_signals if sig.lower() in norm)
79
 
 
80
  score = 0.5
81
 
 
82
  if positive_signals:
83
  pos_ratio = pos_count / len(positive_signals)
84
+ score += pos_ratio * 0.4
85
 
 
86
  if neg_count > 0:
87
+ score -= min(neg_count * 0.2, 0.4)
88
 
 
89
  word_count = len(norm.split())
90
  if word_count < 10:
91
+ score -= 0.1
92
 
 
93
  upper_ratio = sum(1 for c in response if c.isupper()) / max(len(response), 1)
94
  if upper_ratio > 0.4 and len(response) > 20:
95
+ score -= 0.05
96
 
97
+ return safe_score(score)
98
 
99
 
100
  # ──────────────────────────────────────────────────────────────────
 
114
  norm = _normalise(response)
115
  criteria = rubric.get("criteria", [])
116
  if not criteria:
117
+ return safe_score(0.1)
 
118
 
119
  total = 0.0
120
  for criterion in criteria:
 
122
  points = criterion.get("points", 0.0)
123
 
124
  if check == "addresses_question" or check == "addresses_defect":
 
125
  subject = _normalise(ticket_info.get("subject", ""))
126
  subject_words = [w for w in subject.split() if len(w) > 3]
127
  if any(w in norm for w in subject_words) or len(norm.split()) > 20:
128
  total += points
129
 
130
  elif check == "provides_next_steps":
 
131
  step_indicators = [
132
  "will", "can", "please", "next step", "process",
133
  "we'll", "i'll", "going to", "let me", "i can",
 
137
  total += points
138
 
139
  elif check == "references_order":
 
140
  order_id = ticket_info.get("order_id", "")
141
  if order_id and order_id.lower() in norm:
142
  total += points
143
  elif "order" in norm:
144
+ total += points * 0.5
145
 
146
  elif check == "explains_policy":
 
147
  policy_terms = [
148
  "policy", "within", "days", "eligible", "qualify",
149
  "terms", "condition", "guideline",
 
152
  total += points
153
 
154
  elif check == "provides_process":
 
155
  process_terms = [
156
  "step", "first", "then", "send", "ship", "return",
157
  "label", "process", "receive", "refund",
 
160
  total += points
161
 
162
  elif check == "offers_options":
 
163
  option_indicators = ["or", "option", "alternative", "either", "choose", "prefer"]
164
  if any(ind in norm for ind in option_indicators):
165
  total += points
166
 
167
  elif check == "acknowledges_all_issues":
 
168
  issues_to_address = ["wrong", "late", "delay", "rude", "staff", "agent"]
169
  addressed = sum(1 for iss in issues_to_address if iss in norm)
170
  if addressed >= 3:
 
175
  total += points * 0.3
176
 
177
  elif check == "concrete_resolution":
 
178
  concrete_terms = [
179
  "refund", "replacement", "ship", "send", "credit",
180
  "discount", "expedite", "priority", "immediately",
 
184
  total += points
185
 
186
  elif check == "timeline":
 
187
  time_patterns = [
188
  r"\d+\s*(hour|day|week|business day)",
189
  r"within\s+\d+",
 
197
  total += points
198
 
199
  elif check == "empathy":
 
200
  empathy_terms = [
201
  "understand", "frustrat", "sorry", "apologize",
202
  "inconvenience", "disappoint", "concern",
203
+ "appreciate your patience", "i hear you",
204
  ]
205
  if sum(1 for t in empathy_terms if t in norm) >= 2:
206
  total += points
207
 
208
  elif check == "follow_up_plan":
 
209
  follow_up_terms = [
210
  "follow up", "follow-up", "check back", "update you",
211
  "keep you informed", "contact you", "reach out",
 
214
  if any(t in norm for t in follow_up_terms):
215
  total += points
216
 
217
+ return safe_score(total)
218
 
219
 
220
  # ──────────────────────────────────────────────────────────────────
 
232
  norm = _normalise(response)
233
  penalty = 0.0
234
 
 
235
  if len(norm.split()) < 5:
236
  penalty -= 0.2
237
 
 
238
  if conversation_history:
239
  prev_agent_msgs = [
240
  _normalise(m.get("content", ""))
 
249
  penalty -= 0.1
250
  break
251
 
 
252
  harmful_patterns = [
253
  "kill", "die", "hate you", "shut up", "idiot",
254
  "moron", "loser", "go away",
 
256
  if any(pat in norm for pat in harmful_patterns):
257
  penalty -= 0.3
258
 
 
259
  irrelevant_signals = [
260
  "weather", "recipe", "joke", "game score",
261
  "political", "stock market",
 
286
  conversation_history: Previous messages
287
 
288
  Returns:
289
+ RewardBreakdown with ALL scores in strict (0.0, 1.0) open interval.
290
+ The RewardBreakdown model auto-clamps all score fields via validators.
291
  """
292
+ # Score each axis β€” safe_score guarantees (0, 1)
293
+ correctness = safe_score(_score_correctness(
294
  response,
295
  grading_rubric.get("correctness", {}),
296
  ))
297
+ tone = safe_score(_score_tone(
298
  response,
299
  grading_rubric.get("tone", {}),
300
  ))
301
+ completeness = safe_score(_score_completeness(
302
  response,
303
  grading_rubric.get("completeness", {}),
304
  ticket_info,
 
320
  + completeness * w_completeness
321
  )
322
 
323
+ # Apply penalties β€” safe_score guarantees strict (0, 1)
324
+ total = safe_score(weighted + penalties)
325
 
326
  # The efficiency field re-uses the weighted pre-penalty score
327
+ efficiency = safe_score(weighted)
328
 
329
  # Debug logging
330
+ logger.info(
331
+ f"[GRADER] correctness={correctness:.4f} tone={tone:.4f} "
332
+ f"completeness={completeness:.4f} weighted={weighted:.4f} "
333
+ f"penalties={penalties:.4f} total={total:.4f}"
334
+ )
335
 
336
  # Build explanation
337
  parts = []
 
342
  parts.append(f"Penalties: {penalties:.4f}")
343
  parts.append(f"Total: {total:.4f}")
344
 
345
+ # RewardBreakdown auto-clamps all score fields via field_validator
346
  return RewardBreakdown(
347
+ correctness=correctness,
348
+ tone=tone,
349
+ completeness=completeness,
350
+ efficiency=efficiency,
351
  penalties=round(penalties, 4),
352
+ total=total,
353
  explanation=" | ".join(parts),
354
  )
inference.py CHANGED
@@ -74,40 +74,94 @@ logging.basicConfig(
74
  logger = logging.getLogger(__name__)
75
 
76
 
77
- def _strict_score(value: Any) -> float:
78
- """Normalize any numeric-like score to strict open interval (0, 1).
 
 
 
 
 
 
 
 
79
 
80
  CRITICAL: Every score passed to the evaluator MUST satisfy 0 < score < 1.
81
  This function is the last line of defence.
 
 
 
 
 
 
 
82
  """
 
 
 
 
 
 
 
83
  try:
84
  numeric = float(value)
85
  except (TypeError, ValueError):
86
- numeric = 0.5
87
  # Guard against NaN / Inf
88
  if numeric != numeric or numeric == float('inf') or numeric == float('-inf'):
89
- numeric = 0.5
90
- clamped = max(0.0001, min(0.9999, numeric))
91
- print(f"[DEBUG] _strict_score: input={value!r} -> {clamped:.4f}")
92
- return clamped
93
 
94
 
95
  def _sanitize_task_result(task_result: Dict[str, Any]) -> Dict[str, Any]:
96
  """Ensure task result contains evaluator-safe score fields.
97
 
98
- CRITICAL: total_reward and avg_reward MUST both be in strict (0, 1).
99
  The evaluator checks per-task scores and rejects 0.0 or 1.0.
100
  """
101
  safe = dict(task_result)
102
  safe["steps"] = int(safe.get("steps", 0) or 0)
103
- safe["total_reward"] = _strict_score(safe.get("total_reward", 0.5))
104
- safe["avg_reward"] = _strict_score(safe.get("avg_reward", 0.5))
105
  safe["elapsed"] = float(safe.get("elapsed", 0.0) or 0.0)
106
- print(f"[DEBUG] _sanitize_task_result: task={safe.get('task_id')} "
107
- f"total_reward={safe['total_reward']:.4f} avg_reward={safe['avg_reward']:.4f}")
 
 
 
 
 
 
108
  return safe
109
 
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  # ──────────────────────────────────────────────────────────────────
112
  # LLM Client (uses OpenAI SDK β€” required by checklist item 4)
113
  # ──────────────────────────────────────────────────────────────────
@@ -305,6 +359,7 @@ def build_messages(
305
  def run_task(env_client: EnvClient, task_id: str) -> Dict[str, Any]:
306
  """
307
  Run a single task to completion and return results.
 
308
  """
309
  logger.info(f"[START] task_id={task_id}")
310
  start_time = time.time()
@@ -341,7 +396,7 @@ def run_task(env_client: EnvClient, task_id: str) -> Dict[str, Any]:
341
 
342
  step_count += 1
343
  # Guard against endpoint-side boundary values (0.0 or 1.0)
344
- step_reward = _strict_score(result.get("reward", 0.01))
345
  total_reward += step_reward
346
  done = result.get("done", False)
347
  obs = result.get("observation", {})
@@ -352,20 +407,20 @@ def run_task(env_client: EnvClient, task_id: str) -> Dict[str, Any]:
352
  logger.info(
353
  f"[STEP] task={task_id} step={step_count} "
354
  f"reward={step_reward:.4f} "
355
- f"correctness={reward_breakdown.get('correctness', 0):.2f} "
356
- f"tone={reward_breakdown.get('tone', 0):.2f} "
357
- f"completeness={reward_breakdown.get('completeness', 0):.2f} "
358
  f"done={done}"
359
  )
360
 
361
  # Compute average reward for this task β€” clamped to strict (0, 1)
362
- avg_reward = _strict_score(total_reward / max(step_count, 1))
363
  elapsed = time.time() - start_time
364
 
365
  # CRITICAL: total_reward accumulates across steps and WILL exceed 1.0
366
  # (e.g. 3 steps Γ— 0.5 = 1.5). The evaluator checks per-task values,
367
- # so we MUST clamp it to strict (0, 1) before output.
368
- safe_total_reward = _strict_score(total_reward / max(step_count, 1))
369
 
370
  logger.info(
371
  f"[END] task_id={task_id} "
@@ -381,6 +436,7 @@ def run_task(env_client: EnvClient, task_id: str) -> Dict[str, Any]:
381
  "steps": step_count,
382
  "total_reward": safe_total_reward,
383
  "avg_reward": avg_reward,
 
384
  "elapsed": elapsed,
385
  }
386
 
@@ -407,20 +463,8 @@ def main():
407
  """Write sanitized results and return sanitized final score."""
408
  sanitized_results = [_sanitize_task_result(r) for r in results]
409
 
410
- # Add 'score' alias β€” evaluator may read this field name
411
- for r in sanitized_results:
412
- r["score"] = _strict_score(r.get("avg_reward", 0.5))
413
-
414
  total_avg = sum(r["avg_reward"] for r in sanitized_results)
415
- final = _strict_score(total_avg / len(sanitized_results)) if sanitized_results else 0.5
416
-
417
- # FINAL VALIDATION β€” catch any remaining boundary values
418
- for r in sanitized_results:
419
- for key in ["total_reward", "avg_reward", "score"]:
420
- val = r.get(key)
421
- if val is not None and (val <= 0.0 or val >= 1.0):
422
- logger.error(f"[CRITICAL] {r.get('task_id')}.{key}={val} VIOLATES (0,1)! Clamping.")
423
- r[key] = _strict_score(val)
424
 
425
  output = {
426
  "final_score": final,
@@ -432,11 +476,27 @@ def main():
432
  },
433
  }
434
 
 
 
 
435
  logger.info(f"[DEBUG] Final output JSON scores:")
436
- logger.info(f" final_score: {final:.6f}")
437
- for r in sanitized_results:
438
- logger.info(f" {r.get('task_id')}: total_reward={r.get('total_reward'):.6f} "
439
- f"avg_reward={r.get('avg_reward'):.6f} score={r.get('score'):.6f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
440
 
441
  try:
442
  os.makedirs("outputs", exist_ok=True)
@@ -446,7 +506,7 @@ def main():
446
  except Exception as e:
447
  logger.error(f"[ERROR] Failed to save results: {e}")
448
 
449
- return final
450
 
451
  # Wait for environment to be ready
452
  logger.info("[START] Waiting for environment server...")
@@ -464,6 +524,7 @@ def main():
464
  "steps": 0,
465
  "total_reward": 0.01,
466
  "avg_reward": 0.01,
 
467
  "elapsed": 0.0,
468
  "error": "environment_unavailable",
469
  }
@@ -486,6 +547,7 @@ def main():
486
  "steps": 0,
487
  "total_reward": 0.01,
488
  "avg_reward": 0.01,
 
489
  "elapsed": 0.0,
490
  "error": str(e),
491
  })
@@ -507,7 +569,7 @@ def main():
507
  )
508
  total_avg += r.get("avg_reward", 0)
509
 
510
- final_score = _strict_score(total_avg / len(results)) if results else 0.01
511
  logger.info("-" * 60)
512
  logger.info(f" FINAL SCORE: {final_score:.4f} (0.0 -- 1.0)")
513
  logger.info("=" * 60)
 
74
  logger = logging.getLogger(__name__)
75
 
76
 
77
+ # ──────────────────────────────────────────────────────────────────
78
+ # Safe score utility β€” THE last line of defence
79
+ # ──────────────────────────────────────────────────────────────────
80
+
81
+ _SCORE_FLOOR = 0.0001
82
+ _SCORE_CEIL = 0.9999
83
+
84
+
85
+ def safe_score(value: Any) -> float:
86
+ """Normalize any value to strict open interval (0, 1).
87
 
88
  CRITICAL: Every score passed to the evaluator MUST satisfy 0 < score < 1.
89
  This function is the last line of defence.
90
+
91
+ Rules:
92
+ * None β†’ 0.5
93
+ * Strings / non-numeric β†’ 0.5
94
+ * NaN / Β±Inf β†’ 0.5
95
+ * ≀ 0 β†’ 0.0001
96
+ * β‰₯ 1 β†’ 0.9999
97
  """
98
+ if value is None:
99
+ return 0.5
100
+ if isinstance(value, str):
101
+ try:
102
+ value = float(value)
103
+ except (TypeError, ValueError):
104
+ return 0.5
105
  try:
106
  numeric = float(value)
107
  except (TypeError, ValueError):
108
+ return 0.5
109
  # Guard against NaN / Inf
110
  if numeric != numeric or numeric == float('inf') or numeric == float('-inf'):
111
+ return 0.5
112
+ return max(_SCORE_FLOOR, min(_SCORE_CEIL, numeric))
 
 
113
 
114
 
115
  def _sanitize_task_result(task_result: Dict[str, Any]) -> Dict[str, Any]:
116
  """Ensure task result contains evaluator-safe score fields.
117
 
118
+ CRITICAL: total_reward, avg_reward, and score MUST all be in strict (0, 1).
119
  The evaluator checks per-task scores and rejects 0.0 or 1.0.
120
  """
121
  safe = dict(task_result)
122
  safe["steps"] = int(safe.get("steps", 0) or 0)
123
+ safe["total_reward"] = safe_score(safe.get("total_reward", 0.5))
124
+ safe["avg_reward"] = safe_score(safe.get("avg_reward", 0.5))
125
  safe["elapsed"] = float(safe.get("elapsed", 0.0) or 0.0)
126
+ # ALWAYS include a 'score' field β€” evaluator may read this
127
+ safe["score"] = safe_score(safe.get("score", safe.get("avg_reward", 0.5)))
128
+ logger.info(
129
+ f"[DEBUG] _sanitize: task={safe.get('task_id')} "
130
+ f"total_reward={safe['total_reward']:.4f} "
131
+ f"avg_reward={safe['avg_reward']:.4f} "
132
+ f"score={safe['score']:.4f}"
133
+ )
134
  return safe
135
 
136
 
137
+ def _sanitize_full_output(output: Dict[str, Any]) -> Dict[str, Any]:
138
+ """Final global sanitization pass over the entire output dict.
139
+
140
+ Walks all task_results and clamps every numeric score field.
141
+ This is the ABSOLUTE LAST safeguard before JSON serialization.
142
+ """
143
+ sanitized = dict(output)
144
+
145
+ # Clamp final_score
146
+ sanitized["final_score"] = safe_score(sanitized.get("final_score", 0.5))
147
+
148
+ # Clamp every score in every task result
149
+ score_keys = ["total_reward", "avg_reward", "score"]
150
+ for r in sanitized.get("task_results", []):
151
+ for key in score_keys:
152
+ if key in r:
153
+ val = r[key]
154
+ clamped = safe_score(val)
155
+ if val != clamped:
156
+ logger.warning(
157
+ f"[SANITIZE] {r.get('task_id')}.{key}: "
158
+ f"{val} β†’ {clamped} (was out of bounds)"
159
+ )
160
+ r[key] = clamped
161
+
162
+ return sanitized
163
+
164
+
165
  # ──────────────────────────────────────────────────────────────────
166
  # LLM Client (uses OpenAI SDK β€” required by checklist item 4)
167
  # ──────────────────────────────────────────────────────────────────
 
359
  def run_task(env_client: EnvClient, task_id: str) -> Dict[str, Any]:
360
  """
361
  Run a single task to completion and return results.
362
+ All scores are clamped to strict (0, 1) before returning.
363
  """
364
  logger.info(f"[START] task_id={task_id}")
365
  start_time = time.time()
 
396
 
397
  step_count += 1
398
  # Guard against endpoint-side boundary values (0.0 or 1.0)
399
+ step_reward = safe_score(result.get("reward", 0.01))
400
  total_reward += step_reward
401
  done = result.get("done", False)
402
  obs = result.get("observation", {})
 
407
  logger.info(
408
  f"[STEP] task={task_id} step={step_count} "
409
  f"reward={step_reward:.4f} "
410
+ f"correctness={safe_score(reward_breakdown.get('correctness', 0.5)):.2f} "
411
+ f"tone={safe_score(reward_breakdown.get('tone', 0.5)):.2f} "
412
+ f"completeness={safe_score(reward_breakdown.get('completeness', 0.5)):.2f} "
413
  f"done={done}"
414
  )
415
 
416
  # Compute average reward for this task β€” clamped to strict (0, 1)
417
+ avg_reward = safe_score(total_reward / max(step_count, 1))
418
  elapsed = time.time() - start_time
419
 
420
  # CRITICAL: total_reward accumulates across steps and WILL exceed 1.0
421
  # (e.g. 3 steps Γ— 0.5 = 1.5). The evaluator checks per-task values,
422
+ # so we MUST use avg_reward (which is already clamped) for total_reward too.
423
+ safe_total_reward = safe_score(total_reward / max(step_count, 1))
424
 
425
  logger.info(
426
  f"[END] task_id={task_id} "
 
436
  "steps": step_count,
437
  "total_reward": safe_total_reward,
438
  "avg_reward": avg_reward,
439
+ "score": avg_reward, # Always include 'score' field
440
  "elapsed": elapsed,
441
  }
442
 
 
463
  """Write sanitized results and return sanitized final score."""
464
  sanitized_results = [_sanitize_task_result(r) for r in results]
465
 
 
 
 
 
466
  total_avg = sum(r["avg_reward"] for r in sanitized_results)
467
+ final = safe_score(total_avg / len(sanitized_results)) if sanitized_results else 0.5
 
 
 
 
 
 
 
 
468
 
469
  output = {
470
  "final_score": final,
 
476
  },
477
  }
478
 
479
+ # FINAL GLOBAL SANITIZATION β€” the absolute last safeguard
480
+ output = _sanitize_full_output(output)
481
+
482
  logger.info(f"[DEBUG] Final output JSON scores:")
483
+ logger.info(f" final_score: {output['final_score']:.6f}")
484
+ for r in output["task_results"]:
485
+ logger.info(
486
+ f" {r.get('task_id')}: total_reward={r.get('total_reward'):.6f} "
487
+ f"avg_reward={r.get('avg_reward'):.6f} score={r.get('score'):.6f}"
488
+ )
489
+
490
+ # ASSERTION: Catch any remaining violations (log & auto-correct, never crash)
491
+ for r in output["task_results"]:
492
+ for key in ["total_reward", "avg_reward", "score"]:
493
+ val = r.get(key)
494
+ if val is not None and (val <= 0.0 or val >= 1.0):
495
+ logger.error(
496
+ f"[CRITICAL] ASSERTION FAILED: {r.get('task_id')}.{key}={val} "
497
+ f"VIOLATES strict (0,1)! Auto-correcting..."
498
+ )
499
+ r[key] = safe_score(val)
500
 
501
  try:
502
  os.makedirs("outputs", exist_ok=True)
 
506
  except Exception as e:
507
  logger.error(f"[ERROR] Failed to save results: {e}")
508
 
509
+ return output["final_score"]
510
 
511
  # Wait for environment to be ready
512
  logger.info("[START] Waiting for environment server...")
 
524
  "steps": 0,
525
  "total_reward": 0.01,
526
  "avg_reward": 0.01,
527
+ "score": 0.01,
528
  "elapsed": 0.0,
529
  "error": "environment_unavailable",
530
  }
 
547
  "steps": 0,
548
  "total_reward": 0.01,
549
  "avg_reward": 0.01,
550
+ "score": 0.01,
551
  "elapsed": 0.0,
552
  "error": str(e),
553
  })
 
569
  )
570
  total_avg += r.get("avg_reward", 0)
571
 
572
+ final_score = safe_score(total_avg / len(results)) if results else 0.01
573
  logger.info("-" * 60)
574
  logger.info(f" FINAL SCORE: {final_score:.4f} (0.0 -- 1.0)")
575
  logger.info("=" * 60)
models.py CHANGED
@@ -3,12 +3,55 @@ Pydantic models for the Customer Support Ticket Resolution Environment.
3
 
4
  Defines the Action, Observation, State, and Reward models used for
5
  type-safe communication between the agent and environment.
 
 
 
 
6
  """
7
 
8
  from enum import Enum
9
  from typing import Any, Dict, List, Optional
10
 
11
- from pydantic import BaseModel, Field
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
 
14
  # ──────────────────────────────────────────────────────────────────
@@ -126,29 +169,30 @@ class SupportObservation(BaseModel):
126
 
127
 
128
  # ──────────────────────────────────────────────────────────────────
129
- # Reward Model
130
  # ──────────────────────────────────────────────────────────────────
131
 
132
  class RewardBreakdown(BaseModel):
133
- """Detailed breakdown of the reward score."""
 
 
 
 
 
134
  correctness: float = Field(
135
  default=0.01,
136
- gt=0.0, lt=1.0,
137
  description="Score for factual correctness β€” strict (0, 1)",
138
  )
139
  tone: float = Field(
140
  default=0.01,
141
- gt=0.0, lt=1.0,
142
  description="Score for professional tone β€” strict (0, 1)",
143
  )
144
  completeness: float = Field(
145
  default=0.01,
146
- gt=0.0, lt=1.0,
147
  description="Score for response completeness β€” strict (0, 1)",
148
  )
149
  efficiency: float = Field(
150
  default=0.01,
151
- gt=0.0, lt=1.0,
152
  description="Score for resolution efficiency β€” strict (0, 1)",
153
  )
154
  penalties: float = Field(
@@ -158,7 +202,6 @@ class RewardBreakdown(BaseModel):
158
  )
159
  total: float = Field(
160
  default=0.01,
161
- gt=0.0, lt=1.0,
162
  description="Overall weighted score β€” strict (0, 1)",
163
  )
164
  explanation: str = Field(
@@ -166,6 +209,15 @@ class RewardBreakdown(BaseModel):
166
  description="Human-readable explanation of the score",
167
  )
168
 
 
 
 
 
 
 
 
 
 
169
 
170
  # ──────────────────────────────────────────────────────────────────
171
  # State Model
@@ -194,12 +246,18 @@ class SupportState(BaseModel):
194
 
195
 
196
  # ──────────────────────────────────────────────────────────────────
197
- # Step Result (matches OpenEnv convention)
198
 # ──────────────────────────────────────────────────────────────────
199
 
200
  class StepResult(BaseModel):
201
  """Result returned from step(), matching OpenEnv convention."""
202
  observation: SupportObservation
203
- reward: float = Field(gt=0.0, lt=1.0)
204
  done: bool
205
  info: Dict[str, Any] = Field(default_factory=dict)
 
 
 
 
 
 
 
3
 
4
  Defines the Action, Observation, State, and Reward models used for
5
  type-safe communication between the agent and environment.
6
+
7
+ IMPORTANT: Score fields use custom validators that AUTO-CLAMP to (0, 1)
8
+ instead of raising ValidationError. This prevents the evaluator from ever
9
+ seeing boundary values (0.0 or 1.0).
10
  """
11
 
12
  from enum import Enum
13
  from typing import Any, Dict, List, Optional
14
 
15
+ from pydantic import BaseModel, Field, field_validator
16
+
17
+
18
# ──────────────────────────────────────────────────────────────────
# Central safe-score utility β€” shared by all modules
# ──────────────────────────────────────────────────────────────────

# Strict open-interval bounds: scores must never be exactly 0.0 or 1.0.
_SCORE_FLOOR = 0.0001
_SCORE_CEIL = 0.9999


def safe_score(value: Any) -> float:
    """Clamp *any* value into the strict open interval (0, 1).

    This is the SINGLE source of truth for score normalisation across
    the entire project. Every score must pass through this function
    before leaving any boundary (model field, API response, JSON output).

    Rules:
        * ``None`` -> 0.5 (safe default)
        * non-numeric strings / unconvertible objects -> 0.5
        * NaN / +/-Inf -> 0.5
        * <= 0 -> 0.0001
        * >= 1 -> 0.9999

    Args:
        value: Anything that might represent a score (float, int,
            numeric string, ``None``, ...).

    Returns:
        A float strictly inside (0, 1).
    """
    if value is None:
        return 0.5
    # A single float() call already handles ints, floats, and numeric
    # strings alike β€” a separate isinstance(value, str) branch would be
    # redundant (float("0.3") parses; float("abc") raises ValueError).
    try:
        v = float(value)
    except (TypeError, ValueError):
        return 0.5
    # Reject NaN (the only value unequal to itself) and both infinities
    # before clamping, so the comparison below is well-defined.
    if v != v or v == float("inf") or v == float("-inf"):
        return 0.5
    return max(_SCORE_FLOOR, min(_SCORE_CEIL, v))
55
 
56
 
57
  # ──────────────────────────────────────────────────────────────────
 
169
 
170
 
171
  # ──────────────────────────────────────────────────────────────────
172
+ # Reward Model β€” uses auto-clamping validators instead of gt/lt
173
  # ──────────────────────────────────────────────────────────────────
174
 
175
  class RewardBreakdown(BaseModel):
176
+ """Detailed breakdown of the reward score.
177
+
178
+ IMPORTANT: All score fields auto-clamp to strict (0, 1) via validators.
179
+ This prevents Pydantic from raising ValidationError on boundary values
180
+ and ensures the evaluator NEVER receives 0.0 or 1.0.
181
+ """
182
  correctness: float = Field(
183
  default=0.01,
 
184
  description="Score for factual correctness β€” strict (0, 1)",
185
  )
186
  tone: float = Field(
187
  default=0.01,
 
188
  description="Score for professional tone β€” strict (0, 1)",
189
  )
190
  completeness: float = Field(
191
  default=0.01,
 
192
  description="Score for response completeness β€” strict (0, 1)",
193
  )
194
  efficiency: float = Field(
195
  default=0.01,
 
196
  description="Score for resolution efficiency β€” strict (0, 1)",
197
  )
198
  penalties: float = Field(
 
202
  )
203
  total: float = Field(
204
  default=0.01,
 
205
  description="Overall weighted score β€” strict (0, 1)",
206
  )
207
  explanation: str = Field(
 
209
  description="Human-readable explanation of the score",
210
  )
211
 
212
+ @field_validator(
213
+ "correctness", "tone", "completeness", "efficiency", "total",
214
+ mode="before",
215
+ )
216
+ @classmethod
217
+ def _clamp_score(cls, v: Any) -> float:
218
+ """Auto-clamp score fields to strict (0, 1)."""
219
+ return safe_score(v)
220
+
221
 
222
  # ──────────────────────────────────────────────────────────────────
223
  # State Model
 
246
 
247
 
248
  # ──────────────────────────────────────────────────────────────────
249
+ # Step Result (matches OpenEnv convention) β€” auto-clamps reward
250
 # ──────────────────────────────────────────────────────────────────
251
 
252
  class StepResult(BaseModel):
253
  """Result returned from step(), matching OpenEnv convention."""
254
  observation: SupportObservation
255
+ reward: float = Field(default=0.01)
256
  done: bool
257
  info: Dict[str, Any] = Field(default_factory=dict)
258
+
259
+ @field_validator("reward", mode="before")
260
+ @classmethod
261
+ def _clamp_reward(cls, v: Any) -> float:
262
+ """Auto-clamp reward to strict (0, 1)."""
263
+ return safe_score(v)
pyproject.toml CHANGED
@@ -37,4 +37,4 @@ packages = [
37
  ]
38
 
39
  [tool.pyright]
40
- extraPaths = ["."]
 
37
  ]
38
 
39
  [tool.pyright]
40
+ extraPaths = [".", "openenv"]
server/app.py CHANGED
@@ -23,24 +23,13 @@ from typing import Any, Dict, Optional
23
 
24
  from fastapi import FastAPI, HTTPException
25
  from fastapi.middleware.cors import CORSMiddleware
26
- from pydantic import BaseModel, Field
27
 
28
- from models import SupportAction, SupportObservation, SupportState
29
  from server.environment import CustomerSupportEnvironment
30
  from tasks import TASK_IDS, TASKS
31
 
32
 
33
- def _safe_score(value) -> float:
34
- """Clamp any value to strict (0, 1) for evaluator safety."""
35
- try:
36
- v = float(value)
37
- except (TypeError, ValueError):
38
- v = 0.5
39
- if v != v or v == float('inf') or v == float('-inf'):
40
- v = 0.5
41
- return max(0.0001, min(0.9999, v))
42
-
43
-
44
  # ──────────────────────────────────────────────────────────────────
45
  # Request / Response schemas
46
  # ──────────────────────────────────────────────────────────────────
@@ -55,11 +44,23 @@ class StepRequest(BaseModel):
55
 
56
 
57
  class StepResponse(BaseModel):
 
 
 
 
 
 
58
  observation: SupportObservation
59
- reward: float = Field(gt=0.0, lt=1.0)
60
  done: bool
61
  info: Dict[str, Any]
62
 
 
 
 
 
 
 
63
 
64
  class TaskInfo(BaseModel):
65
  task_id: str
@@ -154,17 +155,21 @@ def step(request: StepRequest):
154
  """Execute an agent action and return the result."""
155
  try:
156
  obs, reward, done, info = env.step(action=request.action)
157
- # Clamp reward to strict (0, 1) β€” evaluator rejects 0.0 or 1.0
158
- safe_reward = _safe_score(reward)
 
 
 
159
  # Also clamp all scores inside reward_breakdown in info
160
  if "reward_breakdown" in info and isinstance(info["reward_breakdown"], dict):
161
  rb = info["reward_breakdown"]
162
  for key in ["correctness", "tone", "completeness", "efficiency", "total"]:
163
  if key in rb:
164
- rb[key] = _safe_score(rb[key])
 
165
  return StepResponse(
166
  observation=obs,
167
- reward=safe_reward,
168
  done=done,
169
  info=info,
170
  )
 
23
 
24
  from fastapi import FastAPI, HTTPException
25
  from fastapi.middleware.cors import CORSMiddleware
26
+ from pydantic import BaseModel, Field, field_validator
27
 
28
+ from models import SupportAction, SupportObservation, SupportState, safe_score
29
  from server.environment import CustomerSupportEnvironment
30
  from tasks import TASK_IDS, TASKS
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
33
  # ──────────────────────────────────────────────────────────────────
34
  # Request / Response schemas
35
  # ──────────────────────────────────────────────────────────────────
 
44
 
45
 
46
  class StepResponse(BaseModel):
47
+ """Response from the /step endpoint.
48
+
49
+ Uses an auto-clamping validator instead of gt/lt constraints.
50
+ This prevents Pydantic from raising ValidationError on boundary
51
+ values and ensures the evaluator NEVER receives 0.0 or 1.0.
52
+ """
53
  observation: SupportObservation
54
+ reward: float = Field(default=0.01, description="Step reward in strict (0, 1)")
55
  done: bool
56
  info: Dict[str, Any]
57
 
58
+ @field_validator("reward", mode="before")
59
+ @classmethod
60
+ def _clamp_reward(cls, v: Any) -> float:
61
+ """Auto-clamp reward to strict (0, 1)."""
62
+ return safe_score(v)
63
+
64
 
65
  class TaskInfo(BaseModel):
66
  task_id: str
 
155
  """Execute an agent action and return the result."""
156
  try:
157
  obs, reward, done, info = env.step(action=request.action)
158
+
159
+ # Triple-safe: clamp reward via safe_score before passing to StepResponse
160
+ # (StepResponse also has its own auto-clamping validator)
161
+ clamped_reward = safe_score(reward)
162
+
163
  # Also clamp all scores inside reward_breakdown in info
164
  if "reward_breakdown" in info and isinstance(info["reward_breakdown"], dict):
165
  rb = info["reward_breakdown"]
166
  for key in ["correctness", "tone", "completeness", "efficiency", "total"]:
167
  if key in rb:
168
+ rb[key] = safe_score(rb[key])
169
+
170
  return StepResponse(
171
  observation=obs,
172
+ reward=clamped_reward,
173
  done=done,
174
  info=info,
175
  )
server/environment.py CHANGED
@@ -11,6 +11,7 @@ Implements the standard OpenEnv interface:
11
  - state() β†’ SupportState
12
  """
13
 
 
14
  import sys
15
  import os
16
  from typing import Any, Dict, List, Optional, Tuple
@@ -34,10 +35,13 @@ from models import (
34
  TicketInfo,
35
  TicketPriority,
36
  TicketStatus,
 
37
  )
38
  from grader import grade_response
39
  from tasks import TASKS, TASK_IDS, get_task
40
 
 
 
41
 
42
  class CustomerSupportEnvironment:
43
  """
@@ -129,6 +133,7 @@ class CustomerSupportEnvironment:
129
 
130
  Returns:
131
  Tuple of (observation, reward, done, info).
 
132
  """
133
  if self._state is None or self._state.done:
134
  raise RuntimeError(
@@ -155,9 +160,12 @@ class CustomerSupportEnvironment:
155
  conversation_history=[m.model_dump() for m in self._conversation],
156
  )
157
 
158
- # Clamp step reward to strict (0, 1) β€” never exactly 0.0 or 1.0
159
- step_reward = max(0.0001, min(0.9999, reward_breakdown.total))
160
- print(f"[DEBUG] environment.step: raw_total={reward_breakdown.total:.6f} step_reward={step_reward:.6f}")
 
 
 
161
  self._cumulative_reward += step_reward
162
  self._state.cumulative_reward = self._cumulative_reward
163
  self._state.reward_history.append(reward_breakdown)
@@ -183,7 +191,6 @@ class CustomerSupportEnvironment:
183
  next_msg = follow_ups[self._follow_up_index]
184
  self._follow_up_index += 1
185
  else:
186
- # Generate a contextual customer acknowledgement
187
  next_msg = self._generate_contextual_reply(action)
188
 
189
  self._current_message = next_msg
@@ -196,12 +203,17 @@ class CustomerSupportEnvironment:
196
  )
197
 
198
  # Compute average reward β€” clamped to strict (0, 1)
199
- avg_reward = self._cumulative_reward / self._state.step_count
200
- avg_reward = max(0.0001, min(0.9999, avg_reward))
201
 
202
  # Build info dict β€” all scores strictly in (0, 1)
 
 
 
 
 
 
203
  info = {
204
- "reward_breakdown": reward_breakdown.model_dump(),
205
  "step_reward": step_reward,
206
  "cumulative_reward": self._cumulative_reward,
207
  "average_reward": avg_reward,
 
11
  - state() β†’ SupportState
12
  """
13
 
14
+ import logging
15
  import sys
16
  import os
17
  from typing import Any, Dict, List, Optional, Tuple
 
35
  TicketInfo,
36
  TicketPriority,
37
  TicketStatus,
38
+ safe_score,
39
  )
40
  from grader import grade_response
41
  from tasks import TASKS, TASK_IDS, get_task
42
 
43
+ logger = logging.getLogger(__name__)
44
+
45
 
46
  class CustomerSupportEnvironment:
47
  """
 
133
 
134
  Returns:
135
  Tuple of (observation, reward, done, info).
136
+ reward is ALWAYS in strict (0, 1).
137
  """
138
  if self._state is None or self._state.done:
139
  raise RuntimeError(
 
160
  conversation_history=[m.model_dump() for m in self._conversation],
161
  )
162
 
163
+ # Clamp step reward to strict (0, 1) β€” safe_score guarantees this
164
+ step_reward = safe_score(reward_breakdown.total)
165
+ logger.info(
166
+ f"[ENV] step: raw_total={reward_breakdown.total:.6f} "
167
+ f"step_reward={step_reward:.6f}"
168
+ )
169
  self._cumulative_reward += step_reward
170
  self._state.cumulative_reward = self._cumulative_reward
171
  self._state.reward_history.append(reward_breakdown)
 
191
  next_msg = follow_ups[self._follow_up_index]
192
  self._follow_up_index += 1
193
  else:
 
194
  next_msg = self._generate_contextual_reply(action)
195
 
196
  self._current_message = next_msg
 
203
  )
204
 
205
  # Compute average reward β€” clamped to strict (0, 1)
206
+ avg_reward = safe_score(self._cumulative_reward / self._state.step_count)
 
207
 
208
  # Build info dict β€” all scores strictly in (0, 1)
209
+ # Clamp every numeric score in reward_breakdown before exposing
210
+ rb_dict = reward_breakdown.model_dump()
211
+ for key in ["correctness", "tone", "completeness", "efficiency", "total"]:
212
+ if key in rb_dict:
213
+ rb_dict[key] = safe_score(rb_dict[key])
214
+
215
  info = {
216
+ "reward_breakdown": rb_dict,
217
  "step_reward": step_reward,
218
  "cumulative_reward": self._cumulative_reward,
219
  "average_reward": avg_reward,
validate.py CHANGED
@@ -5,7 +5,7 @@ Runs through all 3 tasks with deterministic responses and verifies:
5
  βœ“ reset() returns valid SupportObservation
6
  βœ“ step() returns (observation, reward, done, info) with correct types
7
  βœ“ state() returns valid SupportState
8
- βœ“ Rewards are non-constant and in [0.0, 1.0]
9
  βœ“ Episodes terminate correctly
10
  βœ“ Grader produces varying scores for different responses
11
 
@@ -19,7 +19,7 @@ import os
19
  # Ensure project root is on path
20
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
21
 
22
- from models import SupportAction, SupportObservation, SupportState, RewardBreakdown
23
  from server.environment import CustomerSupportEnvironment
24
  from tasks import TASK_IDS
25
 
@@ -66,9 +66,9 @@ def validate_task(env: CustomerSupportEnvironment, task_id: str, responses: list
66
  rewards.append(reward)
67
  breakdown = info.get("reward_breakdown", {})
68
  print(f" βœ“ step({i+1}) β†’ reward={reward:.4f} | "
69
- f"correctness={breakdown.get('correctness', 0):.2f} "
70
- f"tone={breakdown.get('tone', 0):.2f} "
71
- f"completeness={breakdown.get('completeness', 0):.2f} "
72
  f"done={done}")
73
 
74
  if done:
@@ -82,7 +82,7 @@ def validate_task(env: CustomerSupportEnvironment, task_id: str, responses: list
82
  return {
83
  "task_id": task_id,
84
  "rewards": rewards,
85
- "avg_reward": max(0.0001, min(0.9999, sum(rewards) / len(rewards))) if rewards else 0.5,
86
  "steps": len(rewards),
87
  }
88
 
@@ -137,6 +137,11 @@ def validate_grader_variance():
137
  print(f" βœ“ Grader produces varying scores (NOT constant)")
138
  print(f" βœ“ Good > Bad > Irrelevant ordering confirmed")
139
 
 
 
 
 
 
140
 
141
  def main():
142
  print("=" * 50)
@@ -208,8 +213,7 @@ def main():
208
  for r in all_results:
209
  print(f" βœ“ {r['task_id']:20s} β†’ avg_reward={r['avg_reward']:.4f} steps={r['steps']}")
210
  total_avg += r['avg_reward']
211
- overall = total_avg / len(all_results) if all_results else 0.01
212
- overall = max(0.0001, min(0.9999, overall))
213
  print(f"\n Overall Score: {overall:.4f}")
214
  print(f"\n βœ… ALL VALIDATIONS PASSED!")
215
  return 0
 
5
  βœ“ reset() returns valid SupportObservation
6
  βœ“ step() returns (observation, reward, done, info) with correct types
7
  βœ“ state() returns valid SupportState
8
+ βœ“ Rewards are non-constant and in (0.0, 1.0) strict open interval
9
  βœ“ Episodes terminate correctly
10
  βœ“ Grader produces varying scores for different responses
11
 
 
19
  # Ensure project root is on path
20
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
21
 
22
+ from models import SupportAction, SupportObservation, SupportState, RewardBreakdown, safe_score
23
  from server.environment import CustomerSupportEnvironment
24
  from tasks import TASK_IDS
25
 
 
66
  rewards.append(reward)
67
  breakdown = info.get("reward_breakdown", {})
68
  print(f" βœ“ step({i+1}) β†’ reward={reward:.4f} | "
69
+ f"correctness={safe_score(breakdown.get('correctness', 0.5)):.2f} "
70
+ f"tone={safe_score(breakdown.get('tone', 0.5)):.2f} "
71
+ f"completeness={safe_score(breakdown.get('completeness', 0.5)):.2f} "
72
  f"done={done}")
73
 
74
  if done:
 
82
  return {
83
  "task_id": task_id,
84
  "rewards": rewards,
85
+ "avg_reward": safe_score(sum(rewards) / len(rewards)) if rewards else 0.5,
86
  "steps": len(rewards),
87
  }
88
 
 
137
  print(f" βœ“ Grader produces varying scores (NOT constant)")
138
  print(f" βœ“ Good > Bad > Irrelevant ordering confirmed")
139
 
140
+ # Verify ALL rewards are strictly in (0, 1)
141
+ for label, r in [("good", good_reward), ("bad", bad_reward), ("irr", irr_reward)]:
142
+ assert 0.0 < r < 1.0, f"{label} reward {r} violates strict (0, 1)!"
143
+ print(f" βœ“ All rewards strictly in (0, 1) open interval")
144
+
145
 
146
  def main():
147
  print("=" * 50)
 
213
  for r in all_results:
214
  print(f" βœ“ {r['task_id']:20s} β†’ avg_reward={r['avg_reward']:.4f} steps={r['steps']}")
215
  total_avg += r['avg_reward']
216
+ overall = safe_score(total_avg / len(all_results)) if all_results else 0.01
 
217
  print(f"\n Overall Score: {overall:.4f}")
218
  print(f"\n βœ… ALL VALIDATIONS PASSED!")
219
  return 0