mathi3046 commited on
Commit
6c591d0
Β·
1 Parent(s): 3932d4b

last 0 and 1 error update

Browse files
Files changed (8) hide show
  1. __init__.py +2 -0
  2. grader.py +35 -81
  3. inference.py +100 -38
  4. models.py +68 -10
  5. pyproject.toml +1 -1
  6. server/app.py +23 -18
  7. server/environment.py +19 -7
  8. validate.py +12 -8
__init__.py CHANGED
@@ -11,6 +11,7 @@ from .models import (
11
  SupportState,
12
  RewardBreakdown,
13
  StepResult,
 
14
  )
15
  from .server.environment import CustomerSupportEnvironment
16
 
@@ -21,6 +22,7 @@ __all__ = [
21
  "SupportState",
22
  "RewardBreakdown",
23
  "StepResult",
 
24
  ]
25
 
26
  __version__ = "1.0.0"
 
11
  SupportState,
12
  RewardBreakdown,
13
  StepResult,
14
+ safe_score,
15
  )
16
  from .server.environment import CustomerSupportEnvironment
17
 
 
22
  "SupportState",
23
  "RewardBreakdown",
24
  "StepResult",
25
+ "safe_score",
26
  ]
27
 
28
  __version__ = "1.0.0"
grader.py CHANGED
@@ -9,43 +9,17 @@ Evaluates agent responses on three axes:
9
  Returns a RewardBreakdown with a total score in (0.0, 1.0) β€” strict open interval.
10
 
11
  IMPORTANT β€” Every numeric score produced by this module is passed through
12
- ``normalize_score`` before it leaves the grader so that the evaluator NEVER
13
  receives a boundary value (0.0 or 1.0).
14
  """
15
 
 
16
  import re
17
  from typing import Any, Dict, List
18
 
19
- from models import RewardBreakdown
20
 
21
-
22
- # ──────────────────────────────────────────────────────────────────
23
- # Central score normaliser β€” THE single source of truth
24
- # ──────────────────────────────────────────────────────────────────
25
-
26
- # Strict open-interval bounds: scores must never be exactly 0.0 or 1.0
27
- _SCORE_FLOOR = 0.0001
28
- _SCORE_CEIL = 0.9999
29
-
30
-
31
- def normalize_score(value: Any) -> float:
32
- """Clamp *value* into the strict open interval (0, 1).
33
-
34
- * ``None`` β†’ 0.5
35
- * anything that cannot be converted to float β†’ 0.5
36
- * values ≀ 0 β†’ ``_SCORE_FLOOR``
37
- * values β‰₯ 1 β†’ ``_SCORE_CEIL``
38
- """
39
- if value is None:
40
- return 0.5
41
- try:
42
- v = float(value)
43
- except (TypeError, ValueError):
44
- return 0.5
45
- # Guard against NaN / Inf
46
- if v != v or v == float('inf') or v == float('-inf'):
47
- return 0.5
48
- return max(_SCORE_FLOOR, min(_SCORE_CEIL, v))
49
 
50
 
51
  def _normalise(text: str) -> str:
@@ -68,18 +42,16 @@ def _score_correctness(
68
  norm = _normalise(response)
69
  criteria = rubric.get("criteria", [])
70
  if not criteria:
71
- # No rubric β†’ return a safe neutral score, never 0.0
72
- return normalize_score(0.1)
73
 
74
  total = 0.0
75
  for criterion in criteria:
76
  kw_group: List[str] = criterion.get("keyword_group", [])
77
  points: float = criterion.get("points", 0.0)
78
- # Award points if ANY keyword in the group is found
79
  if any(kw.lower() in norm for kw in kw_group):
80
  total += points
81
 
82
- return normalize_score(total)
83
 
84
 
85
  # ──────────────────────────────────────────────────────────────────
@@ -102,33 +74,27 @@ def _score_tone(
102
  positive_signals: List[str] = criteria.get("positive_signals", [])
103
  negative_signals: List[str] = criteria.get("negative_signals", [])
104
 
105
- # Count matches
106
  pos_count = sum(1 for sig in positive_signals if sig.lower() in norm)
107
  neg_count = sum(1 for sig in negative_signals if sig.lower() in norm)
108
 
109
- # Base score: 0.5 (neutral)
110
  score = 0.5
111
 
112
- # Each positive signal adds points (diminishing returns)
113
  if positive_signals:
114
  pos_ratio = pos_count / len(positive_signals)
115
- score += pos_ratio * 0.4 # max +0.4 from positives (keeps below 1.0)
116
 
117
- # Each negative signal deducts heavily
118
  if neg_count > 0:
119
- score -= min(neg_count * 0.2, 0.4) # max -0.4 from negatives (keeps above 0.0)
120
 
121
- # Additional length/quality checks
122
  word_count = len(norm.split())
123
  if word_count < 10:
124
- score -= 0.1 # Too terse is often rude
125
 
126
- # Check if response uses ALL CAPS excessively
127
  upper_ratio = sum(1 for c in response if c.isupper()) / max(len(response), 1)
128
  if upper_ratio > 0.4 and len(response) > 20:
129
- score -= 0.05 # Shouting in response
130
 
131
- return normalize_score(score)
132
 
133
 
134
  # ──────────────────────────────────────────────────────────────────
@@ -148,8 +114,7 @@ def _score_completeness(
148
  norm = _normalise(response)
149
  criteria = rubric.get("criteria", [])
150
  if not criteria:
151
- # No rubric β†’ return a safe neutral score, never 0.0
152
- return normalize_score(0.1)
153
 
154
  total = 0.0
155
  for criterion in criteria:
@@ -157,14 +122,12 @@ def _score_completeness(
157
  points = criterion.get("points", 0.0)
158
 
159
  if check == "addresses_question" or check == "addresses_defect":
160
- # Check if response directly addresses the main issue
161
  subject = _normalise(ticket_info.get("subject", ""))
162
  subject_words = [w for w in subject.split() if len(w) > 3]
163
  if any(w in norm for w in subject_words) or len(norm.split()) > 20:
164
  total += points
165
 
166
  elif check == "provides_next_steps":
167
- # Check for actionable next steps
168
  step_indicators = [
169
  "will", "can", "please", "next step", "process",
170
  "we'll", "i'll", "going to", "let me", "i can",
@@ -174,15 +137,13 @@ def _score_completeness(
174
  total += points
175
 
176
  elif check == "references_order":
177
- # Check if the specific order ID is referenced
178
  order_id = ticket_info.get("order_id", "")
179
  if order_id and order_id.lower() in norm:
180
  total += points
181
  elif "order" in norm:
182
- total += points * 0.5 # Partial credit for mentioning order
183
 
184
  elif check == "explains_policy":
185
- # Check if relevant policy details are mentioned
186
  policy_terms = [
187
  "policy", "within", "days", "eligible", "qualify",
188
  "terms", "condition", "guideline",
@@ -191,7 +152,6 @@ def _score_completeness(
191
  total += points
192
 
193
  elif check == "provides_process":
194
- # Check if return/refund process is outlined
195
  process_terms = [
196
  "step", "first", "then", "send", "ship", "return",
197
  "label", "process", "receive", "refund",
@@ -200,13 +160,11 @@ def _score_completeness(
200
  total += points
201
 
202
  elif check == "offers_options":
203
- # Check if multiple options are presented
204
  option_indicators = ["or", "option", "alternative", "either", "choose", "prefer"]
205
  if any(ind in norm for ind in option_indicators):
206
  total += points
207
 
208
  elif check == "acknowledges_all_issues":
209
- # For hard task: must address multiple issues
210
  issues_to_address = ["wrong", "late", "delay", "rude", "staff", "agent"]
211
  addressed = sum(1 for iss in issues_to_address if iss in norm)
212
  if addressed >= 3:
@@ -217,7 +175,6 @@ def _score_completeness(
217
  total += points * 0.3
218
 
219
  elif check == "concrete_resolution":
220
- # Check for concrete actions, not just apologies
221
  concrete_terms = [
222
  "refund", "replacement", "ship", "send", "credit",
223
  "discount", "expedite", "priority", "immediately",
@@ -227,7 +184,6 @@ def _score_completeness(
227
  total += points
228
 
229
  elif check == "timeline":
230
- # Check if specific timelines are given
231
  time_patterns = [
232
  r"\d+\s*(hour|day|week|business day)",
233
  r"within\s+\d+",
@@ -241,17 +197,15 @@ def _score_completeness(
241
  total += points
242
 
243
  elif check == "empathy":
244
- # Check for empathetic language
245
  empathy_terms = [
246
  "understand", "frustrat", "sorry", "apologize",
247
  "inconvenience", "disappoint", "concern",
248
- "appreciate your patience", "I hear you",
249
  ]
250
  if sum(1 for t in empathy_terms if t in norm) >= 2:
251
  total += points
252
 
253
  elif check == "follow_up_plan":
254
- # Check for follow-up commitments
255
  follow_up_terms = [
256
  "follow up", "follow-up", "check back", "update you",
257
  "keep you informed", "contact you", "reach out",
@@ -260,7 +214,7 @@ def _score_completeness(
260
  if any(t in norm for t in follow_up_terms):
261
  total += points
262
 
263
- return normalize_score(total)
264
 
265
 
266
  # ──────────────────────────────────────────────────────────────────
@@ -278,11 +232,9 @@ def _compute_penalties(
278
  norm = _normalise(response)
279
  penalty = 0.0
280
 
281
- # Penalty: empty or near-empty response
282
  if len(norm.split()) < 5:
283
  penalty -= 0.2
284
 
285
- # Penalty: repeated response (copy-paste from previous)
286
  if conversation_history:
287
  prev_agent_msgs = [
288
  _normalise(m.get("content", ""))
@@ -297,7 +249,6 @@ def _compute_penalties(
297
  penalty -= 0.1
298
  break
299
 
300
- # Penalty: harmful/inappropriate content
301
  harmful_patterns = [
302
  "kill", "die", "hate you", "shut up", "idiot",
303
  "moron", "loser", "go away",
@@ -305,7 +256,6 @@ def _compute_penalties(
305
  if any(pat in norm for pat in harmful_patterns):
306
  penalty -= 0.3
307
 
308
- # Penalty: completely irrelevant response
309
  irrelevant_signals = [
310
  "weather", "recipe", "joke", "game score",
311
  "political", "stock market",
@@ -336,18 +286,19 @@ def grade_response(
336
  conversation_history: Previous messages
337
 
338
  Returns:
339
- RewardBreakdown with ALL scores in strict (0.0, 1.0) open interval
 
340
  """
341
- # Score each axis β€” normalize_score guarantees (0, 1)
342
- correctness = normalize_score(_score_correctness(
343
  response,
344
  grading_rubric.get("correctness", {}),
345
  ))
346
- tone = normalize_score(_score_tone(
347
  response,
348
  grading_rubric.get("tone", {}),
349
  ))
350
- completeness = normalize_score(_score_completeness(
351
  response,
352
  grading_rubric.get("completeness", {}),
353
  ticket_info,
@@ -369,16 +320,18 @@ def grade_response(
369
  + completeness * w_completeness
370
  )
371
 
372
- # Apply penalties β€” normalize_score guarantees strict (0, 1)
373
- total = normalize_score(weighted + penalties)
374
 
375
  # The efficiency field re-uses the weighted pre-penalty score
376
- efficiency = normalize_score(weighted)
377
 
378
  # Debug logging
379
- print(f"[DEBUG] correctness={correctness:.4f} tone={tone:.4f} "
380
- f"completeness={completeness:.4f} weighted={weighted:.4f} "
381
- f"penalties={penalties:.4f} total={total:.4f}")
 
 
382
 
383
  # Build explanation
384
  parts = []
@@ -389,12 +342,13 @@ def grade_response(
389
  parts.append(f"Penalties: {penalties:.4f}")
390
  parts.append(f"Total: {total:.4f}")
391
 
 
392
  return RewardBreakdown(
393
- correctness=normalize_score(correctness),
394
- tone=normalize_score(tone),
395
- completeness=normalize_score(completeness),
396
- efficiency=normalize_score(efficiency),
397
  penalties=round(penalties, 4),
398
- total=normalize_score(total),
399
  explanation=" | ".join(parts),
400
  )
 
9
  Returns a RewardBreakdown with a total score in (0.0, 1.0) β€” strict open interval.
10
 
11
  IMPORTANT β€” Every numeric score produced by this module is passed through
12
+ ``safe_score`` before it leaves the grader so that the evaluator NEVER
13
  receives a boundary value (0.0 or 1.0).
14
  """
15
 
16
+ import logging
17
  import re
18
  from typing import Any, Dict, List
19
 
20
+ from models import RewardBreakdown, safe_score
21
 
22
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
 
25
  def _normalise(text: str) -> str:
 
42
  norm = _normalise(response)
43
  criteria = rubric.get("criteria", [])
44
  if not criteria:
45
+ return safe_score(0.1)
 
46
 
47
  total = 0.0
48
  for criterion in criteria:
49
  kw_group: List[str] = criterion.get("keyword_group", [])
50
  points: float = criterion.get("points", 0.0)
 
51
  if any(kw.lower() in norm for kw in kw_group):
52
  total += points
53
 
54
+ return safe_score(total)
55
 
56
 
57
  # ──────────────────────────────────────────────────────────────────
 
74
  positive_signals: List[str] = criteria.get("positive_signals", [])
75
  negative_signals: List[str] = criteria.get("negative_signals", [])
76
 
 
77
  pos_count = sum(1 for sig in positive_signals if sig.lower() in norm)
78
  neg_count = sum(1 for sig in negative_signals if sig.lower() in norm)
79
 
 
80
  score = 0.5
81
 
 
82
  if positive_signals:
83
  pos_ratio = pos_count / len(positive_signals)
84
+ score += pos_ratio * 0.4
85
 
 
86
  if neg_count > 0:
87
+ score -= min(neg_count * 0.2, 0.4)
88
 
 
89
  word_count = len(norm.split())
90
  if word_count < 10:
91
+ score -= 0.1
92
 
 
93
  upper_ratio = sum(1 for c in response if c.isupper()) / max(len(response), 1)
94
  if upper_ratio > 0.4 and len(response) > 20:
95
+ score -= 0.05
96
 
97
+ return safe_score(score)
98
 
99
 
100
  # ──────────────────────────────────────────────────────────────────
 
114
  norm = _normalise(response)
115
  criteria = rubric.get("criteria", [])
116
  if not criteria:
117
+ return safe_score(0.1)
 
118
 
119
  total = 0.0
120
  for criterion in criteria:
 
122
  points = criterion.get("points", 0.0)
123
 
124
  if check == "addresses_question" or check == "addresses_defect":
 
125
  subject = _normalise(ticket_info.get("subject", ""))
126
  subject_words = [w for w in subject.split() if len(w) > 3]
127
  if any(w in norm for w in subject_words) or len(norm.split()) > 20:
128
  total += points
129
 
130
  elif check == "provides_next_steps":
 
131
  step_indicators = [
132
  "will", "can", "please", "next step", "process",
133
  "we'll", "i'll", "going to", "let me", "i can",
 
137
  total += points
138
 
139
  elif check == "references_order":
 
140
  order_id = ticket_info.get("order_id", "")
141
  if order_id and order_id.lower() in norm:
142
  total += points
143
  elif "order" in norm:
144
+ total += points * 0.5
145
 
146
  elif check == "explains_policy":
 
147
  policy_terms = [
148
  "policy", "within", "days", "eligible", "qualify",
149
  "terms", "condition", "guideline",
 
152
  total += points
153
 
154
  elif check == "provides_process":
 
155
  process_terms = [
156
  "step", "first", "then", "send", "ship", "return",
157
  "label", "process", "receive", "refund",
 
160
  total += points
161
 
162
  elif check == "offers_options":
 
163
  option_indicators = ["or", "option", "alternative", "either", "choose", "prefer"]
164
  if any(ind in norm for ind in option_indicators):
165
  total += points
166
 
167
  elif check == "acknowledges_all_issues":
 
168
  issues_to_address = ["wrong", "late", "delay", "rude", "staff", "agent"]
169
  addressed = sum(1 for iss in issues_to_address if iss in norm)
170
  if addressed >= 3:
 
175
  total += points * 0.3
176
 
177
  elif check == "concrete_resolution":
 
178
  concrete_terms = [
179
  "refund", "replacement", "ship", "send", "credit",
180
  "discount", "expedite", "priority", "immediately",
 
184
  total += points
185
 
186
  elif check == "timeline":
 
187
  time_patterns = [
188
  r"\d+\s*(hour|day|week|business day)",
189
  r"within\s+\d+",
 
197
  total += points
198
 
199
  elif check == "empathy":
 
200
  empathy_terms = [
201
  "understand", "frustrat", "sorry", "apologize",
202
  "inconvenience", "disappoint", "concern",
203
+ "appreciate your patience", "i hear you",
204
  ]
205
  if sum(1 for t in empathy_terms if t in norm) >= 2:
206
  total += points
207
 
208
  elif check == "follow_up_plan":
 
209
  follow_up_terms = [
210
  "follow up", "follow-up", "check back", "update you",
211
  "keep you informed", "contact you", "reach out",
 
214
  if any(t in norm for t in follow_up_terms):
215
  total += points
216
 
217
+ return safe_score(total)
218
 
219
 
220
  # ──────────────────────────────────────────────────────────────────
 
232
  norm = _normalise(response)
233
  penalty = 0.0
234
 
 
235
  if len(norm.split()) < 5:
236
  penalty -= 0.2
237
 
 
238
  if conversation_history:
239
  prev_agent_msgs = [
240
  _normalise(m.get("content", ""))
 
249
  penalty -= 0.1
250
  break
251
 
 
252
  harmful_patterns = [
253
  "kill", "die", "hate you", "shut up", "idiot",
254
  "moron", "loser", "go away",
 
256
  if any(pat in norm for pat in harmful_patterns):
257
  penalty -= 0.3
258
 
 
259
  irrelevant_signals = [
260
  "weather", "recipe", "joke", "game score",
261
  "political", "stock market",
 
286
  conversation_history: Previous messages
287
 
288
  Returns:
289
+ RewardBreakdown with ALL scores in strict (0.0, 1.0) open interval.
290
+ The RewardBreakdown model auto-clamps all score fields via validators.
291
  """
292
+ # Score each axis β€” safe_score guarantees (0, 1)
293
+ correctness = safe_score(_score_correctness(
294
  response,
295
  grading_rubric.get("correctness", {}),
296
  ))
297
+ tone = safe_score(_score_tone(
298
  response,
299
  grading_rubric.get("tone", {}),
300
  ))
301
+ completeness = safe_score(_score_completeness(
302
  response,
303
  grading_rubric.get("completeness", {}),
304
  ticket_info,
 
320
  + completeness * w_completeness
321
  )
322
 
323
+ # Apply penalties β€” safe_score guarantees strict (0, 1)
324
+ total = safe_score(weighted + penalties)
325
 
326
  # The efficiency field re-uses the weighted pre-penalty score
327
+ efficiency = safe_score(weighted)
328
 
329
  # Debug logging
330
+ logger.info(
331
+ f"[GRADER] correctness={correctness:.4f} tone={tone:.4f} "
332
+ f"completeness={completeness:.4f} weighted={weighted:.4f} "
333
+ f"penalties={penalties:.4f} total={total:.4f}"
334
+ )
335
 
336
  # Build explanation
337
  parts = []
 
342
  parts.append(f"Penalties: {penalties:.4f}")
343
  parts.append(f"Total: {total:.4f}")
344
 
345
+ # RewardBreakdown auto-clamps all score fields via field_validator
346
  return RewardBreakdown(
347
+ correctness=correctness,
348
+ tone=tone,
349
+ completeness=completeness,
350
+ efficiency=efficiency,
351
  penalties=round(penalties, 4),
352
+ total=total,
353
  explanation=" | ".join(parts),
354
  )
inference.py CHANGED
@@ -74,40 +74,94 @@ logging.basicConfig(
74
  logger = logging.getLogger(__name__)
75
 
76
 
77
- def _strict_score(value: Any) -> float:
78
- """Normalize any numeric-like score to strict open interval (0, 1).
 
 
 
 
 
 
 
 
79
 
80
  CRITICAL: Every score passed to the evaluator MUST satisfy 0 < score < 1.
81
  This function is the last line of defence.
 
 
 
 
 
 
 
82
  """
 
 
 
 
 
 
 
83
  try:
84
  numeric = float(value)
85
  except (TypeError, ValueError):
86
- numeric = 0.5
87
  # Guard against NaN / Inf
88
  if numeric != numeric or numeric == float('inf') or numeric == float('-inf'):
89
- numeric = 0.5
90
- clamped = max(0.0001, min(0.9999, numeric))
91
- print(f"[DEBUG] _strict_score: input={value!r} -> {clamped:.4f}")
92
- return clamped
93
 
94
 
95
  def _sanitize_task_result(task_result: Dict[str, Any]) -> Dict[str, Any]:
96
  """Ensure task result contains evaluator-safe score fields.
97
 
98
- CRITICAL: total_reward and avg_reward MUST both be in strict (0, 1).
99
  The evaluator checks per-task scores and rejects 0.0 or 1.0.
100
  """
101
  safe = dict(task_result)
102
  safe["steps"] = int(safe.get("steps", 0) or 0)
103
- safe["total_reward"] = _strict_score(safe.get("total_reward", 0.5))
104
- safe["avg_reward"] = _strict_score(safe.get("avg_reward", 0.5))
105
  safe["elapsed"] = float(safe.get("elapsed", 0.0) or 0.0)
106
- print(f"[DEBUG] _sanitize_task_result: task={safe.get('task_id')} "
107
- f"total_reward={safe['total_reward']:.4f} avg_reward={safe['avg_reward']:.4f}")
 
 
 
 
 
 
108
  return safe
109
 
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  # ──────────────────────────────────────────────────────────────────
112
  # LLM Client (uses OpenAI SDK β€” required by checklist item 4)
113
  # ──────────────────────────────────────────────────────────────────
@@ -305,6 +359,7 @@ def build_messages(
305
  def run_task(env_client: EnvClient, task_id: str) -> Dict[str, Any]:
306
  """
307
  Run a single task to completion and return results.
 
308
  """
309
  logger.info(f"[START] task_id={task_id}")
310
  start_time = time.time()
@@ -341,7 +396,7 @@ def run_task(env_client: EnvClient, task_id: str) -> Dict[str, Any]:
341
 
342
  step_count += 1
343
  # Guard against endpoint-side boundary values (0.0 or 1.0)
344
- step_reward = _strict_score(result.get("reward", 0.01))
345
  total_reward += step_reward
346
  done = result.get("done", False)
347
  obs = result.get("observation", {})
@@ -352,20 +407,20 @@ def run_task(env_client: EnvClient, task_id: str) -> Dict[str, Any]:
352
  logger.info(
353
  f"[STEP] task={task_id} step={step_count} "
354
  f"reward={step_reward:.4f} "
355
- f"correctness={reward_breakdown.get('correctness', 0):.2f} "
356
- f"tone={reward_breakdown.get('tone', 0):.2f} "
357
- f"completeness={reward_breakdown.get('completeness', 0):.2f} "
358
  f"done={done}"
359
  )
360
 
361
  # Compute average reward for this task β€” clamped to strict (0, 1)
362
- avg_reward = _strict_score(total_reward / max(step_count, 1))
363
  elapsed = time.time() - start_time
364
 
365
  # CRITICAL: total_reward accumulates across steps and WILL exceed 1.0
366
  # (e.g. 3 steps Γ— 0.5 = 1.5). The evaluator checks per-task values,
367
- # so we MUST clamp it to strict (0, 1) before output.
368
- safe_total_reward = _strict_score(total_reward / max(step_count, 1))
369
 
370
  logger.info(
371
  f"[END] task_id={task_id} "
@@ -381,6 +436,7 @@ def run_task(env_client: EnvClient, task_id: str) -> Dict[str, Any]:
381
  "steps": step_count,
382
  "total_reward": safe_total_reward,
383
  "avg_reward": avg_reward,
 
384
  "elapsed": elapsed,
385
  }
386
 
@@ -407,20 +463,8 @@ def main():
407
  """Write sanitized results and return sanitized final score."""
408
  sanitized_results = [_sanitize_task_result(r) for r in results]
409
 
410
- # Add 'score' alias β€” evaluator may read this field name
411
- for r in sanitized_results:
412
- r["score"] = _strict_score(r.get("avg_reward", 0.5))
413
-
414
  total_avg = sum(r["avg_reward"] for r in sanitized_results)
415
- final = _strict_score(total_avg / len(sanitized_results)) if sanitized_results else 0.5
416
-
417
- # FINAL VALIDATION β€” catch any remaining boundary values
418
- for r in sanitized_results:
419
- for key in ["total_reward", "avg_reward", "score"]:
420
- val = r.get(key)
421
- if val is not None and (val <= 0.0 or val >= 1.0):
422
- logger.error(f"[CRITICAL] {r.get('task_id')}.{key}={val} VIOLATES (0,1)! Clamping.")
423
- r[key] = _strict_score(val)
424
 
425
  output = {
426
  "final_score": final,
@@ -432,11 +476,27 @@ def main():
432
  },
433
  }
434
 
 
 
 
435
  logger.info(f"[DEBUG] Final output JSON scores:")
436
- logger.info(f" final_score: {final:.6f}")
437
- for r in sanitized_results:
438
- logger.info(f" {r.get('task_id')}: total_reward={r.get('total_reward'):.6f} "
439
- f"avg_reward={r.get('avg_reward'):.6f} score={r.get('score'):.6f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
440
 
441
  try:
442
  os.makedirs("outputs", exist_ok=True)
@@ -446,7 +506,7 @@ def main():
446
  except Exception as e:
447
  logger.error(f"[ERROR] Failed to save results: {e}")
448
 
449
- return final
450
 
451
  # Wait for environment to be ready
452
  logger.info("[START] Waiting for environment server...")
@@ -464,6 +524,7 @@ def main():
464
  "steps": 0,
465
  "total_reward": 0.01,
466
  "avg_reward": 0.01,
 
467
  "elapsed": 0.0,
468
  "error": "environment_unavailable",
469
  }
@@ -486,6 +547,7 @@ def main():
486
  "steps": 0,
487
  "total_reward": 0.01,
488
  "avg_reward": 0.01,
 
489
  "elapsed": 0.0,
490
  "error": str(e),
491
  })
@@ -507,7 +569,7 @@ def main():
507
  )
508
  total_avg += r.get("avg_reward", 0)
509
 
510
- final_score = _strict_score(total_avg / len(results)) if results else 0.01
511
  logger.info("-" * 60)
512
  logger.info(f" FINAL SCORE: {final_score:.4f} (0.0 -- 1.0)")
513
  logger.info("=" * 60)
 
74
  logger = logging.getLogger(__name__)
75
 
76
 
77
+ # ──────────────────────────────────────────────────────────────────
78
+ # Safe score utility β€” THE last line of defence
79
+ # ──────────────────────────────────────────────────────────────────
80
+
81
+ _SCORE_FLOOR = 0.0001
82
+ _SCORE_CEIL = 0.9999
83
+
84
+
85
+ def safe_score(value: Any) -> float:
86
+ """Normalize any value to strict open interval (0, 1).
87
 
88
  CRITICAL: Every score passed to the evaluator MUST satisfy 0 < score < 1.
89
  This function is the last line of defence.
90
+
91
+ Rules:
92
+ * None β†’ 0.5
93
+ * Strings / non-numeric β†’ 0.5
94
+ * NaN / Β±Inf β†’ 0.5
95
+ * ≀ 0 β†’ 0.0001
96
+ * β‰₯ 1 β†’ 0.9999
97
  """
98
+ if value is None:
99
+ return 0.5
100
+ if isinstance(value, str):
101
+ try:
102
+ value = float(value)
103
+ except (TypeError, ValueError):
104
+ return 0.5
105
  try:
106
  numeric = float(value)
107
  except (TypeError, ValueError):
108
+ return 0.5
109
  # Guard against NaN / Inf
110
  if numeric != numeric or numeric == float('inf') or numeric == float('-inf'):
111
+ return 0.5
112
+ return max(_SCORE_FLOOR, min(_SCORE_CEIL, numeric))
 
 
113
 
114
 
115
  def _sanitize_task_result(task_result: Dict[str, Any]) -> Dict[str, Any]:
116
  """Ensure task result contains evaluator-safe score fields.
117
 
118
+ CRITICAL: total_reward, avg_reward, and score MUST all be in strict (0, 1).
119
  The evaluator checks per-task scores and rejects 0.0 or 1.0.
120
  """
121
  safe = dict(task_result)
122
  safe["steps"] = int(safe.get("steps", 0) or 0)
123
+ safe["total_reward"] = safe_score(safe.get("total_reward", 0.5))
124
+ safe["avg_reward"] = safe_score(safe.get("avg_reward", 0.5))
125
  safe["elapsed"] = float(safe.get("elapsed", 0.0) or 0.0)
126
+ # ALWAYS include a 'score' field β€” evaluator may read this
127
+ safe["score"] = safe_score(safe.get("score", safe.get("avg_reward", 0.5)))
128
+ logger.info(
129
+ f"[DEBUG] _sanitize: task={safe.get('task_id')} "
130
+ f"total_reward={safe['total_reward']:.4f} "
131
+ f"avg_reward={safe['avg_reward']:.4f} "
132
+ f"score={safe['score']:.4f}"
133
+ )
134
  return safe
135
 
136
 
137
+ def _sanitize_full_output(output: Dict[str, Any]) -> Dict[str, Any]:
138
+ """Final global sanitization pass over the entire output dict.
139
+
140
+ Walks all task_results and clamps every numeric score field.
141
+ This is the ABSOLUTE LAST safeguard before JSON serialization.
142
+ """
143
+ sanitized = dict(output)
144
+
145
+ # Clamp final_score
146
+ sanitized["final_score"] = safe_score(sanitized.get("final_score", 0.5))
147
+
148
+ # Clamp every score in every task result
149
+ score_keys = ["total_reward", "avg_reward", "score"]
150
+ for r in sanitized.get("task_results", []):
151
+ for key in score_keys:
152
+ if key in r:
153
+ val = r[key]
154
+ clamped = safe_score(val)
155
+ if val != clamped:
156
+ logger.warning(
157
+ f"[SANITIZE] {r.get('task_id')}.{key}: "
158
+ f"{val} β†’ {clamped} (was out of bounds)"
159
+ )
160
+ r[key] = clamped
161
+
162
+ return sanitized
163
+
164
+
165
  # ──────────────────────────────────────────────────────────────────
166
  # LLM Client (uses OpenAI SDK β€” required by checklist item 4)
167
  # ──────────────────────────────────────────────────────────────────
 
359
  def run_task(env_client: EnvClient, task_id: str) -> Dict[str, Any]:
360
  """
361
  Run a single task to completion and return results.
362
+ All scores are clamped to strict (0, 1) before returning.
363
  """
364
  logger.info(f"[START] task_id={task_id}")
365
  start_time = time.time()
 
396
 
397
  step_count += 1
398
  # Guard against endpoint-side boundary values (0.0 or 1.0)
399
+ step_reward = safe_score(result.get("reward", 0.01))
400
  total_reward += step_reward
401
  done = result.get("done", False)
402
  obs = result.get("observation", {})
 
407
  logger.info(
408
  f"[STEP] task={task_id} step={step_count} "
409
  f"reward={step_reward:.4f} "
410
+ f"correctness={safe_score(reward_breakdown.get('correctness', 0.5)):.2f} "
411
+ f"tone={safe_score(reward_breakdown.get('tone', 0.5)):.2f} "
412
+ f"completeness={safe_score(reward_breakdown.get('completeness', 0.5)):.2f} "
413
  f"done={done}"
414
  )
415
 
416
  # Compute average reward for this task β€” clamped to strict (0, 1)
417
+ avg_reward = safe_score(total_reward / max(step_count, 1))
418
  elapsed = time.time() - start_time
419
 
420
  # CRITICAL: total_reward accumulates across steps and WILL exceed 1.0
421
  # (e.g. 3 steps Γ— 0.5 = 1.5). The evaluator checks per-task values,
422
+ # so we MUST use avg_reward (which is already clamped) for total_reward too.
423
+ safe_total_reward = safe_score(total_reward / max(step_count, 1))
424
 
425
  logger.info(
426
  f"[END] task_id={task_id} "
 
436
  "steps": step_count,
437
  "total_reward": safe_total_reward,
438
  "avg_reward": avg_reward,
439
+ "score": avg_reward, # Always include 'score' field
440
  "elapsed": elapsed,
441
  }
442
 
 
463
  """Write sanitized results and return sanitized final score."""
464
  sanitized_results = [_sanitize_task_result(r) for r in results]
465
 
 
 
 
 
466
  total_avg = sum(r["avg_reward"] for r in sanitized_results)
467
+ final = safe_score(total_avg / len(sanitized_results)) if sanitized_results else 0.5
 
 
 
 
 
 
 
 
468
 
469
  output = {
470
  "final_score": final,
 
476
  },
477
  }
478
 
479
+ # FINAL GLOBAL SANITIZATION β€” the absolute last safeguard
480
+ output = _sanitize_full_output(output)
481
+
482
  logger.info(f"[DEBUG] Final output JSON scores:")
483
+ logger.info(f" final_score: {output['final_score']:.6f}")
484
+ for r in output["task_results"]:
485
+ logger.info(
486
+ f" {r.get('task_id')}: total_reward={r.get('total_reward'):.6f} "
487
+ f"avg_reward={r.get('avg_reward'):.6f} score={r.get('score'):.6f}"
488
+ )
489
+
490
+ # ASSERTION: Catch any remaining violations (log & auto-correct, never crash)
491
+ for r in output["task_results"]:
492
+ for key in ["total_reward", "avg_reward", "score"]:
493
+ val = r.get(key)
494
+ if val is not None and (val <= 0.0 or val >= 1.0):
495
+ logger.error(
496
+ f"[CRITICAL] ASSERTION FAILED: {r.get('task_id')}.{key}={val} "
497
+ f"VIOLATES strict (0,1)! Auto-correcting..."
498
+ )
499
+ r[key] = safe_score(val)
500
 
501
  try:
502
  os.makedirs("outputs", exist_ok=True)
 
506
  except Exception as e:
507
  logger.error(f"[ERROR] Failed to save results: {e}")
508
 
509
+ return output["final_score"]
510
 
511
  # Wait for environment to be ready
512
  logger.info("[START] Waiting for environment server...")
 
524
  "steps": 0,
525
  "total_reward": 0.01,
526
  "avg_reward": 0.01,
527
+ "score": 0.01,
528
  "elapsed": 0.0,
529
  "error": "environment_unavailable",
530
  }
 
547
  "steps": 0,
548
  "total_reward": 0.01,
549
  "avg_reward": 0.01,
550
+ "score": 0.01,
551
  "elapsed": 0.0,
552
  "error": str(e),
553
  })
 
569
  )
570
  total_avg += r.get("avg_reward", 0)
571
 
572
+ final_score = safe_score(total_avg / len(results)) if results else 0.01
573
  logger.info("-" * 60)
574
  logger.info(f" FINAL SCORE: {final_score:.4f} (0.0 -- 1.0)")
575
  logger.info("=" * 60)
models.py CHANGED
@@ -3,12 +3,55 @@ Pydantic models for the Customer Support Ticket Resolution Environment.
3
 
4
  Defines the Action, Observation, State, and Reward models used for
5
  type-safe communication between the agent and environment.
 
 
 
 
6
  """
7
 
8
  from enum import Enum
9
  from typing import Any, Dict, List, Optional
10
 
11
- from pydantic import BaseModel, Field
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
 
14
  # ──────────────────────────────────────────────────────────────────
@@ -126,29 +169,30 @@ class SupportObservation(BaseModel):
126
 
127
 
128
  # ──────────────────────────────────────────────────────────────────
129
- # Reward Model
130
  # ──────────────────────────────────────────────────────────────────
131
 
132
  class RewardBreakdown(BaseModel):
133
- """Detailed breakdown of the reward score."""
 
 
 
 
 
134
  correctness: float = Field(
135
  default=0.01,
136
- gt=0.0, lt=1.0,
137
  description="Score for factual correctness β€” strict (0, 1)",
138
  )
139
  tone: float = Field(
140
  default=0.01,
141
- gt=0.0, lt=1.0,
142
  description="Score for professional tone β€” strict (0, 1)",
143
  )
144
  completeness: float = Field(
145
  default=0.01,
146
- gt=0.0, lt=1.0,
147
  description="Score for response completeness β€” strict (0, 1)",
148
  )
149
  efficiency: float = Field(
150
  default=0.01,
151
- gt=0.0, lt=1.0,
152
  description="Score for resolution efficiency β€” strict (0, 1)",
153
  )
154
  penalties: float = Field(
@@ -158,7 +202,6 @@ class RewardBreakdown(BaseModel):
158
  )
159
  total: float = Field(
160
  default=0.01,
161
- gt=0.0, lt=1.0,
162
  description="Overall weighted score β€” strict (0, 1)",
163
  )
164
  explanation: str = Field(
@@ -166,6 +209,15 @@ class RewardBreakdown(BaseModel):
166
  description="Human-readable explanation of the score",
167
  )
168
 
 
 
 
 
 
 
 
 
 
169
 
170
  # ──────────────────────────────────────────────────────────────────
171
  # State Model
@@ -194,12 +246,18 @@ class SupportState(BaseModel):
194
 
195
 
196
  # ──────────────────────────────────────────────────────────────────
197
- # Step Result (matches OpenEnv convention)
198
 # ──────────────────────────────────────────────────────────────────
199
 
200
  class StepResult(BaseModel):
201
  """Result returned from step(), matching OpenEnv convention."""
202
  observation: SupportObservation
203
- reward: float = Field(gt=0.0, lt=1.0)
204
  done: bool
205
  info: Dict[str, Any] = Field(default_factory=dict)
 
 
 
 
 
 
 
3
 
4
  Defines the Action, Observation, State, and Reward models used for
5
  type-safe communication between the agent and environment.
6
+
7
+ IMPORTANT: Score fields use custom validators that AUTO-CLAMP to (0, 1)
8
+ instead of raising ValidationError. This prevents the evaluator from ever
9
+ seeing boundary values (0.0 or 1.0).
10
  """
11
 
12
  from enum import Enum
13
  from typing import Any, Dict, List, Optional
14
 
15
+ from pydantic import BaseModel, Field, field_validator
16
+
17
+
18
# ──────────────────────────────────────────────────────────────────
# Central safe-score utility β€” shared by all modules
# ──────────────────────────────────────────────────────────────────

# Strict open-interval bounds: scores must never be exactly 0.0 or 1.0.
_SCORE_FLOOR = 0.0001
_SCORE_CEIL = 0.9999


def safe_score(value: Any) -> float:
    """Clamp *any* value into the strict open interval (0, 1).

    This is the SINGLE source of truth for score normalisation across
    the entire project. Every score must pass through this function
    before leaving any boundary (model field, API response, JSON output).

    Rules:
        * ``None`` -> 0.5 (safe default)
        * non-numeric strings / unconvertible objects -> 0.5
        * NaN / +/-Inf -> 0.5
        * <= 0 -> 0.0001
        * >= 1 -> 0.9999

    Args:
        value: Anything that might represent a score (float, int,
            numeric string, ``None``, ...).

    Returns:
        A float strictly inside (0, 1).
    """
    if value is None:
        return 0.5
    # A single float() call already handles ints, floats, and numeric
    # strings alike β€” a separate isinstance(value, str) branch would be
    # redundant (float("0.3") parses; float("abc") raises ValueError).
    try:
        v = float(value)
    except (TypeError, ValueError):
        return 0.5
    # Reject NaN (the only value unequal to itself) and both infinities
    # before clamping, so the comparison below is well-defined.
    if v != v or v == float("inf") or v == float("-inf"):
        return 0.5
    return max(_SCORE_FLOOR, min(_SCORE_CEIL, v))
55
 
56
 
57
  # ──────────────────────────────────────────────────────────────────
 
169
 
170
 
171
  # ──────────────────────────────────────────────────────────────────
172
+ # Reward Model β€” uses auto-clamping validators instead of gt/lt
173
  # ──────────────────────────────────────────────────────────────────
174
 
175
  class RewardBreakdown(BaseModel):
176
+ """Detailed breakdown of the reward score.
177
+
178
+ IMPORTANT: All score fields auto-clamp to strict (0, 1) via validators.
179
+ This prevents Pydantic from raising ValidationError on boundary values
180
+ and ensures the evaluator NEVER receives 0.0 or 1.0.
181
+ """
182
  correctness: float = Field(
183
  default=0.01,
 
184
  description="Score for factual correctness β€” strict (0, 1)",
185
  )
186
  tone: float = Field(
187
  default=0.01,
 
188
  description="Score for professional tone β€” strict (0, 1)",
189
  )
190
  completeness: float = Field(
191
  default=0.01,
 
192
  description="Score for response completeness β€” strict (0, 1)",
193
  )
194
  efficiency: float = Field(
195
  default=0.01,
 
196
  description="Score for resolution efficiency β€” strict (0, 1)",
197
  )
198
  penalties: float = Field(
 
202
  )
203
  total: float = Field(
204
  default=0.01,
 
205
  description="Overall weighted score β€” strict (0, 1)",
206
  )
207
  explanation: str = Field(
 
209
  description="Human-readable explanation of the score",
210
  )
211
 
212
+ @field_validator(
213
+ "correctness", "tone", "completeness", "efficiency", "total",
214
+ mode="before",
215
+ )
216
+ @classmethod
217
+ def _clamp_score(cls, v: Any) -> float:
218
+ """Auto-clamp score fields to strict (0, 1)."""
219
+ return safe_score(v)
220
+
221
 
222
  # ──────────────────────────────────────────────────────────────────
223
  # State Model
 
246
 
247
 
248
  # ──────────────────────────────────────────────────────────────────
249
+ # Step Result (matches OpenEnv convention) β€” auto-clamps reward
250
 # ──────────────────────────────────────────────────────────────────
251
 
252
  class StepResult(BaseModel):
253
  """Result returned from step(), matching OpenEnv convention."""
254
  observation: SupportObservation
255
+ reward: float = Field(default=0.01)
256
  done: bool
257
  info: Dict[str, Any] = Field(default_factory=dict)
258
+
259
+ @field_validator("reward", mode="before")
260
+ @classmethod
261
+ def _clamp_reward(cls, v: Any) -> float:
262
+ """Auto-clamp reward to strict (0, 1)."""
263
+ return safe_score(v)
pyproject.toml CHANGED
@@ -37,4 +37,4 @@ packages = [
37
  ]
38
 
39
  [tool.pyright]
40
- extraPaths = ["."]
 
37
  ]
38
 
39
  [tool.pyright]
40
+ extraPaths = [".", "openenv"]
server/app.py CHANGED
@@ -23,24 +23,13 @@ from typing import Any, Dict, Optional
23
 
24
  from fastapi import FastAPI, HTTPException
25
  from fastapi.middleware.cors import CORSMiddleware
26
- from pydantic import BaseModel, Field
27
 
28
- from models import SupportAction, SupportObservation, SupportState
29
  from server.environment import CustomerSupportEnvironment
30
  from tasks import TASK_IDS, TASKS
31
 
32
 
33
- def _safe_score(value) -> float:
34
- """Clamp any value to strict (0, 1) for evaluator safety."""
35
- try:
36
- v = float(value)
37
- except (TypeError, ValueError):
38
- v = 0.5
39
- if v != v or v == float('inf') or v == float('-inf'):
40
- v = 0.5
41
- return max(0.0001, min(0.9999, v))
42
-
43
-
44
  # ──────────────────────────────────────────────────────────────────
45
  # Request / Response schemas
46
  # ──────────────────────────────────────────────────────────────────
@@ -55,11 +44,23 @@ class StepRequest(BaseModel):
55
 
56
 
57
  class StepResponse(BaseModel):
 
 
 
 
 
 
58
  observation: SupportObservation
59
- reward: float = Field(gt=0.0, lt=1.0)
60
  done: bool
61
  info: Dict[str, Any]
62
 
 
 
 
 
 
 
63
 
64
  class TaskInfo(BaseModel):
65
  task_id: str
@@ -154,17 +155,21 @@ def step(request: StepRequest):
154
  """Execute an agent action and return the result."""
155
  try:
156
  obs, reward, done, info = env.step(action=request.action)
157
- # Clamp reward to strict (0, 1) β€” evaluator rejects 0.0 or 1.0
158
- safe_reward = _safe_score(reward)
 
 
 
159
  # Also clamp all scores inside reward_breakdown in info
160
  if "reward_breakdown" in info and isinstance(info["reward_breakdown"], dict):
161
  rb = info["reward_breakdown"]
162
  for key in ["correctness", "tone", "completeness", "efficiency", "total"]:
163
  if key in rb:
164
- rb[key] = _safe_score(rb[key])
 
165
  return StepResponse(
166
  observation=obs,
167
- reward=safe_reward,
168
  done=done,
169
  info=info,
170
  )
 
23
 
24
  from fastapi import FastAPI, HTTPException
25
  from fastapi.middleware.cors import CORSMiddleware
26
+ from pydantic import BaseModel, Field, field_validator
27
 
28
+ from models import SupportAction, SupportObservation, SupportState, safe_score
29
  from server.environment import CustomerSupportEnvironment
30
  from tasks import TASK_IDS, TASKS
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
33
  # ──────────────────────────────────────────────────────────────────
34
  # Request / Response schemas
35
  # ──────────────────────────────────────────────────────────────────
 
44
 
45
 
46
  class StepResponse(BaseModel):
47
+ """Response from the /step endpoint.
48
+
49
+ Uses an auto-clamping validator instead of gt/lt constraints.
50
+ This prevents Pydantic from raising ValidationError on boundary
51
+ values and ensures the evaluator NEVER receives 0.0 or 1.0.
52
+ """
53
  observation: SupportObservation
54
+ reward: float = Field(default=0.01, description="Step reward in strict (0, 1)")
55
  done: bool
56
  info: Dict[str, Any]
57
 
58
+ @field_validator("reward", mode="before")
59
+ @classmethod
60
+ def _clamp_reward(cls, v: Any) -> float:
61
+ """Auto-clamp reward to strict (0, 1)."""
62
+ return safe_score(v)
63
+
64
 
65
  class TaskInfo(BaseModel):
66
  task_id: str
 
155
  """Execute an agent action and return the result."""
156
  try:
157
  obs, reward, done, info = env.step(action=request.action)
158
+
159
+ # Triple-safe: clamp reward via safe_score before passing to StepResponse
160
+ # (StepResponse also has its own auto-clamping validator)
161
+ clamped_reward = safe_score(reward)
162
+
163
  # Also clamp all scores inside reward_breakdown in info
164
  if "reward_breakdown" in info and isinstance(info["reward_breakdown"], dict):
165
  rb = info["reward_breakdown"]
166
  for key in ["correctness", "tone", "completeness", "efficiency", "total"]:
167
  if key in rb:
168
+ rb[key] = safe_score(rb[key])
169
+
170
  return StepResponse(
171
  observation=obs,
172
+ reward=clamped_reward,
173
  done=done,
174
  info=info,
175
  )
server/environment.py CHANGED
@@ -11,6 +11,7 @@ Implements the standard OpenEnv interface:
11
  - state() β†’ SupportState
12
  """
13
 
 
14
  import sys
15
  import os
16
  from typing import Any, Dict, List, Optional, Tuple
@@ -34,10 +35,13 @@ from models import (
34
  TicketInfo,
35
  TicketPriority,
36
  TicketStatus,
 
37
  )
38
  from grader import grade_response
39
  from tasks import TASKS, TASK_IDS, get_task
40
 
 
 
41
 
42
  class CustomerSupportEnvironment:
43
  """
@@ -129,6 +133,7 @@ class CustomerSupportEnvironment:
129
 
130
  Returns:
131
  Tuple of (observation, reward, done, info).
 
132
  """
133
  if self._state is None or self._state.done:
134
  raise RuntimeError(
@@ -155,9 +160,12 @@ class CustomerSupportEnvironment:
155
  conversation_history=[m.model_dump() for m in self._conversation],
156
  )
157
 
158
- # Clamp step reward to strict (0, 1) β€” never exactly 0.0 or 1.0
159
- step_reward = max(0.0001, min(0.9999, reward_breakdown.total))
160
- print(f"[DEBUG] environment.step: raw_total={reward_breakdown.total:.6f} step_reward={step_reward:.6f}")
 
 
 
161
  self._cumulative_reward += step_reward
162
  self._state.cumulative_reward = self._cumulative_reward
163
  self._state.reward_history.append(reward_breakdown)
@@ -183,7 +191,6 @@ class CustomerSupportEnvironment:
183
  next_msg = follow_ups[self._follow_up_index]
184
  self._follow_up_index += 1
185
  else:
186
- # Generate a contextual customer acknowledgement
187
  next_msg = self._generate_contextual_reply(action)
188
 
189
  self._current_message = next_msg
@@ -196,12 +203,17 @@ class CustomerSupportEnvironment:
196
  )
197
 
198
  # Compute average reward β€” clamped to strict (0, 1)
199
- avg_reward = self._cumulative_reward / self._state.step_count
200
- avg_reward = max(0.0001, min(0.9999, avg_reward))
201
 
202
  # Build info dict β€” all scores strictly in (0, 1)
 
 
 
 
 
 
203
  info = {
204
- "reward_breakdown": reward_breakdown.model_dump(),
205
  "step_reward": step_reward,
206
  "cumulative_reward": self._cumulative_reward,
207
  "average_reward": avg_reward,
 
11
  - state() β†’ SupportState
12
  """
13
 
14
+ import logging
15
  import sys
16
  import os
17
  from typing import Any, Dict, List, Optional, Tuple
 
35
  TicketInfo,
36
  TicketPriority,
37
  TicketStatus,
38
+ safe_score,
39
  )
40
  from grader import grade_response
41
  from tasks import TASKS, TASK_IDS, get_task
42
 
43
+ logger = logging.getLogger(__name__)
44
+
45
 
46
  class CustomerSupportEnvironment:
47
  """
 
133
 
134
  Returns:
135
  Tuple of (observation, reward, done, info).
136
+ reward is ALWAYS in strict (0, 1).
137
  """
138
  if self._state is None or self._state.done:
139
  raise RuntimeError(
 
160
  conversation_history=[m.model_dump() for m in self._conversation],
161
  )
162
 
163
+ # Clamp step reward to strict (0, 1) β€” safe_score guarantees this
164
+ step_reward = safe_score(reward_breakdown.total)
165
+ logger.info(
166
+ f"[ENV] step: raw_total={reward_breakdown.total:.6f} "
167
+ f"step_reward={step_reward:.6f}"
168
+ )
169
  self._cumulative_reward += step_reward
170
  self._state.cumulative_reward = self._cumulative_reward
171
  self._state.reward_history.append(reward_breakdown)
 
191
  next_msg = follow_ups[self._follow_up_index]
192
  self._follow_up_index += 1
193
  else:
 
194
  next_msg = self._generate_contextual_reply(action)
195
 
196
  self._current_message = next_msg
 
203
  )
204
 
205
  # Compute average reward β€” clamped to strict (0, 1)
206
+ avg_reward = safe_score(self._cumulative_reward / self._state.step_count)
 
207
 
208
  # Build info dict β€” all scores strictly in (0, 1)
209
+ # Clamp every numeric score in reward_breakdown before exposing
210
+ rb_dict = reward_breakdown.model_dump()
211
+ for key in ["correctness", "tone", "completeness", "efficiency", "total"]:
212
+ if key in rb_dict:
213
+ rb_dict[key] = safe_score(rb_dict[key])
214
+
215
  info = {
216
+ "reward_breakdown": rb_dict,
217
  "step_reward": step_reward,
218
  "cumulative_reward": self._cumulative_reward,
219
  "average_reward": avg_reward,
validate.py CHANGED
@@ -5,7 +5,7 @@ Runs through all 3 tasks with deterministic responses and verifies:
5
  βœ“ reset() returns valid SupportObservation
6
  βœ“ step() returns (observation, reward, done, info) with correct types
7
  βœ“ state() returns valid SupportState
8
- βœ“ Rewards are non-constant and in [0.0, 1.0]
9
  βœ“ Episodes terminate correctly
10
  βœ“ Grader produces varying scores for different responses
11
 
@@ -19,7 +19,7 @@ import os
19
  # Ensure project root is on path
20
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
21
 
22
- from models import SupportAction, SupportObservation, SupportState, RewardBreakdown
23
  from server.environment import CustomerSupportEnvironment
24
  from tasks import TASK_IDS
25
 
@@ -66,9 +66,9 @@ def validate_task(env: CustomerSupportEnvironment, task_id: str, responses: list
66
  rewards.append(reward)
67
  breakdown = info.get("reward_breakdown", {})
68
  print(f" βœ“ step({i+1}) β†’ reward={reward:.4f} | "
69
- f"correctness={breakdown.get('correctness', 0):.2f} "
70
- f"tone={breakdown.get('tone', 0):.2f} "
71
- f"completeness={breakdown.get('completeness', 0):.2f} "
72
  f"done={done}")
73
 
74
  if done:
@@ -82,7 +82,7 @@ def validate_task(env: CustomerSupportEnvironment, task_id: str, responses: list
82
  return {
83
  "task_id": task_id,
84
  "rewards": rewards,
85
- "avg_reward": max(0.0001, min(0.9999, sum(rewards) / len(rewards))) if rewards else 0.5,
86
  "steps": len(rewards),
87
  }
88
 
@@ -137,6 +137,11 @@ def validate_grader_variance():
137
  print(f" βœ“ Grader produces varying scores (NOT constant)")
138
  print(f" βœ“ Good > Bad > Irrelevant ordering confirmed")
139
 
 
 
 
 
 
140
 
141
  def main():
142
  print("=" * 50)
@@ -208,8 +213,7 @@ def main():
208
  for r in all_results:
209
  print(f" βœ“ {r['task_id']:20s} β†’ avg_reward={r['avg_reward']:.4f} steps={r['steps']}")
210
  total_avg += r['avg_reward']
211
- overall = total_avg / len(all_results) if all_results else 0.01
212
- overall = max(0.0001, min(0.9999, overall))
213
  print(f"\n Overall Score: {overall:.4f}")
214
  print(f"\n βœ… ALL VALIDATIONS PASSED!")
215
  return 0
 
5
  βœ“ reset() returns valid SupportObservation
6
  βœ“ step() returns (observation, reward, done, info) with correct types
7
  βœ“ state() returns valid SupportState
8
+ βœ“ Rewards are non-constant and in (0.0, 1.0) strict open interval
9
  βœ“ Episodes terminate correctly
10
  βœ“ Grader produces varying scores for different responses
11
 
 
19
  # Ensure project root is on path
20
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
21
 
22
+ from models import SupportAction, SupportObservation, SupportState, RewardBreakdown, safe_score
23
  from server.environment import CustomerSupportEnvironment
24
  from tasks import TASK_IDS
25
 
 
66
  rewards.append(reward)
67
  breakdown = info.get("reward_breakdown", {})
68
  print(f" βœ“ step({i+1}) β†’ reward={reward:.4f} | "
69
+ f"correctness={safe_score(breakdown.get('correctness', 0.5)):.2f} "
70
+ f"tone={safe_score(breakdown.get('tone', 0.5)):.2f} "
71
+ f"completeness={safe_score(breakdown.get('completeness', 0.5)):.2f} "
72
  f"done={done}")
73
 
74
  if done:
 
82
  return {
83
  "task_id": task_id,
84
  "rewards": rewards,
85
+ "avg_reward": safe_score(sum(rewards) / len(rewards)) if rewards else 0.5,
86
  "steps": len(rewards),
87
  }
88
 
 
137
  print(f" βœ“ Grader produces varying scores (NOT constant)")
138
  print(f" βœ“ Good > Bad > Irrelevant ordering confirmed")
139
 
140
+ # Verify ALL rewards are strictly in (0, 1)
141
+ for label, r in [("good", good_reward), ("bad", bad_reward), ("irr", irr_reward)]:
142
+ assert 0.0 < r < 1.0, f"{label} reward {r} violates strict (0, 1)!"
143
+ print(f" βœ“ All rewards strictly in (0, 1) open interval")
144
+
145
 
146
  def main():
147
  print("=" * 50)
 
213
  for r in all_results:
214
  print(f" βœ“ {r['task_id']:20s} β†’ avg_reward={r['avg_reward']:.4f} steps={r['steps']}")
215
  total_avg += r['avg_reward']
216
+ overall = safe_score(total_avg / len(all_results)) if all_results else 0.01
 
217
  print(f"\n Overall Score: {overall:.4f}")
218
  print(f"\n βœ… ALL VALIDATIONS PASSED!")
219
  return 0