vedkdev committed on
Commit dc990fa · verified · 1 Parent(s): bbd2278

Upload folder using huggingface_hub

Dockerfile CHANGED
@@ -14,4 +14,5 @@ COPY . .
  
  EXPOSE 8000
  
+ ENV ENABLE_WEB_INTERFACE=true
  CMD ["python", "-m", "server.app"]
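
The new `ENV` line only bakes a default into the image; `docker run -e ENABLE_WEB_INTERFACE=false ...` can still override it at launch. A minimal sketch of how `server/app.py` might consume the flag (the helper below is hypothetical, not from the repo):

```python
import os

def web_interface_enabled() -> bool:
    # Hypothetical helper: honors the Dockerfile default ("true")
    # and any runtime override passed via `docker run -e`.
    return os.environ.get("ENABLE_WEB_INTERFACE", "true").strip().lower() == "true"
```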
GRADING.md CHANGED
@@ -105,10 +105,10 @@ progress = max(-0.25, base_reward - spam_penalty)
  Binary exact-match scorer:
  
  ```text
- if action_type != "classify_flakiness": return 0.0
- if predicted not in {"flaky","stable"}: return 0.0
+ if action_type != "classify_flakiness": return 0.001
+ if predicted not in {"flaky","stable"}: return 0.001
  truth = task["label"] (default "flaky")
- terminal_score = 1.0 if predicted == truth else 0.0
+ terminal_score = 0.999 if predicted == truth else 0.001
  ```
  
  Notes:
@@ -130,7 +130,7 @@ Prediction and truth are normalized by:
  - `OD-VIC` -> `OD-Vic`
  - etc.
  
- If normalized value is not in valid set, score is `0.0`.
+ If normalized value is not in valid set, score is `0.001`.
  
  Truth category is the **first** category if semicolon-separated:
  
@@ -141,8 +141,8 @@ raw_truth = str(task["category"]).split(";")[0]
  ### 5.2 Similarity scoring
  
  ```text
- if predicted == truth: return 1.0
- else return similarity[predicted,truth] or similarity[truth,predicted] or 0.0
+ if predicted == truth: return 0.999
+ else return clamp(similarity[predicted,truth] or similarity[truth,predicted] or 0.0, 0.001, 0.999)
  ```
  
  The similarity matrix is loaded from `dataset/category_similarity.json`.
@@ -171,11 +171,11 @@ Any missing pair defaults to `0.0`.
  Hybrid weighted scorer:
  
  ```text
- if action_type != "propose_fix": return 0.0
- if proposed_fix is empty: return 0.0
+ if action_type != "propose_fix": return 0.001
+ if proposed_fix is empty: return 0.001
  
  total = 0.35 * pattern_score + 0.25 * apply_score + 0.40 * judge_score
- terminal_score = round(clamp(total, 0.0, 1.0), 4)
+ terminal_score = round(clamp(total, 0.001, 0.999), 4)
  ```
  
  ### 6.1 `pattern_score`
@@ -186,7 +186,7 @@ For category with pattern list:
  
  ```text
  matches = number of patterns found (case-insensitive substring)
- pattern_score = min(1.0, matches / max(1, len(patterns) * 0.4))
+ pattern_score = min(0.999, matches / max(1, len(patterns) * 0.4))
  ```
  
  If category has no pattern list:
@@ -202,11 +202,11 @@ Current pattern lists:
  ### 6.2 `apply_score` (`_check_diff_applies`)
  
  ```text
- if diff does not contain both '---' and '+++': return 0.0
+ if diff does not contain both '---' and '+++': return 0.001
  if sandbox_root missing or not existing: return 0.3
  else run: patch --dry-run -p1 -i <temp_patch>
- return 1.0 if patch exit code == 0
- return 0.0 otherwise
+ return 0.999 if patch exit code == 0
+ return 0.001 otherwise
  on exception: return 0.3
  ```
  
@@ -237,22 +237,22 @@ API/model resolution in judge:
  ### Example A: Task 1 correct classify early
  
  - `cumulative_progress = 0.05`
- - `terminal_score = 1.0`
+ - `terminal_score = 0.999`
  - `late_penalty = 0.0`
  - `wrong_dir_penalty = 0.0`
  
  ```text
- reward = clamp(0.05 + 1.0 - 0 - 0, 0, 1) = 1.0
+ reward = clamp(0.05 + 0.999 - 0 - 0, 0.001, 0.999) = 0.999
  ```
  
  ### Example B: Task 2 wrong category but some exploration
  
  - `cumulative_progress = 0.05`
- - `terminal_score = 0.0` (no similarity match)
+ - `terminal_score = 0.001` (no similarity match)
  - penalties = `0`
  
  ```text
- reward = clamp(0.05 + 0.0, 0, 1) = 0.05
+ reward = clamp(0.05 + 0.001, 0.001, 0.999) = 0.051
  ```
  
  ### Example C: Task 3 with weak fix and no API key
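
The reworked examples follow directly from the new clamp bounds. A quick check of the arithmetic, assuming the `clamp(x, lo, hi)` helper from the pseudocode above:

```python
import math

def clamp(x: float, lo: float, hi: float) -> float:
    # Same semantics as the pseudocode: pin x into [lo, hi].
    return max(lo, min(hi, x))

# Example A: 0.05 + 0.999 = 1.049 overshoots the ceiling, so the
# reward saturates at 0.999 rather than the old 1.0.
assert clamp(0.05 + 0.999 - 0 - 0, 0.001, 0.999) == 0.999

# Example B: 0.05 + 0.001 lands inside the bounds at roughly 0.051.
assert math.isclose(clamp(0.05 + 0.001, 0.001, 0.999), 0.051)
```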
env/environment.py CHANGED
@@ -95,9 +95,9 @@ class FlakySleuthEnv:
          wrong_dir_penalty = 0.2
  
          reward = min(
-             1.0,
+             0.999,
              max(
-                 0.0,
+                 0.001,
                  self.cumulative_progress + terminal_score - late_penalty - wrong_dir_penalty,
              ),
          )
@@ -117,7 +117,7 @@ class FlakySleuthEnv:
          if not done and self.step_count >= self.max_steps:
              done = True
              info = {
-                 "terminal_score": 0.0,
+                 "terminal_score": 0.001,
                  "progress_score": self.cumulative_progress,
                  "late_penalty": max(0, self.step_count - 15) * 0.05,
                  "timeout": True,
flakysleuth_build_plan.md CHANGED
@@ -527,7 +527,7 @@ class FlakySleuthEnv:
              and self.current_task.get("label") == "flaky"):
              wrong_dir_penalty = 0.2
  
-         reward = min(1.0, max(0.0,
+         reward = min(0.999, max(0.001,
              self.cumulative_progress + terminal_score
              - late_penalty - wrong_dir_penalty
          ))
@@ -594,16 +594,16 @@ class FlakySleuthEnv:
          test_file = task.get("test_file", "")
  
          if test_file and test_file in filepath:
-             return 0.07  # reading the actual test file
+             return 0.0017  # reading the actual test file
          if any(filepath.endswith(ext) for ext in (".py",)):
-             return 0.03  # any python file
-         return 0.01  # non-python file (requirements, config, etc.)
+             return 0.0013  # any python file
+         return 0.0011  # non-python file (requirements, config, etc.)
  
      def _search_relevance_reward(self, pattern: str) -> float:
          pattern_lower = pattern.lower()
          if any(sig in pattern_lower for sig in FLAKY_SIGNAL_PATTERNS):
-             return 0.04  # searching for known flakiness signals
-         return 0.01  # generic search
+             return 0.0014  # searching for known flakiness signals
+         return 0.0011  # generic search
  
      def _make_obs(self, tool_output=None) -> FlakySleuthObservation:
          task = self.current_task
@@ -639,7 +639,7 @@ def grade_action(action: FlakySleuthAction, task: dict) -> float:
          return grade_t2(action, task)
      elif tt == "fix_proposal":
          return grade_t3(action, task)
-     return 0.0
+     return 0.001
  ```
  
  ### 7.2 Task 1 Grader (`graders/task1_grader.py`)
@@ -650,16 +650,16 @@ from env.models import FlakySleuthAction
  def grade(action: FlakySleuthAction, task: dict) -> float:
      """Binary classification: flaky or stable. Exact match only."""
      if action.action_type != "classify_flakiness":
-         return 0.0
+         return 0.001
  
      predicted = action.argument.strip().lower()
      if predicted not in ("flaky", "stable"):
-         return 0.0
+         return 0.001
  
      # All IDoFT rows are flaky; stable examples are synthetically added
      # with label="stable" during dataset construction
      ground_truth = task.get("label", "flaky")
-     return 1.0 if predicted == ground_truth else 0.0
+     return 0.999 if predicted == ground_truth else 0.001
  ```
  
  ### 7.3 Task 2 Grader (`graders/task2_grader.py`)
@@ -677,7 +677,7 @@ with open(_SIM_PATH) as f:
  
  def _get_similarity(pred: str, true: str) -> float:
      if pred == true:
-         return 1.0
+         return 0.999
      key1 = f"{pred},{true}"
      key2 = f"{true},{pred}"
      return _RAW_SIM.get(key1, _RAW_SIM.get(key2, 0.0))
@@ -695,7 +695,7 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
      Wrong family = 0.0
      """
      if action.action_type != "classify_root_cause":
-         return 0.0
+         return 0.001
  
      predicted = action.argument.strip().upper()
  
@@ -703,7 +703,7 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
      predicted = predicted.replace(" ", "-")  # "OD Brit" → "OD-Brit"
  
      if predicted not in VALID_CATEGORIES:
-         return 0.0  # invalid category string
+         return 0.001  # invalid category string
  
      # Take primary category from dataset (first if semicolon-separated)
      true_category = str(task.get("category", "")).split(";")[0].strip().upper()
@@ -745,11 +745,11 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
      Component C: LLM judge — 0.40 weight
      """
      if action.action_type != "propose_fix":
-         return 0.0
+         return 0.001
  
      proposed_fix = action.argument.strip()
      if not proposed_fix:
-         return 0.0
+         return 0.001
  
      category = str(task.get("category", "")).split(";")[0].strip().upper()
      known_fix = task.get("known_fix_diff", "") or ""
@@ -759,7 +759,7 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
      patterns = EXPECTED_FIX_PATTERNS.get(category, [])
      if patterns:
          matches = sum(1 for p in patterns if p in proposed_fix)
-         pattern_score = min(1.0, matches / max(1, len(patterns) * 0.4))
+         pattern_score = min(0.999, matches / max(1, len(patterns) * 0.4))
      else:
          pattern_score = 0.5
  
@@ -770,7 +770,7 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
      judge_score = _llm_judge(proposed_fix, known_fix, category, test_code)
  
      total = (0.35 * pattern_score) + (0.25 * apply_score) + (0.40 * judge_score)
-     return round(min(1.0, max(0.0, total)), 4)
+     return round(min(0.999, max(0.001, total)), 4)
  
  
  def _check_diff_applies(fix: str, task: dict) -> float:
@@ -791,7 +791,7 @@ def _check_diff_applies(fix: str, task: dict) -> float:
              capture_output=True, text=True, timeout=10
          )
          os.unlink(patch_path)
-         return 1.0 if result.returncode == 0 else 0.0
+         return 0.999 if result.returncode == 0 else 0.001
      except Exception:
          return 0.3  # can't verify, neutral
  
@@ -905,7 +905,7 @@ description: >
  
  observation_type: FlakySleuthObservation
  action_type: FlakySleuthAction
- reward_range: [0.0, 1.0]
+ reward_range: (0.001, 0.999)
  
  tasks:
    - id: task1_classify
@@ -1176,7 +1176,7 @@ DAY 3 — Graders
  □ Implement graders/task2_grader.py + verify similarity matrix
  □ Implement graders/task3_grader.py (pattern + diff + LLM judge)
  □ Unit test all 3 graders with hardcoded inputs
- □ Verify scores are always in [0.0, 1.0]
+ □ Verify scores are always in (0.001, 0.999)
  
  DAY 4 — Server + Spec Compliance
  ──────────────────────────────────
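
The rescaled per-step progress rewards keep the same ordering (test file > any Python file > other files) at roughly 1/40 of the old magnitude. A standalone sketch of the lookup, assuming the simplified signature below (the build plan's method also takes `self`):

```python
def file_read_reward(filepath: str, test_file: str) -> float:
    # Rescaled progress rewards from the build plan diff; the ordering
    # is preserved, only the magnitudes shrink.
    if test_file and test_file in filepath:
        return 0.0017  # reading the actual test file
    if filepath.endswith(".py"):
        return 0.0013  # any python file
    return 0.0011      # non-python file (requirements, config, etc.)
```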
graders/__init__.py CHANGED
@@ -14,4 +14,4 @@ def grade_action(action: FlakySleuthAction, task: dict) -> float:
          return grade_t2(action, task)
      if task_type == "fix_proposal":
          return grade_t3(action, task)
-     return 0.0
+     return 0.001
graders/task1_grader.py CHANGED
@@ -6,11 +6,11 @@ from env.models import FlakySleuthAction
  def grade(action: FlakySleuthAction, task: dict) -> float:
      """Binary classification: flaky or stable. Exact match only."""
      if action.action_type != "classify_flakiness":
-         return 0.0
+         return 0.001
  
      predicted = action.argument.strip().lower()
      if predicted not in ("flaky", "stable"):
-         return 0.0
+         return 0.001
  
      ground_truth = str(task.get("label", "flaky")).strip().lower() or "flaky"
-     return 1.0 if predicted == ground_truth else 0.0
+     return 0.999 if predicted == ground_truth else 0.001
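
With these bounds the grader's output set is exactly {0.001, 0.999}. A small sketch of the truth normalization it relies on, assuming the same fallback chain as the line above:

```python
def normalize_label(task: dict) -> str:
    # Mirrors the grader: a missing label defaults to "flaky", and a
    # whitespace-only label also falls back to "flaky" via the `or`.
    return str(task.get("label", "flaky")).strip().lower() or "flaky"

assert normalize_label({}) == "flaky"
assert normalize_label({"label": " STABLE "}) == "stable"
assert normalize_label({"label": ""}) == "flaky"
```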
graders/task2_grader.py CHANGED
@@ -36,7 +36,7 @@ def _normalize_category(value: str) -> str:
  
  def _get_similarity(predicted: str, truth: str) -> float:
      if predicted == truth:
-         return 1.0
+         return 0.999
      key_a = f"{predicted},{truth}"
      key_b = f"{truth},{predicted}"
      return float(_RAW_SIM.get(key_a, _RAW_SIM.get(key_b, 0.0)))
@@ -45,15 +45,16 @@ def _get_similarity(predicted: str, truth: str) -> float:
  def grade(action: FlakySleuthAction, task: dict) -> float:
      """Root cause category classification with matrix-based partial credit."""
      if action.action_type != "classify_root_cause":
-         return 0.0
+         return 0.001
  
      predicted = _normalize_category(action.argument)
      if predicted not in VALID_CATEGORIES:
-         return 0.0
+         return 0.001
  
      raw_truth = str(task.get("category", "")).split(";")[0]
      truth = _normalize_category(raw_truth)
      if truth not in VALID_CATEGORIES:
-         return 0.0
+         return 0.001
  
-     return _get_similarity(predicted, truth)
+     sim = _get_similarity(predicted, truth)
+     return max(0.001, min(0.999, sim))
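
Because the similarity matrix stores each pair once, the lookup tries both key orders before defaulting to 0.0; the new final clamp then lifts that default to 0.001. A self-contained sketch with an illustrative matrix entry (the real values live in `dataset/category_similarity.json`):

```python
_RAW_SIM = {"OD,OD-Brit": 0.6}  # illustrative entry, not real data

def get_similarity(predicted: str, truth: str) -> float:
    if predicted == truth:
        return 0.999
    # The matrix is stored one-directional, so try both orderings.
    key_a = f"{predicted},{truth}"
    key_b = f"{truth},{predicted}"
    return float(_RAW_SIM.get(key_a, _RAW_SIM.get(key_b, 0.0)))

# grade() clamps the looked-up value into the new bounds:
assert max(0.001, min(0.999, get_similarity("OD-Brit", "OD"))) == 0.6
assert max(0.001, min(0.999, get_similarity("NOD", "OD"))) == 0.001  # missing pair
```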
graders/task3_grader.py CHANGED
@@ -30,11 +30,11 @@ EXPECTED_FIX_PATTERNS = {
  def grade(action: FlakySleuthAction, task: dict) -> float:
      """Hybrid fixer grader: pattern + dry-run apply + LLM judge."""
      if action.action_type != "propose_fix":
-         return 0.0
+         return 0.001
  
      proposed_fix = action.argument.strip()
      if not proposed_fix:
-         return 0.0
+         return 0.001
  
      category = str(task.get("category", "")).split(";")[0].strip().upper()
      known_fix = task.get("known_fix_diff", "") or ""
@@ -45,7 +45,7 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
          matches = sum(
              1 for pattern in patterns if pattern.lower() in proposed_fix.lower()
          )
-         pattern_score = min(1.0, matches / max(1, len(patterns) * 0.4))
+         pattern_score = min(0.999, matches / max(1, len(patterns) * 0.4))
      else:
          pattern_score = 0.5
  
@@ -53,12 +53,12 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
      judge_score = _llm_judge(proposed_fix, known_fix, category, test_code)
  
      total = (0.35 * pattern_score) + (0.25 * apply_score) + (0.40 * judge_score)
-     return round(min(1.0, max(0.0, total)), 4)
+     return round(min(0.999, max(0.001, total)), 4)
  
  
  def _check_diff_applies(diff_text: str, task: dict) -> float:
      if "+++" not in diff_text or "---" not in diff_text:
-         return 0.0
+         return 0.001
  
      repo_root = str(task.get("sandbox_root", "")).strip()
      if not repo_root or not Path(repo_root).exists():
@@ -79,7 +79,7 @@ def _check_diff_applies(diff_text: str, task: dict) -> float:
              text=True,
              timeout=10,
          )
-         return 1.0 if result.returncode == 0 else 0.0
+         return 0.999 if result.returncode == 0 else 0.001
      except Exception:
          return 0.3
      finally:
@@ -156,6 +156,7 @@ Respond ONLY with JSON:
          raw = raw.replace("```json", "").replace("```", "").strip()
          payload = json.loads(raw)
          score = int(payload.get("score", 5))
-         return max(0.0, min(10.0, score)) / 10.0
+         raw_score = max(0.0, min(10.0, score)) / 10.0
+         return max(0.001, min(0.999, raw_score))
      except Exception:
          return 0.5
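
The judge change splits the old one-liner so the 0-10 integer is first scaled to the unit interval and then squeezed into the soft bounds. A minimal sketch of that tail, isolated from the LLM call:

```python
def judge_to_score(score: int) -> float:
    # Mirrors the updated _llm_judge tail: clamp the 0-10 integer,
    # scale to 0-1, then apply the new soft bounds.
    raw_score = max(0.0, min(10.0, score)) / 10.0
    return max(0.001, min(0.999, raw_score))

assert judge_to_score(10) == 0.999  # a perfect judgment no longer yields 1.0
assert judge_to_score(0) == 0.001
assert judge_to_score(7) == 0.7
```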
inference.py CHANGED
@@ -790,7 +790,7 @@ def run_episode(
      _compliance_log_end(
          success=success,
          steps=steps_taken,
-         score=min(max(final_episode_score, 0.0), 1.0),
+         score=min(max(final_episode_score, 0.001), 0.999),
          rewards=rewards,
      )
  
inference_compliance.py CHANGED
@@ -173,7 +173,7 @@ async def main() -> None:
              break
  
      score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
-     score = min(max(score, 0.0), 1.0)  # clamp to [0, 1]
+     score = min(max(score, 0.001), 0.999)  # clamp to (0.001, 0.999)
      success = score >= SUCCESS_SCORE_THRESHOLD
  
      finally:
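
The same clamp now guards the compliance score, so a fully failed episode reports 0.001 and a saturated one 0.999. A standalone sketch of the scoring step, assuming `MAX_TOTAL_REWARD` plays the role of the script's configured normalizer:

```python
def compliance_score(rewards: list[float], max_total_reward: float) -> float:
    # Normalize accumulated rewards, then clamp into the soft bounds
    # so the reported score never sits exactly at 0.0 or 1.0.
    score = sum(rewards) / max_total_reward if max_total_reward > 0 else 0.0
    return min(max(score, 0.001), 0.999)

assert compliance_score([], 5.0) == 0.001     # no reward collected
assert compliance_score([5.0], 5.0) == 0.999  # saturated episode
```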
inference_debug.py CHANGED
@@ -783,7 +783,7 @@ def run_episode(
      _compliance_log_end(
          success=success,
          steps=steps_taken,
-         score=min(max(final_episode_score, 0.0), 1.0),
+         score=min(max(final_episode_score, 0.001), 0.999),
          rewards=rewards,
      )
  
openenv.yaml CHANGED
@@ -13,7 +13,7 @@ description: >
  
  action_type: FlakySleuthAction
  observation_type: FlakySleuthObservation
- reward_range: [0.0, 1.0]
+ reward_range: (0.001, 0.999)
  episode_max_steps: 20
  baseline_script: inference.py
  