Upload folder using huggingface_hub
- Dockerfile +1 -0
- GRADING.md +17 -17
- env/environment.py +3 -3
- flakysleuth_build_plan.md +20 -20
- graders/__init__.py +1 -1
- graders/task1_grader.py +3 -3
- graders/task2_grader.py +6 -5
- graders/task3_grader.py +8 -7
- inference.py +1 -1
- inference_compliance.py +1 -1
- inference_debug.py +1 -1
- openenv.yaml +1 -1
Dockerfile
CHANGED
@@ -14,4 +14,5 @@ COPY . .

 EXPOSE 8000

+ENV ENABLE_WEB_INTERFACE=true
 CMD ["python", "-m", "server.app"]
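
The added ENV line gates the web UI at runtime. A minimal sketch of how `server.app` might consume it — only the `ENABLE_WEB_INTERFACE` name comes from the Dockerfile; the helper and the accepted truthy values are assumptions:

```python
import os

def web_interface_enabled() -> bool:
    # Hypothetical helper: ENABLE_WEB_INTERFACE is set in the Dockerfile above;
    # treating "true"/"1"/"yes" as truthy is an assumption, not repo behavior.
    value = os.environ.get("ENABLE_WEB_INTERFACE", "false")
    return value.strip().lower() in ("true", "1", "yes")
```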
GRADING.md
CHANGED
@@ -105,10 +105,10 @@ progress = max(-0.25, base_reward - spam_penalty)
 Binary exact-match scorer:

 ```text
-if action_type != "classify_flakiness": return 0.0
-if predicted not in {"flaky","stable"}: return 0.0
+if action_type != "classify_flakiness": return 0.001
+if predicted not in {"flaky","stable"}: return 0.001
 truth = task["label"] (default "flaky")
-terminal_score = 1.0 if predicted == truth else 0.0
+terminal_score = 0.999 if predicted == truth else 0.001
 ```

 Notes:

@@ -130,7 +130,7 @@ Prediction and truth are normalized by:
 - `OD-VIC` -> `OD-Vic`
 - etc.

-If normalized value is not in valid set, score is `0.0`.
+If normalized value is not in valid set, score is `0.001`.

 Truth category is the **first** category if semicolon-separated:

@@ -141,8 +141,8 @@ raw_truth = str(task["category"]).split(";")[0]
 ### 5.2 Similarity scoring

 ```text
-if predicted == truth: return 1.0
-else return similarity[predicted,truth] or similarity[truth,predicted] or 0.0
+if predicted == truth: return 0.999
+else return clamp(similarity[predicted,truth] or similarity[truth,predicted] or 0.0, 0.001, 0.999)
 ```

 The similarity matrix is loaded from `dataset/category_similarity.json`.
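
A small sketch of the lookup convention: keys are `"PRED,TRUTH"` strings stored in one direction, both orderings are tried, and missing pairs default to `0.0`. The matrix entries below are illustrative only, not the real `dataset/category_similarity.json` contents:

```python
import json

# Illustrative entries; real values live in dataset/category_similarity.json.
_RAW_SIM = json.loads('{"OD,OD-Vic": 0.6, "OD,NOD": 0.2}')

def get_similarity(pred: str, truth: str) -> float:
    if pred == truth:
        return 0.999
    # Pairs are stored one way, so try both key orderings.
    return float(_RAW_SIM.get(f"{pred},{truth}", _RAW_SIM.get(f"{truth},{pred}", 0.0)))

print(get_similarity("OD-Vic", "OD"))   # 0.6, found via the reversed key
print(get_similarity("OD", "UNKNOWN"))  # 0.0, missing pair defaults to 0.0
```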

@@ -171,11 +171,11 @@ Any missing pair defaults to `0.0`.
 Hybrid weighted scorer:

 ```text
-if action_type != "propose_fix": return 0.0
-if proposed_fix is empty: return 0.0
+if action_type != "propose_fix": return 0.001
+if proposed_fix is empty: return 0.001

 total = 0.35 * pattern_score + 0.25 * apply_score + 0.40 * judge_score
-terminal_score = round(clamp(total, 0.0, 1.0), 4)
+terminal_score = round(clamp(total, 0.001, 0.999), 4)
 ```

 ### 6.1 `pattern_score`
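
A worked instance of the weighting, using the neutral fallbacks the scorer defines elsewhere (no pattern list → 0.5, unverifiable diff → 0.3, judge exception → 0.5); the combination is assumed for illustration:

```python
pattern_score, apply_score, judge_score = 0.5, 0.3, 0.5  # assumed neutral defaults
total = 0.35 * pattern_score + 0.25 * apply_score + 0.40 * judge_score
terminal_score = round(min(0.999, max(0.001, total)), 4)
print(terminal_score)  # 0.45
```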

@@ -186,7 +186,7 @@ For category with pattern list:

 ```text
 matches = number of patterns found (case-insensitive substring)
-pattern_score = min(1.0, matches / max(1, len(patterns) * 0.4))
+pattern_score = min(0.999, matches / max(1, len(patterns) * 0.4))
 ```

 If category has no pattern list:
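
The saturation point is worth a worked check: because the denominator is `len(patterns) * 0.4`, matching 40% of a category's patterns already reaches the cap (the counts below are illustrative):

```python
# Illustrative: 5 expected patterns for the category.
print(min(0.999, 2 / max(1, 5 * 0.4)))  # 0.999 — two of five matches saturate
print(min(0.999, 1 / max(1, 5 * 0.4)))  # 0.5  — one match earns half credit
```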

@@ -202,11 +202,11 @@ Current pattern lists:
 ### 6.2 `apply_score` (`_check_diff_applies`)

 ```text
-if diff does not contain both '---' and '+++': return 0.0
+if diff does not contain both '---' and '+++': return 0.001
 if sandbox_root missing or not existing: return 0.3
 else run: patch --dry-run -p1 -i <temp_patch>
-return 1.0 if patch exit code == 0
-return 0.0 otherwise
+return 0.999 if patch exit code == 0
+return 0.001 otherwise
 on exception: return 0.3
 ```
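
A self-contained sketch of the dry-run check described above; the thresholds and the `patch --dry-run -p1 -i` invocation come from the pseudocode, while the temp-file handling and `cwd` choice are assumptions:

```python
import os
import subprocess
import tempfile
from pathlib import Path

def apply_score(diff_text: str, sandbox_root: str) -> float:
    if "---" not in diff_text or "+++" not in diff_text:
        return 0.001
    if not sandbox_root or not Path(sandbox_root).exists():
        return 0.3  # no sandbox checkout, cannot verify
    fd, patch_path = tempfile.mkstemp(suffix=".patch")
    try:
        with os.fdopen(fd, "w") as handle:
            handle.write(diff_text)
        result = subprocess.run(
            ["patch", "--dry-run", "-p1", "-i", patch_path],
            cwd=sandbox_root, capture_output=True, text=True, timeout=10,
        )
        return 0.999 if result.returncode == 0 else 0.001
    except Exception:
        return 0.3  # can't verify, neutral
    finally:
        os.unlink(patch_path)
```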

@@ -237,22 +237,22 @@ API/model resolution in judge:
 ### Example A: Task 1 correct classify early

 - `cumulative_progress = 0.05`
-- `terminal_score = 1.0`
+- `terminal_score = 0.999`
 - `late_penalty = 0.0`
 - `wrong_dir_penalty = 0.0`

 ```text
-reward = clamp(0.05 + 1.0 - 0 - 0, 0, 1) = 1.0
+reward = clamp(0.05 + 0.999 - 0 - 0, 0, 1) = 0.999
 ```

 ### Example B: Task 2 wrong category but some exploration

 - `cumulative_progress = 0.05`
-- `terminal_score = 0.0` (no similarity match)
+- `terminal_score = 0.001` (no similarity match)
 - penalties = `0`

 ```text
-reward = clamp(0.05 + 0.0, 0, 1) = 0.05
+reward = clamp(0.05 + 0.001, 0, 1) = 0.051
 ```

 ### Example C: Task 3 with weak fix and no API key
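
Both examples replay directly with a stand-in clamp helper (using the env's actual 0.001/0.999 bounds):

```python
def clamp(x: float, lo: float, hi: float) -> float:
    return min(hi, max(lo, x))

# Example A: terminal_score 0.999, no penalties.
print(clamp(0.05 + 0.999 - 0.0 - 0.0, 0.001, 0.999))  # 0.999 (ceiling)
# Example B: terminal_score 0.001, no penalties.
print(round(clamp(0.05 + 0.001, 0.001, 0.999), 4))    # 0.051
```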
env/environment.py
CHANGED
@@ -95,9 +95,9 @@ class FlakySleuthEnv:
         wrong_dir_penalty = 0.2

         reward = min(
-            1.0,
+            0.999,
             max(
-                0.0,
+                0.001,
                 self.cumulative_progress + terminal_score - late_penalty - wrong_dir_penalty,
             ),
         )

@@ -117,7 +117,7 @@ class FlakySleuthEnv:
         if not done and self.step_count >= self.max_steps:
             done = True
             info = {
-                "terminal_score": 0.0,
+                "terminal_score": 0.001,
                 "progress_score": self.cumulative_progress,
                 "late_penalty": max(0, self.step_count - 15) * 0.05,
                 "timeout": True,
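
For the timeout branch, the late penalty grows 0.05 per step past 15, so with `episode_max_steps: 20` (from openenv.yaml) a timed-out run carries the maximum penalty. A quick check, with the cumulative progress value assumed:

```python
step_count = 20  # episode_max_steps from openenv.yaml
late_penalty = max(0, step_count - 15) * 0.05
print(late_penalty)  # 0.25
# Reward for a timed-out episode (terminal_score 0.001, progress assumed 0.05):
print(min(0.999, max(0.001, 0.05 + 0.001 - 0.25)))  # 0.001 — clamped at the floor
```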
flakysleuth_build_plan.md
CHANGED
@@ -527,7 +527,7 @@ class FlakySleuthEnv:
             and self.current_task.get("label") == "flaky"):
         wrong_dir_penalty = 0.2

-    reward = min(1.0, max(0.0,
+    reward = min(0.999, max(0.001,
         self.cumulative_progress + terminal_score
         - late_penalty - wrong_dir_penalty
     ))

@@ -594,16 +594,16 @@ class FlakySleuthEnv:
     test_file = task.get("test_file", "")

     if test_file and test_file in filepath:
-        return 0.
+        return 0.0017  # reading the actual test file
     if any(filepath.endswith(ext) for ext in (".py",)):
-        return 0.
-        return 0.
+        return 0.0013  # any python file
+        return 0.0011  # non-python file (requirements, config, etc.)

 def _search_relevance_reward(self, pattern: str) -> float:
     pattern_lower = pattern.lower()
     if any(sig in pattern_lower for sig in FLAKY_SIGNAL_PATTERNS):
-        return 0.
-        return 0.
+        return 0.0014  # searching for known flakiness signals
+        return 0.0011  # generic search

 def _make_obs(self, tool_output=None) -> FlakySleuthObservation:
     task = self.current_task

@@ -639,7 +639,7 @@ def grade_action(action: FlakySleuthAction, task: dict) -> float:
         return grade_t2(action, task)
     elif tt == "fix_proposal":
         return grade_t3(action, task)
-    return 0.0
+    return 0.001
 ```

 ### 7.2 Task 1 Grader (`graders/task1_grader.py`)

@@ -650,16 +650,16 @@ from env.models import FlakySleuthAction
 def grade(action: FlakySleuthAction, task: dict) -> float:
     """Binary classification: flaky or stable. Exact match only."""
     if action.action_type != "classify_flakiness":
-        return 0.0
+        return 0.001

     predicted = action.argument.strip().lower()
     if predicted not in ("flaky", "stable"):
-        return 0.0
+        return 0.001

     # All IDoFT rows are flaky; stable examples are synthetically added
     # with label="stable" during dataset construction
     ground_truth = task.get("label", "flaky")
-    return 1.0 if predicted == ground_truth else 0.0
+    return 0.999 if predicted == ground_truth else 0.0
 ```

 ### 7.3 Task 2 Grader (`graders/task2_grader.py`)

@@ -677,7 +677,7 @@ with open(_SIM_PATH) as f:

 def _get_similarity(pred: str, true: str) -> float:
     if pred == true:
-        return 1.0
+        return 0.999
     key1 = f"{pred},{true}"
     key2 = f"{true},{pred}"
     return _RAW_SIM.get(key1, _RAW_SIM.get(key2, 0.0))

@@ -695,7 +695,7 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
     Wrong family = 0.0
     """
     if action.action_type != "classify_root_cause":
-        return 0.0
+        return 0.001

     predicted = action.argument.strip().upper()

@@ -703,7 +703,7 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
     predicted = predicted.replace(" ", "-")  # "OD Brit" → "OD-Brit"

     if predicted not in VALID_CATEGORIES:
-        return 0.0
+        return 0.001  # invalid category string

     # Take primary category from dataset (first if semicolon-separated)
     true_category = str(task.get("category", "")).split(";")[0].strip().upper()

@@ -745,11 +745,11 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
     Component C: LLM judge — 0.40 weight
     """
     if action.action_type != "propose_fix":
-        return 0.0
+        return 0.001

     proposed_fix = action.argument.strip()
     if not proposed_fix:
-        return 0.0
+        return 0.001

     category = str(task.get("category", "")).split(";")[0].strip().upper()
     known_fix = task.get("known_fix_diff", "") or ""

@@ -759,7 +759,7 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
     patterns = EXPECTED_FIX_PATTERNS.get(category, [])
     if patterns:
         matches = sum(1 for p in patterns if p in proposed_fix)
-        pattern_score = min(1.0, matches / max(1, len(patterns) * 0.4))
+        pattern_score = min(0.999, matches / max(1, len(patterns) * 0.4))
     else:
         pattern_score = 0.5

@@ -770,7 +770,7 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
     judge_score = _llm_judge(proposed_fix, known_fix, category, test_code)

     total = (0.35 * pattern_score) + (0.25 * apply_score) + (0.40 * judge_score)
-    return round(min(1.0, max(0.0, total)), 4)
+    return round(min(0.999, max(0.001, total)), 4)


 def _check_diff_applies(fix: str, task: dict) -> float:

@@ -791,7 +791,7 @@ def _check_diff_applies(fix: str, task: dict) -> float:
         capture_output=True, text=True, timeout=10
     )
     os.unlink(patch_path)
-    return 1.0 if result.returncode == 0 else 0.0
+    return 0.999 if result.returncode == 0 else 0.0
 except Exception:
     return 0.3  # can't verify, neutral

@@ -905,7 +905,7 @@ description: >

 observation_type: FlakySleuthObservation
 action_type: FlakySleuthAction
-reward_range: (0.0, 1.0)
+reward_range: (0.001, 0.999)

 tasks:
   - id: task1_classify

@@ -1176,7 +1176,7 @@ DAY 3 — Graders
 □ Implement graders/task2_grader.py + verify similarity matrix
 □ Implement graders/task3_grader.py (pattern + diff + LLM judge)
 □ Unit test all 3 graders with hardcoded inputs
-□ Verify scores are always in [0.0, 1.0]
+□ Verify scores are always in (0.001, 0.999)

 DAY 4 — Server + Spec Compliance
 ──────────────────────────────────
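
For the Day 3 item "Verify scores are always in (0.001, 0.999)", a minimal pytest sketch; `FakeAction` and the task dictionaries are hypothetical stand-ins for `env.models.FlakySleuthAction` and real dataset rows:

```python
# test_score_bounds.py — sketch only; FakeAction duck-types the fields
# the graders actually read (action_type, argument).
from dataclasses import dataclass

import pytest

from graders import task1_grader, task2_grader, task3_grader

@dataclass
class FakeAction:
    action_type: str
    argument: str

@pytest.mark.parametrize("grader, action, task", [
    (task1_grader, FakeAction("classify_flakiness", "flaky"), {"label": "flaky"}),
    (task1_grader, FakeAction("classify_flakiness", "maybe"), {"label": "flaky"}),
    (task2_grader, FakeAction("classify_root_cause", "OD-Vic"), {"category": "OD-Vic"}),
    (task3_grader, FakeAction("propose_fix", ""), {"category": "OD"}),
])
def test_scores_stay_inside_bounds(grader, action, task):
    score = grader.grade(action, task)
    assert 0.001 <= score <= 0.999
```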
graders/__init__.py
CHANGED
@@ -14,4 +14,4 @@ def grade_action(action: FlakySleuthAction, task: dict) -> float:
         return grade_t2(action, task)
     if task_type == "fix_proposal":
         return grade_t3(action, task)
-    return 0.0
+    return 0.001
graders/task1_grader.py
CHANGED
@@ -6,11 +6,11 @@ from env.models import FlakySleuthAction
 def grade(action: FlakySleuthAction, task: dict) -> float:
     """Binary classification: flaky or stable. Exact match only."""
     if action.action_type != "classify_flakiness":
-        return 0.0
+        return 0.001

     predicted = action.argument.strip().lower()
     if predicted not in ("flaky", "stable"):
-        return 0.0
+        return 0.001

     ground_truth = str(task.get("label", "flaky")).strip().lower() or "flaky"
-    return 1.0 if predicted == ground_truth else 0.0
+    return 0.999 if predicted == ground_truth else 0.001
graders/task2_grader.py
CHANGED
@@ -36,7 +36,7 @@ def _normalize_category(value: str) -> str:

 def _get_similarity(predicted: str, truth: str) -> float:
     if predicted == truth:
-        return 1.0
+        return 0.999
     key_a = f"{predicted},{truth}"
     key_b = f"{truth},{predicted}"
     return float(_RAW_SIM.get(key_a, _RAW_SIM.get(key_b, 0.0)))

@@ -45,15 +45,16 @@ def _get_similarity(predicted: str, truth: str) -> float:
 def grade(action: FlakySleuthAction, task: dict) -> float:
     """Root cause category classification with matrix-based partial credit."""
     if action.action_type != "classify_root_cause":
-        return 0.0
+        return 0.001

     predicted = _normalize_category(action.argument)
     if predicted not in VALID_CATEGORIES:
-        return 0.0
+        return 0.001

     raw_truth = str(task.get("category", "")).split(";")[0]
     truth = _normalize_category(raw_truth)
     if truth not in VALID_CATEGORIES:
-        return 0.0
+        return 0.001

-    return _get_similarity(predicted, truth)
+    sim = _get_similarity(predicted, truth)
+    return max(0.001, min(0.999, sim))
graders/task3_grader.py
CHANGED
@@ -30,11 +30,11 @@ EXPECTED_FIX_PATTERNS = {
 def grade(action: FlakySleuthAction, task: dict) -> float:
     """Hybrid fixer grader: pattern + dry-run apply + LLM judge."""
     if action.action_type != "propose_fix":
-        return 0.0
+        return 0.001

     proposed_fix = action.argument.strip()
     if not proposed_fix:
-        return 0.0
+        return 0.001

     category = str(task.get("category", "")).split(";")[0].strip().upper()
     known_fix = task.get("known_fix_diff", "") or ""

@@ -45,7 +45,7 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
     matches = sum(
         1 for pattern in patterns if pattern.lower() in proposed_fix.lower()
     )
-    pattern_score = min(1.0, matches / max(1, len(patterns) * 0.4))
+    pattern_score = min(0.999, matches / max(1, len(patterns) * 0.4))
 else:
     pattern_score = 0.5

@@ -53,12 +53,12 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
     judge_score = _llm_judge(proposed_fix, known_fix, category, test_code)

     total = (0.35 * pattern_score) + (0.25 * apply_score) + (0.40 * judge_score)
-    return round(min(1.0, max(0.0, total)), 4)
+    return round(min(0.999, max(0.001, total)), 4)


 def _check_diff_applies(diff_text: str, task: dict) -> float:
     if "+++" not in diff_text or "---" not in diff_text:
-        return 0.0
+        return 0.001

     repo_root = str(task.get("sandbox_root", "")).strip()
     if not repo_root or not Path(repo_root).exists():

@@ -79,7 +79,7 @@ def _check_diff_applies(diff_text: str, task: dict) -> float:
         text=True,
         timeout=10,
     )
-    return 1.0 if result.returncode == 0 else 0.0
+    return 0.999 if result.returncode == 0 else 0.001
 except Exception:
     return 0.3
 finally:

@@ -156,6 +156,7 @@ Respond ONLY with JSON:
     raw = raw.replace("```json", "").replace("```", "").strip()
     payload = json.loads(raw)
     score = int(payload.get("score", 5))
-    return max(0.0, min(10.0, score)) / 10.0
+    raw_score = max(0.0, min(10.0, score)) / 10.0
+    return max(0.001, min(0.999, raw_score))
 except Exception:
     return 0.5
inference.py
CHANGED
@@ -790,7 +790,7 @@ def run_episode(
     _compliance_log_end(
         success=success,
         steps=steps_taken,
-        score=min(max(final_episode_score, 0.0), 1.0),
+        score=min(max(final_episode_score, 0.001), 0.999),
         rewards=rewards,
     )
inference_compliance.py
CHANGED
@@ -173,7 +173,7 @@ async def main() -> None:
                 break

         score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
-        score = min(max(score, 0.0), 1.0)
+        score = min(max(score, 0.001), 0.999)  # clamp to (0.001, 0.999)
         success = score >= SUCCESS_SCORE_THRESHOLD

     finally:
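
Numerically, the normalization plus clamp behaves like this; the reward values and `MAX_TOTAL_REWARD` below are assumed for illustration (the real constant lives in inference_compliance.py):

```python
MAX_TOTAL_REWARD = 1.2                 # assumed budget, for illustration only
rewards = [0.0014, 0.0017, 0.999]      # assumed per-step rewards from one episode

score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
score = min(max(score, 0.001), 0.999)  # clamp to (0.001, 0.999)
print(round(score, 4))                 # 0.8351
```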
inference_debug.py
CHANGED
@@ -783,7 +783,7 @@ def run_episode(
     _compliance_log_end(
         success=success,
         steps=steps_taken,
-        score=min(max(final_episode_score, 0.0), 1.0),
+        score=min(max(final_episode_score, 0.001), 0.999),
         rewards=rewards,
     )
openenv.yaml
CHANGED
@@ -13,7 +13,7 @@ description: >

 action_type: FlakySleuthAction
 observation_type: FlakySleuthObservation
-reward_range: (0.0, 1.0)
+reward_range: (0.001, 0.999)
 episode_max_steps: 20
 baseline_script: inference.py