Upload folder using huggingface_hub
- Dockerfile +1 -0
- GRADING.md +17 -17
- env/environment.py +3 -3
- flakysleuth_build_plan.md +20 -20
- graders/__init__.py +1 -1
- graders/task1_grader.py +3 -3
- graders/task2_grader.py +6 -5
- graders/task3_grader.py +8 -7
- inference.py +1 -1
- inference_compliance.py +1 -1
- inference_debug.py +1 -1
- openenv.yaml +1 -1
Dockerfile
CHANGED
@@ -14,4 +14,5 @@ COPY . .

 EXPOSE 8000

+ENV ENABLE_WEB_INTERFACE=true
 CMD ["python", "-m", "server.app"]
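
The added ENV line gates the web UI at runtime. A minimal sketch of how `server.app` might consume it — only the `ENABLE_WEB_INTERFACE` name comes from the Dockerfile; the helper and the accepted truthy values are assumptions:

```python
import os

def web_interface_enabled() -> bool:
    # Hypothetical helper: ENABLE_WEB_INTERFACE is set in the Dockerfile above;
    # treating "true"/"1"/"yes" as truthy is an assumption, not repo behavior.
    value = os.environ.get("ENABLE_WEB_INTERFACE", "false")
    return value.strip().lower() in ("true", "1", "yes")
```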
GRADING.md
CHANGED
@@ -105,10 +105,10 @@ progress = max(-0.25, base_reward - spam_penalty)
 Binary exact-match scorer:

 ```text
-if action_type != "classify_flakiness": return 0.0
-if predicted not in {"flaky","stable"}: return 0.0
+if action_type != "classify_flakiness": return 0.001
+if predicted not in {"flaky","stable"}: return 0.001
 truth = task["label"] (default "flaky")
-terminal_score = 1.0 if predicted == truth else 0.0
+terminal_score = 0.999 if predicted == truth else 0.001
 ```

 Notes:

@@ -130,7 +130,7 @@ Prediction and truth are normalized by:
 - `OD-VIC` -> `OD-Vic`
 - etc.

-If normalized value is not in valid set, score is `0.0`.
+If normalized value is not in valid set, score is `0.001`.

 Truth category is the **first** category if semicolon-separated:

@@ -141,8 +141,8 @@ raw_truth = str(task["category"]).split(";")[0]
 ### 5.2 Similarity scoring

 ```text
-if predicted == truth: return 1.0
-else return similarity[predicted,truth] or similarity[truth,predicted] or 0.0
+if predicted == truth: return 0.999
+else return clamp(similarity[predicted,truth] or similarity[truth,predicted] or 0.0, 0.001, 0.999)
 ```

 The similarity matrix is loaded from `dataset/category_similarity.json`.
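
A small sketch of the lookup convention: keys are `"PRED,TRUTH"` strings stored in one direction, both orderings are tried, and missing pairs default to `0.0`. The matrix entries below are illustrative only, not the real `dataset/category_similarity.json` contents:

```python
import json

# Illustrative entries; real values live in dataset/category_similarity.json.
_RAW_SIM = json.loads('{"OD,OD-Vic": 0.6, "OD,NOD": 0.2}')

def get_similarity(pred: str, truth: str) -> float:
    if pred == truth:
        return 0.999
    # Pairs are stored one way, so try both key orderings.
    return float(_RAW_SIM.get(f"{pred},{truth}", _RAW_SIM.get(f"{truth},{pred}", 0.0)))

print(get_similarity("OD-Vic", "OD"))   # 0.6, found via the reversed key
print(get_similarity("OD", "UNKNOWN"))  # 0.0, missing pair defaults to 0.0
```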

@@ -171,11 +171,11 @@ Any missing pair defaults to `0.0`.
 Hybrid weighted scorer:

 ```text
-if action_type != "propose_fix": return 0.0
-if proposed_fix is empty: return 0.0
+if action_type != "propose_fix": return 0.001
+if proposed_fix is empty: return 0.001

 total = 0.35 * pattern_score + 0.25 * apply_score + 0.40 * judge_score
-terminal_score = round(clamp(total, 0.0, 1.0), 4)
+terminal_score = round(clamp(total, 0.001, 0.999), 4)
 ```

 ### 6.1 `pattern_score`
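
A worked instance of the weighting, using the neutral fallbacks the scorer defines elsewhere (no pattern list → 0.5, unverifiable diff → 0.3, judge exception → 0.5); the combination is assumed for illustration:

```python
pattern_score, apply_score, judge_score = 0.5, 0.3, 0.5  # assumed neutral defaults
total = 0.35 * pattern_score + 0.25 * apply_score + 0.40 * judge_score
terminal_score = round(min(0.999, max(0.001, total)), 4)
print(terminal_score)  # 0.45
```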

@@ -186,7 +186,7 @@ For category with pattern list:

 ```text
 matches = number of patterns found (case-insensitive substring)
-pattern_score = min(1.0, matches / max(1, len(patterns) * 0.4))
+pattern_score = min(0.999, matches / max(1, len(patterns) * 0.4))
 ```

 If category has no pattern list:
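
The saturation point is worth a worked check: because the denominator is `len(patterns) * 0.4`, matching 40% of a category's patterns already reaches the cap (the counts below are illustrative):

```python
# Illustrative: 5 expected patterns for the category.
print(min(0.999, 2 / max(1, 5 * 0.4)))  # 0.999 — two of five matches saturate
print(min(0.999, 1 / max(1, 5 * 0.4)))  # 0.5  — one match earns half credit
```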

@@ -202,11 +202,11 @@ Current pattern lists:
 ### 6.2 `apply_score` (`_check_diff_applies`)

 ```text
-if diff does not contain both '---' and '+++': return 0.0
+if diff does not contain both '---' and '+++': return 0.001
 if sandbox_root missing or not existing: return 0.3
 else run: patch --dry-run -p1 -i <temp_patch>
-return 1.0 if patch exit code == 0
-return 0.0 otherwise
+return 0.999 if patch exit code == 0
+return 0.001 otherwise
 on exception: return 0.3
 ```
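
A self-contained sketch of the dry-run check described above; the thresholds and the `patch --dry-run -p1 -i` invocation come from the pseudocode, while the temp-file handling and `cwd` choice are assumptions:

```python
import os
import subprocess
import tempfile
from pathlib import Path

def apply_score(diff_text: str, sandbox_root: str) -> float:
    if "---" not in diff_text or "+++" not in diff_text:
        return 0.001
    if not sandbox_root or not Path(sandbox_root).exists():
        return 0.3  # no sandbox checkout, cannot verify
    fd, patch_path = tempfile.mkstemp(suffix=".patch")
    try:
        with os.fdopen(fd, "w") as handle:
            handle.write(diff_text)
        result = subprocess.run(
            ["patch", "--dry-run", "-p1", "-i", patch_path],
            cwd=sandbox_root, capture_output=True, text=True, timeout=10,
        )
        return 0.999 if result.returncode == 0 else 0.001
    except Exception:
        return 0.3  # can't verify, neutral
    finally:
        os.unlink(patch_path)
```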

@@ -237,22 +237,22 @@ API/model resolution in judge:
 ### Example A: Task 1 correct classify early

 - `cumulative_progress = 0.05`
-- `terminal_score = 1.0`
+- `terminal_score = 0.999`
 - `late_penalty = 0.0`
 - `wrong_dir_penalty = 0.0`

 ```text
-reward = clamp(0.05 + 1.0 - 0 - 0, 0, 1) = 1.0
+reward = clamp(0.05 + 0.999 - 0 - 0, 0, 1) = 0.999
 ```

 ### Example B: Task 2 wrong category but some exploration

 - `cumulative_progress = 0.05`
-- `terminal_score = 0.0` (no similarity match)
+- `terminal_score = 0.001` (no similarity match)
 - penalties = `0`

 ```text
-reward = clamp(0.05 + 0.0, 0, 1) = 0.05
+reward = clamp(0.05 + 0.001, 0, 1) = 0.051
 ```

 ### Example C: Task 3 with weak fix and no API key
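
Both examples replay directly with a stand-in clamp helper (using the env's actual 0.001/0.999 bounds):

```python
def clamp(x: float, lo: float, hi: float) -> float:
    return min(hi, max(lo, x))

# Example A: terminal_score 0.999, no penalties.
print(clamp(0.05 + 0.999 - 0.0 - 0.0, 0.001, 0.999))  # 0.999 (ceiling)
# Example B: terminal_score 0.001, no penalties.
print(round(clamp(0.05 + 0.001, 0.001, 0.999), 4))    # 0.051
```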
env/environment.py
CHANGED
@@ -95,9 +95,9 @@ class FlakySleuthEnv:
         wrong_dir_penalty = 0.2

         reward = min(
-            1.0,
+            0.999,
             max(
-                0.0,
+                0.001,
                 self.cumulative_progress + terminal_score - late_penalty - wrong_dir_penalty,
             ),
         )

@@ -117,7 +117,7 @@ class FlakySleuthEnv:
         if not done and self.step_count >= self.max_steps:
             done = True
             info = {
-                "terminal_score": 0.0,
+                "terminal_score": 0.001,
                 "progress_score": self.cumulative_progress,
                 "late_penalty": max(0, self.step_count - 15) * 0.05,
                 "timeout": True,
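
For the timeout branch, the late penalty grows 0.05 per step past 15, so with `episode_max_steps: 20` (from openenv.yaml) a timed-out run carries the maximum penalty. A quick check, with the cumulative progress value assumed:

```python
step_count = 20  # episode_max_steps from openenv.yaml
late_penalty = max(0, step_count - 15) * 0.05
print(late_penalty)  # 0.25
# Reward for a timed-out episode (terminal_score 0.001, progress assumed 0.05):
print(min(0.999, max(0.001, 0.05 + 0.001 - 0.25)))  # 0.001 — clamped at the floor
```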
flakysleuth_build_plan.md
CHANGED
@@ -527,7 +527,7 @@ class FlakySleuthEnv:
             and self.current_task.get("label") == "flaky"):
         wrong_dir_penalty = 0.2

-    reward = min(1.0, max(0.0,
+    reward = min(0.999, max(0.001,
         self.cumulative_progress + terminal_score
         - late_penalty - wrong_dir_penalty
     ))

@@ -594,16 +594,16 @@ class FlakySleuthEnv:
     test_file = task.get("test_file", "")

     if test_file and test_file in filepath:
-        return 0.
+        return 0.0017  # reading the actual test file
     if any(filepath.endswith(ext) for ext in (".py",)):
-        return 0.
-        return 0.
+        return 0.0013  # any python file
+        return 0.0011  # non-python file (requirements, config, etc.)

 def _search_relevance_reward(self, pattern: str) -> float:
     pattern_lower = pattern.lower()
     if any(sig in pattern_lower for sig in FLAKY_SIGNAL_PATTERNS):
-        return 0.
-        return 0.
+        return 0.0014  # searching for known flakiness signals
+        return 0.0011  # generic search

 def _make_obs(self, tool_output=None) -> FlakySleuthObservation:
     task = self.current_task

@@ -639,7 +639,7 @@ def grade_action(action: FlakySleuthAction, task: dict) -> float:
         return grade_t2(action, task)
     elif tt == "fix_proposal":
         return grade_t3(action, task)
-    return 0.0
+    return 0.001
 ```

 ### 7.2 Task 1 Grader (`graders/task1_grader.py`)

@@ -650,16 +650,16 @@ from env.models import FlakySleuthAction
 def grade(action: FlakySleuthAction, task: dict) -> float:
     """Binary classification: flaky or stable. Exact match only."""
     if action.action_type != "classify_flakiness":
-        return 0.0
+        return 0.001

     predicted = action.argument.strip().lower()
     if predicted not in ("flaky", "stable"):
-        return 0.0
+        return 0.001

     # All IDoFT rows are flaky; stable examples are synthetically added
     # with label="stable" during dataset construction
     ground_truth = task.get("label", "flaky")
-    return 1.0 if predicted == ground_truth else 0.0
+    return 0.999 if predicted == ground_truth else 0.0
 ```

 ### 7.3 Task 2 Grader (`graders/task2_grader.py`)

@@ -677,7 +677,7 @@ with open(_SIM_PATH) as f:

 def _get_similarity(pred: str, true: str) -> float:
     if pred == true:
-        return 1.0
+        return 0.999
     key1 = f"{pred},{true}"
     key2 = f"{true},{pred}"
     return _RAW_SIM.get(key1, _RAW_SIM.get(key2, 0.0))

@@ -695,7 +695,7 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
     Wrong family = 0.0
     """
     if action.action_type != "classify_root_cause":
-        return 0.0
+        return 0.001

     predicted = action.argument.strip().upper()

@@ -703,7 +703,7 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
     predicted = predicted.replace(" ", "-")  # "OD Brit" → "OD-Brit"

     if predicted not in VALID_CATEGORIES:
-        return 0.0
+        return 0.001  # invalid category string

     # Take primary category from dataset (first if semicolon-separated)
     true_category = str(task.get("category", "")).split(";")[0].strip().upper()

@@ -745,11 +745,11 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
     Component C: LLM judge — 0.40 weight
     """
     if action.action_type != "propose_fix":
-        return 0.0
+        return 0.001

     proposed_fix = action.argument.strip()
     if not proposed_fix:
-        return 0.0
+        return 0.001

     category = str(task.get("category", "")).split(";")[0].strip().upper()
     known_fix = task.get("known_fix_diff", "") or ""

@@ -759,7 +759,7 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
     patterns = EXPECTED_FIX_PATTERNS.get(category, [])
     if patterns:
         matches = sum(1 for p in patterns if p in proposed_fix)
-        pattern_score = min(1.0, matches / max(1, len(patterns) * 0.4))
+        pattern_score = min(0.999, matches / max(1, len(patterns) * 0.4))
     else:
         pattern_score = 0.5

@@ -770,7 +770,7 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
     judge_score = _llm_judge(proposed_fix, known_fix, category, test_code)

     total = (0.35 * pattern_score) + (0.25 * apply_score) + (0.40 * judge_score)
-    return round(min(1.0, max(0.0, total)), 4)
+    return round(min(0.999, max(0.001, total)), 4)


 def _check_diff_applies(fix: str, task: dict) -> float:

@@ -791,7 +791,7 @@ def _check_diff_applies(fix: str, task: dict) -> float:
         capture_output=True, text=True, timeout=10
     )
     os.unlink(patch_path)
-    return 1.0 if result.returncode == 0 else 0.0
+    return 0.999 if result.returncode == 0 else 0.0
 except Exception:
     return 0.3  # can't verify, neutral

@@ -905,7 +905,7 @@ description: >

 observation_type: FlakySleuthObservation
 action_type: FlakySleuthAction
-reward_range: (0.0, 1.0)
+reward_range: (0.001, 0.999)

 tasks:
   - id: task1_classify

@@ -1176,7 +1176,7 @@ DAY 3 — Graders
 □ Implement graders/task2_grader.py + verify similarity matrix
 □ Implement graders/task3_grader.py (pattern + diff + LLM judge)
 □ Unit test all 3 graders with hardcoded inputs
-□ Verify scores are always in [0.0, 1.0]
+□ Verify scores are always in (0.001, 0.999)

 DAY 4 — Server + Spec Compliance
 ──────────────────────────────────
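
For the Day 3 item "Verify scores are always in (0.001, 0.999)", a minimal pytest sketch; `FakeAction` and the task dictionaries are hypothetical stand-ins for `env.models.FlakySleuthAction` and real dataset rows:

```python
# test_score_bounds.py — sketch only; FakeAction duck-types the fields
# the graders actually read (action_type, argument).
from dataclasses import dataclass

import pytest

from graders import task1_grader, task2_grader, task3_grader

@dataclass
class FakeAction:
    action_type: str
    argument: str

@pytest.mark.parametrize("grader, action, task", [
    (task1_grader, FakeAction("classify_flakiness", "flaky"), {"label": "flaky"}),
    (task1_grader, FakeAction("classify_flakiness", "maybe"), {"label": "flaky"}),
    (task2_grader, FakeAction("classify_root_cause", "OD-Vic"), {"category": "OD-Vic"}),
    (task3_grader, FakeAction("propose_fix", ""), {"category": "OD"}),
])
def test_scores_stay_inside_bounds(grader, action, task):
    score = grader.grade(action, task)
    assert 0.001 <= score <= 0.999
```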
graders/__init__.py
CHANGED
@@ -14,4 +14,4 @@ def grade_action(action: FlakySleuthAction, task: dict) -> float:
         return grade_t2(action, task)
     if task_type == "fix_proposal":
         return grade_t3(action, task)
-    return 0.0
+    return 0.001
graders/task1_grader.py
CHANGED
@@ -6,11 +6,11 @@ from env.models import FlakySleuthAction
 def grade(action: FlakySleuthAction, task: dict) -> float:
     """Binary classification: flaky or stable. Exact match only."""
     if action.action_type != "classify_flakiness":
-        return 0.0
+        return 0.001

     predicted = action.argument.strip().lower()
     if predicted not in ("flaky", "stable"):
-        return 0.0
+        return 0.001

     ground_truth = str(task.get("label", "flaky")).strip().lower() or "flaky"
-    return 1.0 if predicted == ground_truth else 0.0
+    return 0.999 if predicted == ground_truth else 0.001
graders/task2_grader.py
CHANGED
@@ -36,7 +36,7 @@ def _normalize_category(value: str) -> str:

 def _get_similarity(predicted: str, truth: str) -> float:
     if predicted == truth:
-        return 1.0
+        return 0.999
     key_a = f"{predicted},{truth}"
     key_b = f"{truth},{predicted}"
     return float(_RAW_SIM.get(key_a, _RAW_SIM.get(key_b, 0.0)))

@@ -45,15 +45,16 @@ def _get_similarity(predicted: str, truth: str) -> float:
 def grade(action: FlakySleuthAction, task: dict) -> float:
     """Root cause category classification with matrix-based partial credit."""
     if action.action_type != "classify_root_cause":
-        return 0.0
+        return 0.001

     predicted = _normalize_category(action.argument)
     if predicted not in VALID_CATEGORIES:
-        return 0.0
+        return 0.001

     raw_truth = str(task.get("category", "")).split(";")[0]
     truth = _normalize_category(raw_truth)
     if truth not in VALID_CATEGORIES:
-        return 0.0
+        return 0.001

-    return _get_similarity(predicted, truth)
+    sim = _get_similarity(predicted, truth)
+    return max(0.001, min(0.999, sim))
graders/task3_grader.py
CHANGED
@@ -30,11 +30,11 @@ EXPECTED_FIX_PATTERNS = {
 def grade(action: FlakySleuthAction, task: dict) -> float:
     """Hybrid fixer grader: pattern + dry-run apply + LLM judge."""
     if action.action_type != "propose_fix":
-        return 0.0
+        return 0.001

     proposed_fix = action.argument.strip()
     if not proposed_fix:
-        return 0.0
+        return 0.001

     category = str(task.get("category", "")).split(";")[0].strip().upper()
     known_fix = task.get("known_fix_diff", "") or ""

@@ -45,7 +45,7 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
     matches = sum(
         1 for pattern in patterns if pattern.lower() in proposed_fix.lower()
     )
-    pattern_score = min(1.0, matches / max(1, len(patterns) * 0.4))
+    pattern_score = min(0.999, matches / max(1, len(patterns) * 0.4))
 else:
     pattern_score = 0.5

@@ -53,12 +53,12 @@ def grade(action: FlakySleuthAction, task: dict) -> float:
     judge_score = _llm_judge(proposed_fix, known_fix, category, test_code)

     total = (0.35 * pattern_score) + (0.25 * apply_score) + (0.40 * judge_score)
-    return round(min(1.0, max(0.0, total)), 4)
+    return round(min(0.999, max(0.001, total)), 4)


 def _check_diff_applies(diff_text: str, task: dict) -> float:
     if "+++" not in diff_text or "---" not in diff_text:
-        return 0.0
+        return 0.001

     repo_root = str(task.get("sandbox_root", "")).strip()
     if not repo_root or not Path(repo_root).exists():

@@ -79,7 +79,7 @@ def _check_diff_applies(diff_text: str, task: dict) -> float:
         text=True,
         timeout=10,
     )
-    return 1.0 if result.returncode == 0 else 0.0
+    return 0.999 if result.returncode == 0 else 0.001
 except Exception:
     return 0.3
 finally:

@@ -156,6 +156,7 @@ Respond ONLY with JSON:
     raw = raw.replace("```json", "").replace("```", "").strip()
     payload = json.loads(raw)
     score = int(payload.get("score", 5))
-    return max(0.0, min(10.0, score)) / 10.0
+    raw_score = max(0.0, min(10.0, score)) / 10.0
+    return max(0.001, min(0.999, raw_score))
 except Exception:
     return 0.5
inference.py
CHANGED
@@ -790,7 +790,7 @@ def run_episode(
     _compliance_log_end(
         success=success,
         steps=steps_taken,
-        score=min(max(final_episode_score, 0.0), 1.0),
+        score=min(max(final_episode_score, 0.001), 0.999),
         rewards=rewards,
     )
inference_compliance.py
CHANGED
@@ -173,7 +173,7 @@ async def main() -> None:
                 break

         score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
-        score = min(max(score, 0.0), 1.0)
+        score = min(max(score, 0.001), 0.999)  # clamp to (0.001, 0.999)
         success = score >= SUCCESS_SCORE_THRESHOLD

     finally:
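
Numerically, the normalization plus clamp behaves like this; the reward values and `MAX_TOTAL_REWARD` below are assumed for illustration (the real constant lives in inference_compliance.py):

```python
MAX_TOTAL_REWARD = 1.2                 # assumed budget, for illustration only
rewards = [0.0014, 0.0017, 0.999]      # assumed per-step rewards from one episode

score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
score = min(max(score, 0.001), 0.999)  # clamp to (0.001, 0.999)
print(round(score, 4))                 # 0.8351
```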
inference_debug.py
CHANGED
@@ -783,7 +783,7 @@ def run_episode(
     _compliance_log_end(
         success=success,
         steps=steps_taken,
-        score=min(max(final_episode_score, 0.0), 1.0),
+        score=min(max(final_episode_score, 0.001), 0.999),
         rewards=rewards,
     )
openenv.yaml
CHANGED
@@ -13,7 +13,7 @@ description: >

 action_type: FlakySleuthAction
 observation_type: FlakySleuthObservation
-reward_range: (0.0, 1.0)
+reward_range: (0.001, 0.999)
 episode_max_steps: 20
 baseline_script: inference.py