Spaces:
Running
Running
ajaxwin
refactor: Update task configurations and grading logic for improved scoring and consistency
dccaaac | """ | |
| grader.py (Task 2 – Property Discovery) | |
| ----------------------------------------- | |
| Deterministic scorer for natural-language property submissions. | |
| One submission attempt per episode. | |
| Grade range: 0.001 – 0.999 (0.5 × matchscore plus a cost-decay term, clamped). | |
| """ | |
| from typing import Tuple | |
| from utils import SemanticMatcher | |
class Task2Grader:
    """
    Grades a Task 2 property submission.

    Parameters
    ----------
    function_name : name of the target function
    property      : the 'property' field from the target function's data
    n             : integer that scales the free cost budget; the budget is
                    the average cost per step multiplied by ``n + 2``
    """

    def __init__(self, function_name: str, property: str, n: int) -> None:
        self.function_name = function_name
        self.property = property
        self.n = n
        # Base of the exponential penalty applied to cost above the free budget.
        self._decay = 0.75

    def _clamp(self, reward: float) -> float:
        """Limit *reward* to the closed interval [0.001, 0.999]."""
        return max(0.001, min(0.999, reward))

    def _free_budget(self, steps: int, cummulative_cost: int) -> float:
        """Return the cost that may be spent without incurring a penalty.

        The budget is the average cost per step times ``n + 2``. With zero
        steps there is no average to take, so the budget is 0.0 instead of
        raising ZeroDivisionError.
        """
        if steps == 0:
            return 0.0
        return (cummulative_cost / steps) * (self.n + 2)

    def grade(self, submitted: str, steps: int, cummulative_cost: int) -> Tuple[float, str]:
        """Deterministic grade strictly in (0, 1).

        Parameters
        ----------
        submitted        : the natural-language property text being graded
        steps            : number of steps taken; may be 0
        cummulative_cost : total cost accumulated over those steps

        Returns
        -------
        (score, label) : the clamped score and the matcher's confidence()
                         value, or (0.001, "no_match") for an empty or
                         whitespace-only submission.
        """
        if not submitted or not submitted.strip():
            return 0.001, "no_match"

        matcher = SemanticMatcher()
        match_score = matcher.matchscore(self.property, submitted)

        free_budget = self._free_budget(steps, cummulative_cost)
        # Only cost above the free budget is penalised.
        overspend = max(0, cummulative_cost - free_budget)

        # NOTE(review): the decay term is unweighted, so with no overspend it
        # equals 1.0 and the sum clamps to 0.999 whatever match_score is
        # (assuming matchscore is non-negative). Confirm whether the decay
        # term was meant to carry a 0.5 weight as well.
        final_score = (match_score * 0.5) + (self._decay ** overspend)
        return self._clamp(final_score), matcher.confidence()