Spaces:

jester1177
/

cloudnative-devops-debug-env

Sleeping

App Files Files Community

Krishna1107 committed on Apr 7

Commit

6a5922c

1 Parent(s): 19ed2d4

grading fix

Browse files

Files changed (2) hide show

server/graders/__init__.py +56 -32
server/models.py +1 -1

server/graders/__init__.py CHANGED Viewed

@@ -1,11 +1,14 @@
 """Deterministic grader for trajectory scoring.
 Scoring weights:
-  partial fixes   40%  (proportional to fix ratio)
-  complete bonus  30%  (all issues fixed)
-  efficiency      30%  (decays with extra steps)
-  hint penalty    -5%  each
-  failed edit     -2%  each
 """
 from typing import Any, Dict, List
@@ -13,20 +16,30 @@ from typing import Any, Dict, List
 from server.models import GraderResult
 from server.tasks.task_registry import TASK_REGISTRY
-# Tunable weights
-PARTIAL_FIX_WEIGHT = 0.40
-COMPLETE_BONUS = 0.30
-EFFICIENCY_MAX = 0.30
 EFFICIENCY_DECAY = 0.03  # per extra step beyond optimal
-HINT_PENALTY = 0.05
 FAILED_ACTION_PENALTY = 0.02
 EDIT_ACTION_TYPES = frozenset({
     "edit_file", "replace_line", "add_line",
     "delete_line", "add_block", "delete_block",
 })
 def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
     if task_id not in TASK_REGISTRY:
         raise ValueError(f"Unknown task: {task_id}")
@@ -34,16 +47,26 @@ def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
     if not trajectory:
         return GraderResult(
             task_id=task_id,
-            score=0.0,
-            breakdown={"partial_fixes": 0.0, "complete_solution": 0.0, "efficiency": 0.0, "hint_penalty": 0.0},
-            feedback="No actions taken",
             steps_taken=0,
             hints_used=0,
         )
     final_step = trajectory[-1]
     steps_taken = len(trajectory)
-    hints_used = sum(1 for s in trajectory if s.get("action", {}).get("action_type") == "request_hint")
     issues_fixed = int(final_step.get("info", {}).get("issues_fixed", 0))
     issues_total = max(1, int(final_step.get("info", {}).get("issues_total", 1)))
@@ -67,7 +90,7 @@ def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
     # Component 4: Hint penalty
     hint_pen = HINT_PENALTY * hints_used
-    # Component 5: Failed action penalty (edits with no valid file_path)
     failed_edits = 0
     for step in trajectory:
         action = step.get("action", {})
@@ -77,29 +100,30 @@ def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
                 failed_edits += 1
     failed_pen = FAILED_ACTION_PENALTY * failed_edits
-    score = partial_score + complete_bonus + efficiency_score - hint_pen - failed_pen
-    score = max(0.0, min(1.0, round(score, 3)))
-    if score >= 0.9:
-        feedback = "Excellent! All issues fixed efficiently."
-    elif score >= 0.7:
-        feedback = "Good job! Most issues fixed."
-    elif score >= 0.5:
-        feedback = "Partial success. Some issues remain."
-    elif score >= 0.3:
-        feedback = "Limited progress. Review the error messages carefully."
     else:
-        feedback = "Needs improvement. Try analyzing the error phase first."
     return GraderResult(
         task_id=task_id,
         score=score,
         breakdown={
-            "partial_fixes": round(partial_score, 3),
-            "complete_solution": round(complete_bonus, 3),
-            "efficiency": round(efficiency_score, 3),
-            "hint_penalty": round(-hint_pen, 3),
-            "failed_action_penalty": round(-failed_pen, 3),
         },
         feedback=feedback,
         steps_taken=steps_taken,

 """Deterministic grader for trajectory scoring.
 Scoring weights:
+  base score      5%   (participation credit; penalties can outweigh it — the final clamp keeps score > 0)
+  partial fixes  35%   (proportional to fix ratio)
+  complete bonus 25%   (all issues fixed)
+  efficiency     25%   (decays with extra steps)
+  hint penalty   -4%   each
+  failed edit    -2%   each
+Score is always clamped to [0.01, 0.99] so it never hits 0 or 1.
 """
 from typing import Any, Dict, List
 from server.models import GraderResult
 from server.tasks.task_registry import TASK_REGISTRY
+# Tunable weights — max possible = 0.05 + 0.35 + 0.25 + 0.25 = 0.90
+BASE_SCORE = 0.05
+PARTIAL_FIX_WEIGHT = 0.35
+COMPLETE_BONUS = 0.25
+EFFICIENCY_MAX = 0.25
 EFFICIENCY_DECAY = 0.03  # per extra step beyond optimal
+HINT_PENALTY = 0.04
 FAILED_ACTION_PENALTY = 0.02
+# Hard boundaries — score can never be exactly 0 or 1
+SCORE_FLOOR = 0.01
+SCORE_CEIL = 0.99
 EDIT_ACTION_TYPES = frozenset({
     "edit_file", "replace_line", "add_line",
     "delete_line", "add_block", "delete_block",
 })
+def _clamp(value: float) -> float:
+    """Clamp score to the open interval (0, 1)."""
+    return max(SCORE_FLOOR, min(SCORE_CEIL, round(value, 4)))
 def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
     if task_id not in TASK_REGISTRY:
         raise ValueError(f"Unknown task: {task_id}")
     if not trajectory:
         return GraderResult(
             task_id=task_id,
+            score=_clamp(BASE_SCORE),
+            breakdown={
+                "base": BASE_SCORE,
+                "partial_fixes": 0.0,
+                "complete_solution": 0.0,
+                "efficiency": 0.0,
+                "hint_penalty": 0.0,
+                "failed_action_penalty": 0.0,
+            },
+            feedback="No actions taken.",
             steps_taken=0,
             hints_used=0,
         )
     final_step = trajectory[-1]
     steps_taken = len(trajectory)
+    hints_used = sum(
+        1 for s in trajectory
+        if s.get("action", {}).get("action_type") == "request_hint"
+    )
     issues_fixed = int(final_step.get("info", {}).get("issues_fixed", 0))
     issues_total = max(1, int(final_step.get("info", {}).get("issues_total", 1)))
     # Component 4: Hint penalty
     hint_pen = HINT_PENALTY * hints_used
+    # Component 5: Failed action penalty
     failed_edits = 0
     for step in trajectory:
         action = step.get("action", {})
                 failed_edits += 1
     failed_pen = FAILED_ACTION_PENALTY * failed_edits
+    raw = BASE_SCORE + partial_score + complete_bonus + efficiency_score - hint_pen - failed_pen
+    score = _clamp(raw)
+    if score >= 0.85:
+        feedback = "Excellent — all issues fixed efficiently."
+    elif score >= 0.65:
+        feedback = "Good job — most issues fixed."
+    elif score >= 0.45:
+        feedback = "Partial success — some issues remain."
+    elif score >= 0.25:
+        feedback = "Limited progress — review the error messages carefully."
     else:
+        feedback = "Needs improvement — try analyzing the error phase first."
     return GraderResult(
         task_id=task_id,
         score=score,
         breakdown={
+            "base": BASE_SCORE,
+            "partial_fixes": round(partial_score, 4),
+            "complete_solution": round(complete_bonus, 4),
+            "efficiency": round(efficiency_score, 4),
+            "hint_penalty": round(-hint_pen, 4),
+            "failed_action_penalty": round(-failed_pen, 4),
         },
         feedback=feedback,
         steps_taken=steps_taken,

server/models.py CHANGED Viewed

@@ -108,7 +108,7 @@ class EnvironmentInfo(BaseModel):
 class GraderResult(BaseModel):
     task_id: str
-    score: float = Field(..., ge=0.0, le=1.0)
     max_score: float = 1.0
     breakdown: Dict[str, float] = Field(default_factory=dict)
     feedback: str = ""

 class GraderResult(BaseModel):
     task_id: str
+    score: float = Field(..., gt=0.0, lt=1.0)
     max_score: float = 1.0
     breakdown: Dict[str, float] = Field(default_factory=dict)
     feedback: str = ""