Krishna1107 committed on
Commit
6a5922c
·
1 Parent(s): 19ed2d4

grading fix

Browse files
Files changed (2) hide show
  1. server/graders/__init__.py +56 -32
  2. server/models.py +1 -1
server/graders/__init__.py CHANGED
@@ -1,11 +1,14 @@
1
  """Deterministic grader for trajectory scoring.
2
 
3
  Scoring weights:
4
- partial fixes 40% (proportional to fix ratio)
5
- complete bonus 30% (all issues fixed)
6
- efficiency 30% (decays with extra steps)
7
- hint penalty -5% each
8
- failed edit -2% each
 
 
 
9
  """
10
 
11
  from typing import Any, Dict, List
@@ -13,20 +16,30 @@ from typing import Any, Dict, List
13
  from server.models import GraderResult
14
  from server.tasks.task_registry import TASK_REGISTRY
15
 
16
- # Tunable weights
17
- PARTIAL_FIX_WEIGHT = 0.40
18
- COMPLETE_BONUS = 0.30
19
- EFFICIENCY_MAX = 0.30
 
20
  EFFICIENCY_DECAY = 0.03 # per extra step beyond optimal
21
- HINT_PENALTY = 0.05
22
  FAILED_ACTION_PENALTY = 0.02
23
 
 
 
 
 
24
  EDIT_ACTION_TYPES = frozenset({
25
  "edit_file", "replace_line", "add_line",
26
  "delete_line", "add_block", "delete_block",
27
  })
28
 
29
 
 
 
 
 
 
30
  def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
31
  if task_id not in TASK_REGISTRY:
32
  raise ValueError(f"Unknown task: {task_id}")
@@ -34,16 +47,26 @@ def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
34
  if not trajectory:
35
  return GraderResult(
36
  task_id=task_id,
37
- score=0.0,
38
- breakdown={"partial_fixes": 0.0, "complete_solution": 0.0, "efficiency": 0.0, "hint_penalty": 0.0},
39
- feedback="No actions taken",
 
 
 
 
 
 
 
40
  steps_taken=0,
41
  hints_used=0,
42
  )
43
 
44
  final_step = trajectory[-1]
45
  steps_taken = len(trajectory)
46
- hints_used = sum(1 for s in trajectory if s.get("action", {}).get("action_type") == "request_hint")
 
 
 
47
 
48
  issues_fixed = int(final_step.get("info", {}).get("issues_fixed", 0))
49
  issues_total = max(1, int(final_step.get("info", {}).get("issues_total", 1)))
@@ -67,7 +90,7 @@ def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
67
  # Component 4: Hint penalty
68
  hint_pen = HINT_PENALTY * hints_used
69
 
70
- # Component 5: Failed action penalty (edits with no valid file_path)
71
  failed_edits = 0
72
  for step in trajectory:
73
  action = step.get("action", {})
@@ -77,29 +100,30 @@ def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
77
  failed_edits += 1
78
  failed_pen = FAILED_ACTION_PENALTY * failed_edits
79
 
80
- score = partial_score + complete_bonus + efficiency_score - hint_pen - failed_pen
81
- score = max(0.0, min(1.0, round(score, 3)))
82
-
83
- if score >= 0.9:
84
- feedback = "Excellent! All issues fixed efficiently."
85
- elif score >= 0.7:
86
- feedback = "Good job! Most issues fixed."
87
- elif score >= 0.5:
88
- feedback = "Partial success. Some issues remain."
89
- elif score >= 0.3:
90
- feedback = "Limited progress. Review the error messages carefully."
91
  else:
92
- feedback = "Needs improvement. Try analyzing the error phase first."
93
 
94
  return GraderResult(
95
  task_id=task_id,
96
  score=score,
97
  breakdown={
98
- "partial_fixes": round(partial_score, 3),
99
- "complete_solution": round(complete_bonus, 3),
100
- "efficiency": round(efficiency_score, 3),
101
- "hint_penalty": round(-hint_pen, 3),
102
- "failed_action_penalty": round(-failed_pen, 3),
 
103
  },
104
  feedback=feedback,
105
  steps_taken=steps_taken,
 
1
  """Deterministic grader for trajectory scoring.
2
 
3
  Scoring weights:
4
+ base score 5% (participation — guarantees score > 0)
5
+ partial fixes 35% (proportional to fix ratio)
6
+ complete bonus 25% (all issues fixed)
7
+ efficiency 25% (decays with extra steps)
8
+ hint penalty -4% each
9
+ failed edit -2% each
10
+
11
+ Score is always clamped to [0.01, 0.99] (inclusive) so it never hits 0 or 1.
12
  """
13
 
14
  from typing import Any, Dict, List
 
16
  from server.models import GraderResult
17
  from server.tasks.task_registry import TASK_REGISTRY
18
 
19
+ # Tunable weights — max possible = 0.05 + 0.35 + 0.25 + 0.25 = 0.90
20
+ BASE_SCORE = 0.05
21
+ PARTIAL_FIX_WEIGHT = 0.35
22
+ COMPLETE_BONUS = 0.25
23
+ EFFICIENCY_MAX = 0.25
24
  EFFICIENCY_DECAY = 0.03 # per extra step beyond optimal
25
+ HINT_PENALTY = 0.04
26
  FAILED_ACTION_PENALTY = 0.02
27
 
28
+ # Hard boundaries — score can never be exactly 0 or 1
29
+ SCORE_FLOOR = 0.01
30
+ SCORE_CEIL = 0.99
31
+
32
  EDIT_ACTION_TYPES = frozenset({
33
  "edit_file", "replace_line", "add_line",
34
  "delete_line", "add_block", "delete_block",
35
  })
36
 
37
 
38
+ def _clamp(value: float) -> float:
39
+ """Clamp score to [SCORE_FLOOR, SCORE_CEIL], keeping it strictly inside (0, 1)."""
40
+ return max(SCORE_FLOOR, min(SCORE_CEIL, round(value, 4)))
41
+
42
+
43
  def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
44
  if task_id not in TASK_REGISTRY:
45
  raise ValueError(f"Unknown task: {task_id}")
 
47
  if not trajectory:
48
  return GraderResult(
49
  task_id=task_id,
50
+ score=_clamp(BASE_SCORE),
51
+ breakdown={
52
+ "base": BASE_SCORE,
53
+ "partial_fixes": 0.0,
54
+ "complete_solution": 0.0,
55
+ "efficiency": 0.0,
56
+ "hint_penalty": 0.0,
57
+ "failed_action_penalty": 0.0,
58
+ },
59
+ feedback="No actions taken.",
60
  steps_taken=0,
61
  hints_used=0,
62
  )
63
 
64
  final_step = trajectory[-1]
65
  steps_taken = len(trajectory)
66
+ hints_used = sum(
67
+ 1 for s in trajectory
68
+ if s.get("action", {}).get("action_type") == "request_hint"
69
+ )
70
 
71
  issues_fixed = int(final_step.get("info", {}).get("issues_fixed", 0))
72
  issues_total = max(1, int(final_step.get("info", {}).get("issues_total", 1)))
 
90
  # Component 4: Hint penalty
91
  hint_pen = HINT_PENALTY * hints_used
92
 
93
+ # Component 5: Failed action penalty
94
  failed_edits = 0
95
  for step in trajectory:
96
  action = step.get("action", {})
 
100
  failed_edits += 1
101
  failed_pen = FAILED_ACTION_PENALTY * failed_edits
102
 
103
+ raw = BASE_SCORE + partial_score + complete_bonus + efficiency_score - hint_pen - failed_pen
104
+ score = _clamp(raw)
105
+
106
+ if score >= 0.85:
107
+ feedback = "Excellent β€” all issues fixed efficiently."
108
+ elif score >= 0.65:
109
+ feedback = "Good job β€” most issues fixed."
110
+ elif score >= 0.45:
111
+ feedback = "Partial success β€” some issues remain."
112
+ elif score >= 0.25:
113
+ feedback = "Limited progress β€” review the error messages carefully."
114
  else:
115
+ feedback = "Needs improvement β€” try analyzing the error phase first."
116
 
117
  return GraderResult(
118
  task_id=task_id,
119
  score=score,
120
  breakdown={
121
+ "base": BASE_SCORE,
122
+ "partial_fixes": round(partial_score, 4),
123
+ "complete_solution": round(complete_bonus, 4),
124
+ "efficiency": round(efficiency_score, 4),
125
+ "hint_penalty": round(-hint_pen, 4),
126
+ "failed_action_penalty": round(-failed_pen, 4),
127
  },
128
  feedback=feedback,
129
  steps_taken=steps_taken,
server/models.py CHANGED
@@ -108,7 +108,7 @@ class EnvironmentInfo(BaseModel):
108
 
109
  class GraderResult(BaseModel):
110
  task_id: str
111
- score: float = Field(..., ge=0.0, le=1.0)
112
  max_score: float = 1.0
113
  breakdown: Dict[str, float] = Field(default_factory=dict)
114
  feedback: str = ""
 
108
 
109
  class GraderResult(BaseModel):
110
  task_id: str
111
+ score: float = Field(..., gt=0.0, lt=1.0)
112
  max_score: float = 1.0
113
  breakdown: Dict[str, float] = Field(default_factory=dict)
114
  feedback: str = ""