SyamSashank's picture
Initial commit with OpenEnv structure
6e7ce30
from environment.models import Issue, Action, Reward
from environment.graders import compute_f1
from typing import List
def compute_reward(
action: Action,
ground_truth: List[Issue],
step_count: int,
max_steps: int
) -> Reward:
"""
Dense reward:
- +0.2 per correctly identified issue (true positive)
- -0.1 per false positive
- -0.05 per step (encourage efficiency)
- +0.5 bonus if all issues found and final=True
- Final episode reward = F1 score at end if final=True, else 0
"""
# Compute current F1 based on issues reported so far
current_f1 = compute_f1(action.issues, ground_truth)
# Per-step penalty
step_penalty = -0.05 * step_count
# True positives: count matching (line, category)
truth_set = {(i.line, i.category) for i in ground_truth}
agent_set = {(i.line, i.category) for i in action.issues}
tp_count = len(truth_set & agent_set)
fp_count = len(agent_set - truth_set)
tp_reward = tp_count * 0.2
fp_penalty = fp_count * 0.1
reward_value = tp_reward - fp_penalty + step_penalty
# Bonus for early completion with all issues
all_found = (tp_count == len(ground_truth))
if action.final and all_found:
reward_value += 0.5
reason = f"Final answer correct! F1={current_f1}"
elif action.final:
reason = f"Final answer submitted with F1={current_f1}"
# If final but not all correct, still give F1 score as final reward
reward_value = current_f1
else:
reason = f"Step {step_count}: {tp_count}/{len(ground_truth)} issues found. +{tp_reward:.2f} -{fp_penalty:.2f} -{step_penalty:.2f}"
# Clip to [-1, 1] for stability
reward_value = max(-1.0, min(1.0, reward_value))
return Reward(value=reward_value, reason=reason)