Spaces:

CodeReview
/

codereview-env

Running

App Files Files Community

codereview-env / environment /rewards.py

SyamSashank

Initial commit with OpenEnv structure

6e7ce30 8 days ago

raw

history blame contribute delete

1.83 kB

	from environment.models import Issue, Action, Reward
	from environment.graders import compute_f1
	from typing import List

	def compute_reward(
	    action: Action,
	    ground_truth: List[Issue],
	    step_count: int,
	    max_steps: int
	) -> Reward:
	    """Compute the dense reward for a single review step.

	    Issues are matched on their ``(line, category)`` pair, and both the
	    reported issues and the ground truth are de-duplicated on that pair
	    before counting.

	    Reward shaping:
	    - +0.2 per correctly identified issue (true positive)
	    - -0.1 per reported issue that is not in the ground truth
	    - -0.05 * ``step_count`` (the penalty grows with the step index)
	    - +0.5 bonus when ``action.final`` is set and every distinct
	      ground-truth issue was reported
	    - when ``action.final`` is set but some issue is missing, the shaped
	      value is discarded and the reward is the value returned by
	      ``compute_f1(action.issues, ground_truth)``

	    The result is clipped to ``[-1, 1]``.

	    Args:
	        action: Agent action; ``action.issues`` and ``action.final`` are read.
	        ground_truth: Issues the agent is expected to find.
	        step_count: Step index used to scale the step penalty.
	        max_steps: Currently unused; kept so existing callers keep working.

	    Returns:
	        A ``Reward`` carrying the clipped value and a human-readable reason.
	    """
	    current_f1 = compute_f1(action.issues, ground_truth)

	    step_penalty = -0.05 * step_count

	    truth_set = {(issue.line, issue.category) for issue in ground_truth}
	    agent_set = {(issue.line, issue.category) for issue in action.issues}
	    tp_count = len(truth_set & agent_set)
	    fp_count = len(agent_set - truth_set)

	    tp_reward = tp_count * 0.2
	    fp_penalty = fp_count * 0.1

	    reward_value = tp_reward - fp_penalty + step_penalty

	    # Compare against the number of *distinct* ground-truth issues:
	    # tp_count comes from a set intersection, so comparing it with
	    # len(ground_truth) could never succeed if the list held duplicates.
	    total_issues = len(truth_set)
	    all_found = tp_count == total_issues

	    if action.final and all_found:
	        reward_value += 0.5
	        reason = f"Final answer correct! F1={current_f1}"
	    elif action.final:
	        reason = f"Final answer submitted with F1={current_f1}"
	        # Final but incomplete: the F1 score replaces the shaped reward.
	        reward_value = current_f1
	    else:
	        # step_penalty is already negative; print its magnitude so the
	        # message reads "-0.05" rather than "--0.05".
	        reason = (
	            f"Step {step_count}: {tp_count}/{total_issues} issues found. "
	            f"+{tp_reward:.2f} -{fp_penalty:.2f} -{abs(step_penalty):.2f}"
	        )

	    # Clip to [-1, 1] for stability
	    reward_value = max(-1.0, min(1.0, reward_value))

	    return Reward(value=reward_value, reason=reason)