# AutoMathReasoner/env/verifier.py
import re
import math
from typing import Any, Tuple
class VerifierSystem:
    def __init__(self):
        pass
    def check_exact_match(self, prediction: str, ground_truth: str) -> bool:
        """1. Exact match verifier (case- and whitespace-insensitive)."""
        return prediction.strip().lower() == ground_truth.strip().lower()
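
    # Illustrative behavior (example values chosen here, not from the source):
    #   check_exact_match(" X = 4 ", "x = 4")  -> True  (case/whitespace folded)
    #   check_exact_match("4.0", "4")          -> False (string-level comparison)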
    def check_numeric_tolerance(self, prediction: str, ground_truth: str, tol: float = 1e-4) -> bool:
        """2. Numeric tolerance checker (tol is applied as both the relative and absolute tolerance)."""
        try:
            pred_val = float(prediction.strip())
            gt_val = float(ground_truth.strip())
            return math.isclose(pred_val, gt_val, rel_tol=tol, abs_tol=tol)
        except ValueError:
            return False
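
    # Illustrative behavior (example values chosen here, not from the source):
    #   check_numeric_tolerance("3.14159", "3.14160") -> True  (diff ~1e-5, within tol)
    #   check_numeric_tolerance("1/3", "0.3333")      -> False (float("1/3") raises ValueError)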
    def check_python_execution(self, prediction: str, ground_truth: str) -> bool:
        """3. Python execution (evaluate safe arithmetic expressions)."""
        # If the prediction is an expression like "2+3", evaluate it with
        # builtins disabled and only the math module exposed. This is a
        # best-effort guard, not a hardened sandbox.
        safe_dict = {"__builtins__": None, "math": math}
        try:
            # Verify that evaluating the prediction reproduces the ground truth.
            pred_eval = eval(prediction.strip(), safe_dict, {})
            try:
                gt_eval = float(ground_truth.strip())
                return math.isclose(float(pred_eval), gt_eval, rel_tol=1e-4, abs_tol=1e-4)
            except ValueError:
                # Non-numeric ground truth: fall back to string comparison.
                return str(pred_eval).strip().lower() == ground_truth.strip().lower()
        except Exception:
            return False
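
    # Illustrative behavior (example values chosen here, not from the source):
    #   check_python_execution("2 + 3", "5")          -> True  (evaluates to 5)
    #   check_python_execution("math.sqrt(16)", "4")  -> True  (math is exposed)
    #   check_python_execution("open('x')", "4")      -> False (builtins disabled, eval raises)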
    def mock_llm_judge(self, reasoning: str, prediction: str, ground_truth: str) -> float:
        """4. LLM judge (mock/placeholder scoring of reasoning quality).

        Returns a reasoning quality score Q in [0.0, 1.0].
        """
        # Simple heuristic for the mock judge: longer reasoning containing
        # step-like markers is taken as higher quality. Matching is plain
        # substring search, so e.g. 'step' also matches 'steps'.
        # (prediction and ground_truth are unused by this mock.)
        step_markers = ['step', 'first', 'then', 'because', 'therefore', 'equals', '=', '+', '-']
        score = 0.0
        # Length bonus (up to 0.4): 0.01 per word.
        length = len(reasoning.split())
        score += min(0.4, length * 0.01)
        # Structure bonus (up to 0.6): 0.1 per distinct marker found.
        lower_reasoning = reasoning.lower()
        marker_count = sum(1 for m in step_markers if m in lower_reasoning)
        score += min(0.6, marker_count * 0.1)
        return round(min(1.0, score), 2)
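
    # Worked example (inputs chosen and score computed by hand for this sketch):
    #   mock_llm_judge("Step 1: compute 2 + 3 = 5. Therefore the answer equals 5.", "5", "5")
    #   13 words -> length bonus 0.13; markers hit: 'step', 'therefore',
    #   'equals', '=', '+' -> structure bonus 0.5; total Q = 0.63.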
    def check_process_supervision(self, reasoning: str) -> float:
        """
        [PAPER TRACEABILITY: Process Supervision (Lightweight PRM)]
        E. PROCESS SUPERVISION (STEP-AWARE REWARD)
        Validates reasoning steps with basic heuristics: rewards structured
        step-by-step reasoning and penalizes logical jumps.
        """
        lower_r = reasoning.lower()
        score = 0.0
        # Reward explicit stepwise structure.
        if "step 1" in lower_r and "step 2" in lower_r:
            score += 0.5
        elif "first" in lower_r and ("then" in lower_r or "next" in lower_r):
            score += 0.3
        # Penalize logical jumps: reasoning that is very short yet claims
        # operations ("=" or a standalone "so"). The word-boundary match
        # avoids false hits on words like "solution".
        if len(lower_r.split()) < 10 and ("=" in lower_r or re.search(r"\bso\b", lower_r)):
            score -= 0.5  # Logical jump penalty
        return max(-1.0, min(1.0, score))
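
    # Illustrative scores (example strings chosen here, not from the source):
    #   "Step 1: expand the product. Step 2: collect like terms and simplify." -> +0.5
    #   "x = 5 so done" -> -0.5 (under 10 words yet claims an operation)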
    def check_reflection(self, reasoning: str, c: float) -> float:
        """
        [PAPER TRACEABILITY: Reflection Module]
        H. REFLECTION MODULE
        The model generates "What could be wrong?"-style self-checks.
        Rewards correct self-correction; penalizes reflection that contradicts
        the final answer.
        """
        lower_r = reasoning.lower()
        score = 0.0
        reflection_phrases = ["what could be wrong", "wait,", "let me check", "alternatively"]
        if any(phrase in lower_r for phrase in reflection_phrases):
            # Reflection was attempted: reward it only if the final answer is correct.
            if c >= 1.0:
                score += 1.0  # Correct self-correction / successful verification
            else:
                score -= 0.5  # Contradiction or failed correction
        return score
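
    # Illustrative behavior (example values chosen here, not from the source):
    #   check_reflection("Let me check: 2*2 = 4, confirmed.", c=1.0) -> +1.0
    #   check_reflection("Wait, maybe I slipped somewhere.", c=0.0)  -> -0.5
    #   check_reflection("The answer is 4.", c=1.0)                  ->  0.0 (no reflection attempted)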
    def check_numerical_integration(self, prediction: str, sympy_f: Any) -> bool:
        """
        [PAPER TRACEABILITY: Section 3.1.3 Solution Verification]
        Numerical multi-point quadrature verification.
        Instead of evaluating integrals symbolically, we differentiate the
        predicted antiderivative F_pred(x) and compare it to the ground-truth
        integrand f(x) at 5 random points.
        """
        # Imported lazily so sympy is only required when this check runs.
        import sympy as sp
        import random
        x = sp.Symbol('x')
        try:
            # Strip boilerplate from the prediction string: an "Answer:" prefix
            # and the constant of integration "+ C".
            clean_pred = prediction.strip()
            if "Answer:" in clean_pred:
                clean_pred = clean_pred.split("Answer:")[-1].strip()
            clean_pred = clean_pred.replace("+ C", "").replace("+C", "").strip()
            F_pred = sp.parse_expr(clean_pred)
            f_pred = sp.diff(F_pred, x)
            # Evaluate at 5 random points. A sample point that hits a
            # singularity or yields a complex value makes float() raise,
            # which the except below reports as a failed check.
            for _ in range(5):
                test_point = random.uniform(-5, 5)
                p_val = float(f_pred.subs(x, test_point).evalf())
                t_val = float(sympy_f.subs(x, test_point).evalf())
                # Paper uses 10^-2 relative tolerance
                if not math.isclose(p_val, t_val, rel_tol=1e-2, abs_tol=1e-2):
                    return False
            return True
        except Exception:
            return False
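
    # Illustrative check (toy problem chosen here, not from the source): for the
    # integrand f(x) = 2*x, the prediction "x**2 + C" passes because
    # d/dx(x**2) == 2*x at every sampled point:
    #   check_numerical_integration("Answer: x**2 + C", sympy.parse_expr("2*x")) -> True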
    def verify(self, reasoning: str, prediction: str, ground_truth: str, sympy_f: Any = None) -> Tuple[float, float, float, float]:
        """
        Run all verifiers.
        Returns Correctness (C), Reasoning Quality (Q), Process Supervision (P), and Reflection (R).
        """
        c = 0.0
        if self.check_exact_match(prediction, ground_truth):
            c = 1.0
        elif sympy_f is not None and self.check_numerical_integration(prediction, sympy_f):
            c = 1.0
        elif self.check_numeric_tolerance(prediction, ground_truth):
            c = 1.0
        elif self.check_python_execution(prediction, ground_truth):
            c = 1.0
        q = self.mock_llm_judge(reasoning, prediction, ground_truth)
        p = self.check_process_supervision(reasoning)
        r = self.check_reflection(reasoning, c)
        return c, q, p, r
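

# Hypothetical smoke test, not part of the original module: exercises the full
# verify() pipeline on a toy antiderivative problem (requires sympy).
if __name__ == "__main__":
    import sympy as sp

    verifier = VerifierSystem()
    x = sp.Symbol('x')
    reasoning = (
        "Step 1: the integrand is 2*x. "
        "Step 2: an antiderivative of 2*x is x**2. "
        "Let me check: d/dx(x**2) = 2*x, which matches, so the answer equals x**2 + C."
    )
    c, q, p, r = verifier.verify(reasoning, "x**2 + C", "x**2 + C", sympy_f=2 * x)
    # Expected: C=1.0 (exact match), P=0.5 ("step 1"/"step 2" present),
    # R=1.0 (reflection phrase plus a correct answer); Q varies with the heuristic.
    print(f"C={c}, Q={q}, P={p}, R={r}")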