"""
Batch Evaluation Script for HallucinationGuard-Env.
This script demonstrates how to run batch evaluations across multiple
tasks and difficulties, generating comprehensive benchmark reports.
Requirements:
pip install requests matplotlib pandas
"""
import json
import re
import time
from datetime import datetime
from typing import Any, Dict, List, Optional

import requests
class BatchEvaluator:
"""
Run batch evaluations across tasks and difficulties.
Features:
- Multi-task evaluation (Factual Grounding, Multi-hop, Adversarial)
- Multiple difficulty levels
- Performance metrics and calibration analysis
- JSON report generation
"""
TASKS = [
"task_1_factual_grounding",
"task_2_multi_hop_synthesis",
"task_3_adversarial_resistance"
]
DIFFICULTIES = ["beginner", "intermediate", "advanced"]
def __init__(self, env_base_url: str = "https://samsankar-hallucination-guard-env.hf.space"):
"""Initialize evaluator with environment URL."""
self.env_base_url = env_base_url.rstrip('/')
self.session = requests.Session()
def get_tasks(self) -> List[Dict]:
"""Get available tasks from environment."""
response = self.session.get(f"{self.env_base_url}/tasks")
response.raise_for_status()
return response.json().get("tasks", [])
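    # Example (hypothetical, for interactive exploration; the local URL is only an
    # illustration of pointing the evaluator at a locally running environment):
    #   evaluator = BatchEvaluator(env_base_url="http://localhost:7860")
    #   print(evaluator.get_tasks())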
def evaluate_baseline(
self,
task_id: str,
num_episodes: int = 3,
difficulty: str = "intermediate"
) -> Dict[str, Any]:
"""
Run baseline evaluation for a specific task.
Uses a simple heuristic baseline:
- Extract key entities from context
- Match entities to question
- Provide confidence based on match quality
Args:
task_id: Task identifier
num_episodes: Number of episodes to run
difficulty: Difficulty level
Returns:
Evaluation results
"""
results = {
"task_id": task_id,
"difficulty": difficulty,
"episodes": [],
"summary": {}
}
all_rewards = []
all_hallucinations = []
all_correct = []
for episode_num in range(num_episodes):
# Reset environment
reset_data = self._reset(task_id=task_id, difficulty=difficulty)
episode_rewards = []
episode_hallucinations = 0
episode_correct = 0
steps = 0
max_steps = 10
while steps < max_steps:
# Get current observation
question = reset_data.get("question", "")
context = reset_data.get("context", "")
# Generate baseline answer
answer_data = self._generate_baseline_answer(question, context)
# Step environment
step_data = self._step(**answer_data)
# Track metrics
reward = step_data.get("reward", 0.0)
episode_rewards.append(reward)
if step_data.get("is_hallucination", False):
episode_hallucinations += 1
if step_data.get("grounding_score", 0) > 0.7:
episode_correct += 1
steps += 1
if step_data.get("done", False):
break
                # Use the step response as the next observation (the environment is
                # assumed to return the next question/context in the step payload)
                reset_data = step_data
# Episode statistics
episode_avg_reward = sum(episode_rewards) / max(1, len(episode_rewards))
all_rewards.append(episode_avg_reward)
all_hallucinations.append(episode_hallucinations / max(1, steps))
all_correct.append(episode_correct / max(1, steps))
results["episodes"].append({
"episode_num": episode_num + 1,
"avg_reward": episode_avg_reward,
"hallucination_rate": episode_hallucinations / max(1, steps),
"accuracy": episode_correct / max(1, steps),
"total_steps": steps
})
print(f"Episode {episode_num + 1}: Reward={episode_avg_reward:.3f}, "
f"Hallucinations={episode_hallucinations}/{steps}")
# Aggregate results
results["summary"] = {
"avg_reward": sum(all_rewards) / len(all_rewards),
"avg_hallucination_rate": sum(all_hallucinations) / len(all_hallucinations),
"avg_accuracy": sum(all_correct) / len(all_correct),
"total_episodes": num_episodes,
"timestamp": datetime.now().isoformat()
}
return results
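    # Example (hypothetical standalone use of the heuristic baseline):
    #   results = evaluator.evaluate_baseline("task_1_factual_grounding",
    #                                         num_episodes=2, difficulty="beginner")
    #   print(results["summary"]["avg_reward"], results["summary"]["avg_hallucination_rate"])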
    def _reset(self, task_id: Optional[str] = None, difficulty: str = "intermediate") -> dict:
"""Reset environment."""
payload = {"difficulty": difficulty}
if task_id:
payload["task_id"] = task_id
response = self.session.post(f"{self.env_base_url}/reset", json=payload)
response.raise_for_status()
return response.json()
def _step(self, answer: str, confidence: float, source_quote: str = "") -> dict:
"""Submit step."""
response = self.session.post(
f"{self.env_base_url}/step",
json={
"answer": answer,
"confidence": confidence,
"source_quote": source_quote
}
)
response.raise_for_status()
return response.json()
def _generate_baseline_answer(self, question: str, context: str) -> dict:
"""
Generate a simple baseline answer.
Strategy:
1. Extract sentences from context
2. Find sentence most similar to question
3. Use that as answer with moderate confidence
4. Use sentence as source quote
"""
import re
# Split context into sentences
sentences = re.split(r'[.!?]+', context)
sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
if not sentences:
return {
"answer": "I cannot find the answer in the provided context.",
"confidence": 0.3,
"source_quote": ""
}
# Find most relevant sentence (simple keyword matching)
question_words = set(question.lower().split())
best_sentence = sentences[0]
best_overlap = 0
for sentence in sentences:
sentence_words = set(sentence.lower().split())
overlap = len(question_words & sentence_words)
if overlap > best_overlap:
best_overlap = overlap
best_sentence = sentence
# Check if answer is likely in context
if best_overlap < 2:
return {
"answer": "The answer does not appear to be in the provided context.",
"confidence": 0.4,
"source_quote": ""
}
        # Use the best-matching sentence (truncated to 200 chars) as the answer
        answer = best_sentence[:200] if len(best_sentence) > 200 else best_sentence
        return {
            "answer": answer,
            "confidence": min(0.95, 0.5 + best_overlap / 20),  # more overlap -> higher confidence, capped below 1.0
            "source_quote": best_sentence[:150]
        }
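    # Illustrative trace of the heuristic above (hypothetical inputs, not from the
    # environment). Note the whitespace split keeps punctuation, so "completed?"
    # does not match "completed":
    #   _generate_baseline_answer(
    #       "What year was the Eiffel Tower completed?",
    #       "The Eiffel Tower was completed in 1889. It is located in Paris.")
    #   -> {"answer": "The Eiffel Tower was completed in 1889",
    #       "confidence": 0.7,  # 0.5 + 4 overlapping words / 20
    #       "source_quote": "The Eiffel Tower was completed in 1889"}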
def run_full_evaluation(
self,
episodes_per_task: int = 3,
        difficulties: Optional[List[str]] = None
) -> Dict[str, Any]:
"""
Run full evaluation across all tasks and difficulties.
Args:
episodes_per_task: Episodes per task configuration
difficulties: List of difficulties to test
Returns:
Complete evaluation report
"""
difficulties = difficulties or ["beginner", "intermediate", "advanced"]
report = {
"evaluation_date": datetime.now().isoformat(),
"environment_url": self.env_base_url,
"configuration": {
"episodes_per_task": episodes_per_task,
"difficulties": difficulties
},
"results": {}
}
print("Starting Full Evaluation")
print("=" * 60)
for task_id in self.TASKS:
print(f"\nEvaluating: {task_id}")
print("-" * 40)
report["results"][task_id] = {}
for difficulty in difficulties:
print(f" Difficulty: {difficulty}")
task_results = self.evaluate_baseline(
task_id=task_id,
num_episodes=episodes_per_task,
difficulty=difficulty
)
report["results"][task_id][difficulty] = task_results
# Brief pause between evaluations
time.sleep(1)
# Generate summary
report["summary"] = self._generate_summary(report)
return report
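    # Example (hypothetical programmatic use, bypassing the CLI in main()):
    #   evaluator = BatchEvaluator()
    #   report = evaluator.run_full_evaluation(episodes_per_task=2,
    #                                          difficulties=["beginner"])
    #   evaluator.save_report(report, "quick_eval.json")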
def _generate_summary(self, report: dict) -> dict:
"""Generate cross-task summary."""
summary = {
"overall_avg_reward": 0.0,
"overall_avg_hallucination_rate": 0.0,
"overall_avg_accuracy": 0.0,
"best_task": "",
"best_difficulty": ""
}
        all_rewards = []
        all_hallucinations = []
        all_accuracies = []
        task_performances = {}
        difficulty_performances = {}
        for task_id, difficulties in report.get("results", {}).items():
            task_rewards = []
            for difficulty, results in difficulties.items():
                task_summary = results.get("summary", {})
                all_rewards.append(task_summary.get("avg_reward", 0))
                all_hallucinations.append(task_summary.get("avg_hallucination_rate", 0))
                all_accuracies.append(task_summary.get("avg_accuracy", 0))
                task_rewards.append(task_summary.get("avg_reward", 0))
                difficulty_performances.setdefault(difficulty, []).append(
                    task_summary.get("avg_reward", 0))
            task_performances[task_id] = sum(task_rewards) / max(1, len(task_rewards))
if all_rewards:
summary["overall_avg_reward"] = sum(all_rewards) / len(all_rewards)
if all_hallucinations:
summary["overall_avg_hallucination_rate"] = sum(all_hallucinations) / len(all_hallucinations)
if all_accuracies:
summary["overall_avg_accuracy"] = sum(all_accuracies) / len(all_accuracies)
        if task_performances:
            summary["best_task"] = max(task_performances, key=task_performances.get)
        if difficulty_performances:
            # Best difficulty = highest mean reward across tasks at that difficulty
            difficulty_means = {d: sum(v) / len(v) for d, v in difficulty_performances.items()}
            summary["best_difficulty"] = max(difficulty_means, key=difficulty_means.get)
return summary
    def save_report(self, report: dict, filename: Optional[str] = None) -> str:
"""Save report to JSON file."""
if filename is None:
filename = f"hallucination_eval_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(filename, 'w') as f:
json.dump(report, f, indent=2)
print(f"Report saved to: {filename}")
return filename
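    # Optional helper: a minimal sketch (not part of the original script) that
    # flattens the nested report into one row per task/difficulty, assuming the
    # optional pandas dependency from the module docstring is installed.
    def report_to_dataframe(self, report: dict):
        """Flatten the nested report into a pandas DataFrame for inspection or plotting."""
        import pandas as pd  # optional dependency

        rows = []
        for task_id, difficulties in report.get("results", {}).items():
            for difficulty, results in difficulties.items():
                summary = results.get("summary", {})
                rows.append({
                    "task_id": task_id,
                    "difficulty": difficulty,
                    "avg_reward": summary.get("avg_reward", 0.0),
                    "hallucination_rate": summary.get("avg_hallucination_rate", 0.0),
                    "accuracy": summary.get("avg_accuracy", 0.0),
                })
        return pd.DataFrame(rows)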
def main():
"""Run batch evaluation."""
import argparse
parser = argparse.ArgumentParser(description="Run batch hallucination evaluation")
parser.add_argument("--env-url", default="https://samsankar-hallucination-guard-env.hf.space",
help="Environment server URL")
parser.add_argument("--episodes", type=int, default=3, help="Episodes per task")
parser.add_argument("--output", default=None, help="Output file name")
args = parser.parse_args()
evaluator = BatchEvaluator(env_base_url=args.env_url)
# Run full evaluation
report = evaluator.run_full_evaluation(
episodes_per_task=args.episodes
)
# Print summary
print("\n" + "=" * 60)
print("EVALUATION SUMMARY")
print("=" * 60)
summary = report.get("summary", {})
print(f"Overall Average Reward: {summary.get('overall_avg_reward', 0):.3f}")
print(f"Overall Hallucination Rate: {summary.get('overall_avg_hallucination_rate', 0):.1%}")
print(f"Overall Accuracy: {summary.get('overall_avg_accuracy', 0):.1%}")
print(f"Best Performing Task: {summary.get('best_task', 'N/A')}")
# Save report
evaluator.save_report(report, args.output)
if __name__ == "__main__":
main()