Spaces:
Sleeping
Sleeping
| """ | |
| Batch Evaluation Script for HallucinationGuard-Env. | |
| This script demonstrates how to run batch evaluations across multiple | |
| tasks and difficulties, generating comprehensive benchmark reports. | |
| Requirements: | |
| pip install requests matplotlib pandas | |
| """ | |
| import json | |
| import time | |
| from typing import List, Dict, Any, Optional | |
| from datetime import datetime | |
| import requests | |
class BatchEvaluator:
    """
    Run batch evaluations across tasks and difficulties.

    Features:
    - Multi-task evaluation (Factual Grounding, Multi-hop, Adversarial)
    - Multiple difficulty levels
    - Performance metrics and calibration analysis
    - JSON report generation
    """

    # Task identifiers exposed by the environment server.
    TASKS = [
        "task_1_factual_grounding",
        "task_2_multi_hop_synthesis",
        "task_3_adversarial_resistance"
    ]

    # Difficulty levels accepted by the environment's /reset endpoint.
    DIFFICULTIES = ["beginner", "intermediate", "advanced"]

    def __init__(self, env_base_url: str = "https://samsankar-hallucination-guard-env.hf.space"):
        """Initialize evaluator with environment URL (trailing slashes stripped)."""
        self.env_base_url = env_base_url.rstrip('/')
        # One HTTP session so connections are reused across many /reset and /step calls.
        self.session = requests.Session()

    def get_tasks(self) -> List[Dict]:
        """Get available tasks from environment.

        Returns:
            List of task descriptors ([] if the server omits the "tasks" key).

        Raises:
            requests.HTTPError: If the server responds with an error status.
        """
        response = self.session.get(f"{self.env_base_url}/tasks")
        response.raise_for_status()
        return response.json().get("tasks", [])

    def evaluate_baseline(
        self,
        task_id: str,
        num_episodes: int = 3,
        difficulty: str = "intermediate"
    ) -> Dict[str, Any]:
        """
        Run baseline evaluation for a specific task.

        Uses a simple heuristic baseline:
        - Extract key entities from context
        - Match entities to question
        - Provide confidence based on match quality

        Args:
            task_id: Task identifier
            num_episodes: Number of episodes to run
            difficulty: Difficulty level

        Returns:
            Evaluation results: per-episode stats plus a "summary" aggregate.
        """
        results: Dict[str, Any] = {
            "task_id": task_id,
            "difficulty": difficulty,
            "episodes": [],
            "summary": {}
        }
        all_rewards: List[float] = []
        all_hallucinations: List[float] = []
        all_correct: List[float] = []

        for episode_num in range(num_episodes):
            # Reset environment to start a fresh episode.
            reset_data = self._reset(task_id=task_id, difficulty=difficulty)
            episode_rewards: List[float] = []
            episode_hallucinations = 0
            episode_correct = 0
            steps = 0
            max_steps = 10  # safety cap so an episode that never reports done terminates

            while steps < max_steps:
                # Current observation. NOTE(review): after the first step this
                # comes from the /step response -- assumes that response also
                # carries "question"/"context" for the next turn; verify against
                # the environment API.
                question = reset_data.get("question", "")
                context = reset_data.get("context", "")

                # Generate baseline answer and submit it to the environment.
                answer_data = self._generate_baseline_answer(question, context)
                step_data = self._step(**answer_data)

                # Track metrics reported by the environment.
                reward = step_data.get("reward", 0.0)
                episode_rewards.append(reward)
                if step_data.get("is_hallucination", False):
                    episode_hallucinations += 1
                if step_data.get("grounding_score", 0) > 0.7:
                    episode_correct += 1

                steps += 1
                if step_data.get("done", False):
                    break

                # Get next question from the step response.
                reset_data = step_data

            # Episode statistics; max(1, ...) guards the zero-step edge case.
            episode_avg_reward = sum(episode_rewards) / max(1, len(episode_rewards))
            all_rewards.append(episode_avg_reward)
            all_hallucinations.append(episode_hallucinations / max(1, steps))
            all_correct.append(episode_correct / max(1, steps))
            results["episodes"].append({
                "episode_num": episode_num + 1,
                "avg_reward": episode_avg_reward,
                "hallucination_rate": episode_hallucinations / max(1, steps),
                "accuracy": episode_correct / max(1, steps),
                "total_steps": steps
            })
            print(f"Episode {episode_num + 1}: Reward={episode_avg_reward:.3f}, "
                  f"Hallucinations={episode_hallucinations}/{steps}")

        # Aggregate results; guarded so num_episodes=0 yields zeros, not a crash.
        results["summary"] = {
            "avg_reward": sum(all_rewards) / max(1, len(all_rewards)),
            "avg_hallucination_rate": sum(all_hallucinations) / max(1, len(all_hallucinations)),
            "avg_accuracy": sum(all_correct) / max(1, len(all_correct)),
            "total_episodes": num_episodes,
            "timestamp": datetime.now().isoformat()
        }
        return results

    def _reset(self, task_id: Optional[str] = None, difficulty: str = "intermediate") -> dict:
        """Reset environment via POST /reset; returns the initial observation dict."""
        payload: Dict[str, Any] = {"difficulty": difficulty}
        if task_id:
            payload["task_id"] = task_id
        response = self.session.post(f"{self.env_base_url}/reset", json=payload)
        response.raise_for_status()
        return response.json()

    def _step(self, answer: str, confidence: float, source_quote: str = "") -> dict:
        """Submit one step via POST /step; returns the step result dict."""
        response = self.session.post(
            f"{self.env_base_url}/step",
            json={
                "answer": answer,
                "confidence": confidence,
                "source_quote": source_quote
            }
        )
        response.raise_for_status()
        return response.json()

    def _generate_baseline_answer(self, question: str, context: str) -> dict:
        """
        Generate a simple baseline answer.

        Strategy:
        1. Extract sentences from context
        2. Find sentence most similar to question (keyword overlap)
        3. Use that as answer with moderate confidence
        4. Use sentence as source quote

        Returns:
            Dict with "answer", "confidence", "source_quote" keys -- the exact
            payload shape expected by _step().
        """
        import re

        # Split context into sentences; drop fragments of 10 chars or fewer.
        sentences = re.split(r'[.!?]+', context)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 10]

        if not sentences:
            # Nothing usable in the context: abstain with low confidence.
            return {
                "answer": "I cannot find the answer in the provided context.",
                "confidence": 0.3,
                "source_quote": ""
            }

        # Find most relevant sentence (simple keyword matching).
        question_words = set(question.lower().split())
        best_sentence = sentences[0]
        best_overlap = 0
        for sentence in sentences:
            sentence_words = set(sentence.lower().split())
            overlap = len(question_words & sentence_words)
            if overlap > best_overlap:
                best_overlap = overlap
                best_sentence = sentence

        # Fewer than 2 shared words: the answer is likely not in the context.
        if best_overlap < 2:
            return {
                "answer": "The answer does not appear to be in the provided context.",
                "confidence": 0.4,
                "source_quote": ""
            }

        # Extract key part of sentence as answer (cap at 200 chars).
        answer = best_sentence[:200] if len(best_sentence) > 200 else best_sentence
        return {
            "answer": answer,
            "confidence": 0.5 + (best_overlap / 20),  # Higher confidence with more overlap
            "source_quote": best_sentence[:150]
        }

    def run_full_evaluation(
        self,
        episodes_per_task: int = 3,
        difficulties: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """
        Run full evaluation across all tasks and difficulties.

        Args:
            episodes_per_task: Episodes per task configuration
            difficulties: List of difficulties to test (defaults to DIFFICULTIES)

        Returns:
            Complete evaluation report
        """
        # Default to the class-level constant instead of duplicating the literal.
        difficulties = difficulties or self.DIFFICULTIES
        report: Dict[str, Any] = {
            "evaluation_date": datetime.now().isoformat(),
            "environment_url": self.env_base_url,
            "configuration": {
                "episodes_per_task": episodes_per_task,
                "difficulties": difficulties
            },
            "results": {}
        }
        print("Starting Full Evaluation")
        print("=" * 60)
        for task_id in self.TASKS:
            print(f"\nEvaluating: {task_id}")
            print("-" * 40)
            report["results"][task_id] = {}
            for difficulty in difficulties:
                print(f"  Difficulty: {difficulty}")
                task_results = self.evaluate_baseline(
                    task_id=task_id,
                    num_episodes=episodes_per_task,
                    difficulty=difficulty
                )
                report["results"][task_id][difficulty] = task_results
                # Brief pause between evaluations to avoid hammering the server.
                time.sleep(1)
        # Generate cross-task summary.
        report["summary"] = self._generate_summary(report)
        return report

    def _generate_summary(self, report: dict) -> dict:
        """Generate cross-task summary from a run_full_evaluation() report.

        Averages reward / hallucination-rate / accuracy over every
        (task, difficulty) cell, and records the best task and best
        difficulty by average reward.
        """
        summary = {
            "overall_avg_reward": 0.0,
            "overall_avg_hallucination_rate": 0.0,
            "overall_avg_accuracy": 0.0,
            "best_task": "",
            "best_difficulty": ""
        }
        all_rewards: List[float] = []
        all_hallucinations: List[float] = []
        all_accuracies: List[float] = []
        task_performances: Dict[str, float] = {}
        # Rewards grouped by difficulty across tasks, for best_difficulty.
        difficulty_performances: Dict[str, List[float]] = {}

        for task_id, difficulties in report.get("results", {}).items():
            task_rewards = []
            for difficulty, results in difficulties.items():
                task_summary = results.get("summary", {})
                all_rewards.append(task_summary.get("avg_reward", 0))
                all_hallucinations.append(task_summary.get("avg_hallucination_rate", 0))
                all_accuracies.append(task_summary.get("avg_accuracy", 0))
                task_rewards.append(task_summary.get("avg_reward", 0))
                difficulty_performances.setdefault(difficulty, []).append(
                    task_summary.get("avg_reward", 0))
            # max(1, ...) guards a task entry with no difficulty results.
            task_performances[task_id] = sum(task_rewards) / max(1, len(task_rewards))

        if all_rewards:
            summary["overall_avg_reward"] = sum(all_rewards) / len(all_rewards)
        if all_hallucinations:
            summary["overall_avg_hallucination_rate"] = sum(all_hallucinations) / len(all_hallucinations)
        if all_accuracies:
            summary["overall_avg_accuracy"] = sum(all_accuracies) / len(all_accuracies)
        if task_performances:
            summary["best_task"] = max(task_performances, key=task_performances.get)
        # Fix: "best_difficulty" was declared in the summary but never computed.
        if difficulty_performances:
            summary["best_difficulty"] = max(
                difficulty_performances,
                key=lambda d: sum(difficulty_performances[d]) / len(difficulty_performances[d])
            )
        return summary

    def save_report(self, report: dict, filename: Optional[str] = None) -> str:
        """Save report to JSON file; returns the filename used."""
        if filename is None:
            filename = f"hallucination_eval_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(filename, 'w') as f:
            json.dump(report, f, indent=2)
        # Fix: message previously printed a literal placeholder instead of the path.
        print(f"Report saved to: {filename}")
        return filename
def main():
    """CLI entry point: run the full batch evaluation and report results."""
    import argparse

    parser = argparse.ArgumentParser(description="Run batch hallucination evaluation")
    parser.add_argument("--env-url", default="https://samsankar-hallucination-guard-env.hf.space",
                        help="Environment server URL")
    parser.add_argument("--episodes", type=int, default=3, help="Episodes per task")
    parser.add_argument("--output", default=None, help="Output file name")
    cli_args = parser.parse_args()

    runner = BatchEvaluator(env_base_url=cli_args.env_url)

    # Run full evaluation across every task/difficulty combination.
    report = runner.run_full_evaluation(episodes_per_task=cli_args.episodes)

    # Print summary
    banner = "=" * 60
    print("\n" + banner)
    print("EVALUATION SUMMARY")
    print(banner)
    stats = report.get("summary", {})
    print(f"Overall Average Reward: {stats.get('overall_avg_reward', 0):.3f}")
    print(f"Overall Hallucination Rate: {stats.get('overall_avg_hallucination_rate', 0):.1%}")
    print(f"Overall Accuracy: {stats.get('overall_avg_accuracy', 0):.1%}")
    print(f"Best Performing Task: {stats.get('best_task', 'N/A')}")

    # Persist the report to disk (auto-named unless --output is given).
    runner.save_report(report, cli_args.output)


if __name__ == "__main__":
    main()