| """ |
| Main evaluation orchestrator for bias detection framework. |
| |
| This module coordinates the evaluation process and provides the main interface |
| for running evaluations. |
| """ |
| from datetime import datetime |
| from pathlib import Path |
| from typing import List, Optional |
|
|
| from .models import Language, LanguageEvaluationResult |
| from .data_loader import GroundTruthLoader, ResultsWriter, DataLoadError |
| from .bias_detector import BiasDetector, BiasDetectionError |
| from .metrics_calculator import MetricsCalculator, MetricsFormatter |
|
|
|
|
| class EvaluationError(Exception): |
| """Custom exception for evaluation errors.""" |
| pass |
|
|
|
|
| class BiasEvaluationOrchestrator: |
| """ |
| Main orchestrator for bias detection evaluation. |
| |
| Coordinates data loading, bias detection, metrics calculation, and result output. |
| Provides a clean interface for running complete evaluations. |
| """ |
| |
| def __init__( |
| self, |
| data_dir: Path = Path("eval"), |
| rules_dir: Path = Path("rules"), |
| results_dir: Path = Path("eval/results") |
| ): |
| """ |
| Initialize the evaluation orchestrator. |
| |
| Args: |
| data_dir: Directory containing ground truth data |
| rules_dir: Directory containing bias detection rules |
| results_dir: Directory for writing results |
| """ |
| self.ground_truth_loader = GroundTruthLoader(data_dir) |
| self.bias_detector = BiasDetector(rules_dir) |
| self.metrics_calculator = MetricsCalculator() |
| self.metrics_formatter = MetricsFormatter() |
| self.results_writer = ResultsWriter(results_dir) |
| |
| def run_evaluation( |
| self, |
| languages: Optional[List[Language]] = None, |
| save_results: bool = True |
| ) -> List[LanguageEvaluationResult]: |
| """ |
| Run complete bias detection evaluation. |
| |
| Args: |
| languages: List of languages to evaluate (defaults to English and Swahili) |
| save_results: Whether to save results to files |
| |
| Returns: |
| List of evaluation results for each language |
| |
| Raises: |
| EvaluationError: If evaluation fails |
| """ |
| if languages is None: |
| |
| languages = [Language.ENGLISH, Language.SWAHILI, Language.FRENCH, Language.GIKUYU] |
| |
| results = [] |
| |
| try: |
| for language in languages: |
| print(f"Evaluating {language.value}...") |
| result = self._evaluate_language(language) |
| results.append(result) |
| |
| |
| lang_names = { |
| Language.ENGLISH: "English", |
| Language.SWAHILI: "Swahili", |
| Language.FRENCH: "French", |
| Language.GIKUYU: "Gikuyu" |
| } |
| lang_name = lang_names.get(language, language.value) |
| print(f"{lang_name} Results:") |
| print(f" Overall F1: {result.overall_metrics.f1_score:.3f}") |
| print(f" Precision: {result.overall_metrics.precision:.3f}") |
| print(f" Recall: {result.overall_metrics.recall:.3f}") |
| print() |
| |
| if save_results: |
| self._save_results(results) |
| |
| return results |
| |
| except Exception as e: |
| raise EvaluationError(f"Evaluation failed: {e}") from e |
| |
| def _evaluate_language(self, language: Language) -> LanguageEvaluationResult: |
| """Evaluate bias detection for a single language.""" |
| try: |
| |
| ground_truth = self.ground_truth_loader.load_ground_truth(language) |
| |
| |
| predictions = [] |
| for sample in ground_truth: |
| prediction = self.bias_detector.detect_bias(sample.text, language) |
| predictions.append(prediction) |
| |
| |
| result = self.metrics_calculator.calculate_language_metrics( |
| ground_truth, predictions, language |
| ) |
| |
| return result |
| |
| except (DataLoadError, BiasDetectionError) as e: |
| raise EvaluationError(f"Failed to evaluate {language}: {e}") from e |
| |
| def _save_results(self, results: List[LanguageEvaluationResult]) -> None: |
| """Save evaluation results to files.""" |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| |
| try: |
| |
| csv_data = self.metrics_formatter.format_for_csv(results) |
| csv_filename = f"f1_report_{timestamp}.csv" |
| csv_path = self.results_writer.write_csv_report(csv_data, csv_filename) |
| print(f"Report saved to: {csv_path}") |
| |
| except Exception as e: |
| print(f"Warning: Failed to save results: {e}") |
|
|
|
|
| def main() -> None: |
| """Main entry point for evaluation script.""" |
| try: |
| print("Running bias detection evaluation...") |
| |
| orchestrator = BiasEvaluationOrchestrator() |
| results = orchestrator.run_evaluation() |
| |
| print("Evaluation completed successfully!") |
| |
| except EvaluationError as e: |
| print(f"Evaluation failed: {e}") |
| exit(1) |
| except KeyboardInterrupt: |
| print("\nEvaluation interrupted by user") |
| exit(1) |
| except Exception as e: |
| print(f"Unexpected error: {e}") |
| exit(1) |
|
|
|
|
| if __name__ == "__main__": |
| main() |