#!/usr/bin/env python3
"""
Run Evaluation Pipeline

This orchestrates the evaluation workflow on existing final_response data:
1. Count tactic occurrences (count_tactics.py)
2. Generate evaluation metrics (evaluate_metrics.py)
3. Compare models (compare_models.py)
4. Generate CSV with simple metrics (generate_metrics_csv.py)

NOTE: This does NOT run the full 3-agent pipeline.
Use execute_pipeline.py separately to generate final_response data first.

Usage:
    python run_evaluation.py [--skip-counting]
"""

import subprocess
import sys
from pathlib import Path
from datetime import datetime
import argparse


def find_project_root(start: Path) -> Path:
    """Find the project root by looking for common markers."""
    for p in [start] + list(start.parents):
        if (
            (p / "final_response").exists()
            or (p / "src").exists()
            or (p / ".git").exists()
        ):
            return p
    return start.parent


class EvaluationRunner:
    """Orchestrates the evaluation workflow"""

    def __init__(self, skip_counting: bool = False):
        self.skip_counting = skip_counting
        current_file = Path(__file__).resolve()
        self.project_root = find_project_root(current_file.parent)
        # Point to the evaluation/full_pipeline directory for scripts
        self.eval_dir = self.project_root / "src" / "evaluation" / "full_pipeline"
        # Output directory now in mordor_dataset/eval_output
        self.output_dir = (
            self.project_root / "mordor_dataset" / "eval_output" / "evaluation_results"
        )
        self.start_time = None

    def print_header(self, step: str, description: str):
        """Print a formatted step header"""
        print("\n" + "=" * 80)
        print(f"STEP {step}: {description}")
        print("=" * 80)

    def run_command(self, description: str, cmd: list) -> bool:
        """Run a command and handle errors"""
        print(f"\n{description}")
        print(f"Command: {' '.join(str(c) for c in cmd)}\n")
        try:
            subprocess.run(cmd, check=True)
            print(f"\n[SUCCESS] {description} completed")
            return True
        except subprocess.CalledProcessError as e:
            print(f"\n[ERROR] {description} failed with exit code {e.returncode}")
            return False
        except Exception as e:
            print(f"\n[ERROR] Unexpected error during {description}: {e}")
            return False

    def step_1_count_tactics(self) -> bool:
        """Step 1: Count tactic occurrences"""
| self.print_header("1/3", "Counting Tactic Occurrences") | |
        if self.skip_counting:
            print("Skipping tactic counting (--skip-counting flag set)")
            print("Using existing tactic_counts_summary.json")
            return True

        final_response_dir = (
            self.project_root / "mordor_dataset" / "eval_output" / "final_response"
        )
        # Ensure the output directory (and any missing parents) exists
        self.output_dir.mkdir(parents=True, exist_ok=True)
        output_file = self.output_dir / "tactic_counts_summary.json"

        if not final_response_dir.exists():
            print(
                f"[ERROR] final_response directory not found at: {final_response_dir}"
            )
            print(
                "Run execute_pipeline_all_datasets.py first to generate analysis results"
            )
            return False

        # Count response_analysis.json files
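        # rglob searches every nested subdirectory under final_response.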
        analysis_files = list(final_response_dir.rglob("response_analysis.json"))
        if not analysis_files:
            print("[ERROR] No response_analysis.json files found in final_response")
            print(
                "Run execute_pipeline_all_datasets.py first to generate analysis results"
            )
            return False

        print(f"Found {len(analysis_files)} analysis files")
        print(f"Output: {output_file}")
        script_path = self.eval_dir / "count_tactics.py"
        return self.run_command(
            "Count tactic occurrences",
            [sys.executable, str(script_path), "--output", str(output_file)],
        )

    def step_2_evaluate_metrics(self) -> bool:
        """Step 2: Generate evaluation metrics for each model"""
| self.print_header("2/3", "Generating Evaluation Metrics") | |
        tactic_counts_file = self.output_dir / "tactic_counts_summary.json"
        output_file = self.output_dir / "evaluation_report.json"

        if not tactic_counts_file.exists():
            print(f"[ERROR] Tactic counts file not found: {tactic_counts_file}")
            print("Run step 1 first or remove --skip-counting flag")
            return False

        print(f"Input: {tactic_counts_file}")
        print(f"Output: {output_file}")
        print(
            "Note: Individual model reports will be saved as evaluation_report_[model_name].json"
        )
        script_path = self.eval_dir / "evaluate_metrics.py"
        return self.run_command(
            "Generate evaluation metrics for each model",
            [
                sys.executable,
                str(script_path),
                "--input",
                str(tactic_counts_file),
                "--output",
                str(output_file),
            ],
        )

    def step_3_compare_models(self) -> bool:
        """Step 3: Compare models"""
        self.print_header("3/4", "Comparing Models")
        tactic_counts_file = self.output_dir / "tactic_counts_summary.json"
        output_file = self.output_dir / "model_comparison.json"

        if not tactic_counts_file.exists():
            print(f"[ERROR] Tactic counts file not found: {tactic_counts_file}")
            print("Run step 1 first or remove --skip-counting flag")
            return False

        print(f"Input: {tactic_counts_file}")
        print(f"Output: {output_file}")
        script_path = self.eval_dir / "compare_models.py"
        return self.run_command(
            "Compare models",
            [
                sys.executable,
                str(script_path),
                "--input",
                str(tactic_counts_file),
                "--output",
                str(output_file),
            ],
        )

    def step_4_generate_csv(self) -> bool:
        """Step 4: Generate CSV with simple metrics"""
        self.print_header("4/4", "Generating CSV Metrics")
        tactic_counts_file = self.output_dir / "tactic_counts_summary.json"
        output_file = self.output_dir / "model_metrics.csv"

        if not tactic_counts_file.exists():
            print(f"[ERROR] Tactic counts file not found: {tactic_counts_file}")
            print("Run step 1 first or remove --skip-counting flag")
            return False

        print(f"Input: {tactic_counts_file}")
        print(f"Output: {output_file}")
        script_path = self.eval_dir / "generate_metrics_csv.py"
        return self.run_command(
            "Generate CSV with simple metrics (F1, accuracy, precision, recall)",
            [
                sys.executable,
                str(script_path),
                "--input",
                str(tactic_counts_file),
                "--output",
                str(output_file),
            ],
        )

    def run(self) -> int:
        """Run the evaluation pipeline"""
        self.start_time = datetime.now()

        print("\n" + "=" * 80)
        print("EVALUATION PIPELINE")
        print("=" * 80)
        print(f"Project Root: {self.project_root}")
        print(f"Evaluation Dir: {self.eval_dir}")
        print(f"Output Dir: {self.output_dir}")
        print(f"Start Time: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
        # Step 1: Count tactics
        if not self.step_1_count_tactics():
            print("\n[ERROR] Evaluation failed at Step 1")
            return 1

        # Step 2: Evaluate metrics
        if not self.step_2_evaluate_metrics():
            print("\n[ERROR] Evaluation failed at Step 2")
            return 1

        # Step 3: Compare models
        if not self.step_3_compare_models():
            print("\n[ERROR] Evaluation failed at Step 3")
            return 1

        # Step 4: Generate CSV metrics
        if not self.step_4_generate_csv():
            print("\n[ERROR] Evaluation failed at Step 4")
            return 1

        # Success summary
        end_time = datetime.now()
        duration = (end_time - self.start_time).total_seconds()
        print("\n" + "=" * 80)
        print("EVALUATION PIPELINE COMPLETED SUCCESSFULLY")
        print("=" * 80)
        print(f"Duration: {duration:.1f} seconds")
        print("\nOutput Files:")
        print(f" - {self.output_dir / 'tactic_counts_summary.json'}")
        print(f" - {self.output_dir / 'evaluation_report.json'} (summary)")
        print(
            f" - {self.output_dir / 'evaluation_report_[model_name].json'} (per model)"
        )
        print(f" - {self.output_dir / 'model_comparison.json'}")
        print(
            f" - {self.output_dir / 'model_metrics.csv'} (simple metrics: F1, accuracy, precision, recall)"
        )
        print("\nAll outputs are now organized under: mordor_dataset/eval_output/")
        print("=" * 80 + "\n")
        return 0


def main():
    parser = argparse.ArgumentParser(
        description="Run evaluation pipeline on existing final_response data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
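        # RawDescriptionHelpFormatter preserves the epilog's line breaks in --help output.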
| epilog=""" | |
| Examples: | |
| # Run full evaluation (count tactics + evaluate metrics + compare models) | |
| python run_evaluation.py | |
| # Skip counting, only evaluate (use existing tactic_counts_summary.json) | |
| python run_evaluation.py --skip-counting | |
| Note: This does NOT run the 3-agent pipeline. | |
| Use execute_pipeline_all_datasets.py separately to process mordor dataset files. | |
| """, | |
| ) | |
    parser.add_argument(
        "--skip-counting",
        action="store_true",
        help="Skip counting tactics, use existing tactic_counts_summary.json",
    )
    args = parser.parse_args()

    runner = EvaluationRunner(skip_counting=args.skip_counting)
    exit_code = runner.run()
    sys.exit(exit_code)


if __name__ == "__main__":
    main()