"""Main entry point for the Korean Q&A evaluation system.

Usage:
    python main.py [--config PATH] [--dataset PATH] [--output PATH]
                   [--log-level LEVEL] [--threshold FLOAT] [--verbose]
"""

import argparse
import sys
from pathlib import Path

# Ensure the project root is on sys.path so the `src.*` imports below
# resolve even when this script is launched from another directory.
sys.path.append(str(Path(__file__).parent))

from src.config import Config
from src.dataset_loader import DatasetLoader
from src.evaluator import KoreanQAEvaluator
from src.logger import setup_logging


def main():
    """Parse CLI arguments, run the evaluation, and print a summary."""
    parser = argparse.ArgumentParser(description="Korean Q&A Evaluation System")
    parser.add_argument("--config", default=None,
                        help="Path to a config YAML file (defaults to src/config.yaml or ./config.yaml)")
    parser.add_argument("--dataset",
                        default=str(Path(__file__).parent / "assets" / "bench_korean.csv"),
                        help="Path to the dataset CSV file")
    parser.add_argument("--output", help="Output path for results (optional)")
    parser.add_argument("--log-level", default="INFO",
                        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        help="Logging level")
    parser.add_argument("--threshold", type=float, default=0.8,
                        help="Minimum score for a test case to pass (default: 0.8)")
    parser.add_argument("--verbose", action="store_true",
                        help="Enable verbose evaluation mode")

    args = parser.parse_args()

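    # Set up logging first so every subsequent step is captured in the log.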
    logger_setup = setup_logging(log_level=args.log_level)
    logger = logger_setup.get_logger(__name__)

    try:
        logger.info("Starting Korean Q&A Evaluation System")

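        # Resolve the config file: honor --config when given; otherwise use
        # the first config.yaml found in the expected locations.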
logger.info("Loading configuration...") |
|
|
if args.config is None: |
|
|
|
|
|
script_dir = Path(__file__).parent |
|
|
possible_configs = [script_dir / "src" / "config.yaml", |
|
|
script_dir / "config.yaml"] |
|
|
|
|
|
|
|
|
config_path = None |
|
|
for path in possible_configs: |
|
|
if path.exists(): |
|
|
config_path = str(path) |
|
|
break |
|
|
|
|
|
if config_path is None: |
|
|
raise FileNotFoundError("No config.yaml found in expected locations") |
|
|
else: |
|
|
config_path = args.config |
|
|
config = Config(config_path) |
        logger_setup.log_evaluation_start(args.dataset, config.gemini_model)

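        # Load the benchmark CSV and log its basic statistics.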
logger.info("Loading dataset...") |
|
|
dataset_loader = DatasetLoader() |
|
|
dataset = dataset_loader.load_from_csv(args.dataset) |
|
|
|
|
|
|
|
|
stats = dataset_loader.get_dataset_stats() |
|
|
logger.info(f"Dataset loaded: {stats}") |
|
|
|
|
|
|
|
|
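        # Build the evaluator from the configured Gemini model and CLI options.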
logger.info("Initializing evaluator...") |
|
|
evaluator = KoreanQAEvaluator( |
|
|
model_name=config.gemini_model, |
|
|
api_key=config.google_api_key, |
|
|
threshold=args.threshold, |
|
|
verbose_mode=args.verbose) |
|
|
|
|
|
|
|
|
|
|
|
logger.info("Running evaluation...") |
|
|
results = evaluator.evaluate_dataset(dataset) |
|
|
|
|
|
|
|
|
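        # Persist results: use --output when given, otherwise let the
        # evaluator pick its default location.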
        if args.output:
            output_path = evaluator.save_results(results, args.output)
        else:
            output_path = evaluator.save_results(results)

        logger_setup.log_evaluation_end(results)
        logger.info(f"Evaluation completed successfully. Results saved to: {output_path}")

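        # Print a human-readable summary to stdout.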
print("\n" + "="*60) |
|
|
print("EVALUATION SUMMARY") |
|
|
print("="*60) |
|
|
print(f"Total test cases: {results['total_cases']}") |
|
|
print(f"Passed cases: {results['passed_cases']}") |
|
|
print(f"Failed cases: {results['failed_cases']}") |
|
|
print(f"Pass rate: {results['pass_rate']}%") |
|
|
print(f"Average score: {results['average_score']}") |
|
|
print(f"Threshold: {results['threshold']}") |
|
|
print(f"Model: {results['model_name']}") |
|
|
print(f"Results saved to: {output_path}") |
|
|
print("="*60) |
|
|
|
|
|
    except Exception as e:
        # Log the full traceback so failures are diagnosable, then exit non-zero.
        logger.error(f"Evaluation failed: {e}", exc_info=True)
        sys.exit(1)


if __name__ == "__main__":
    main()