"""Main entry point for the Korean Q&A evaluation system."""
import argparse
import sys
from pathlib import Path
# Make the project root importable so the `src.` imports below resolve
# even when the script is run from another working directory.
sys.path.append(str(Path(__file__).parent))
from src.logger import setup_logging
from src.config import Config
from src.dataset_loader import DatasetLoader
from src.evaluator import KoreanQAEvaluator
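
# Assumed project layout (inferred from the imports above and the paths used
# in main(), not confirmed elsewhere):
#   main.py
#   src/
#     logger.py, config.py, dataset_loader.py, evaluator.py, config.yaml
#   assets/
#     bench_korean.csv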


def main():
    """Main function to run the evaluation."""
    parser = argparse.ArgumentParser(description="Korean Q&A Evaluation System")
    parser.add_argument("--config", default=None,
                        help="Path to a config YAML file (default: auto-detect config.yaml)")
    parser.add_argument("--dataset",
                        default=str(Path(__file__).parent / "assets" / "bench_korean.csv"),
                        help="Path to the dataset CSV file")
    parser.add_argument("--output", help="Output path for results (optional)")
    parser.add_argument("--log-level", default="INFO",
                        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        help="Logging level")
    parser.add_argument("--threshold", type=float, default=0.8,
                        help="Evaluation threshold")
    parser.add_argument("--verbose", action="store_true",
                        help="Enable verbose evaluation mode")
    args = parser.parse_args()

    # Setup logging
    logger_setup = setup_logging(log_level=args.log_level)
    logger = logger_setup.get_logger(__name__)

    try:
        logger.info("Starting Korean Q&A Evaluation System")

        # Load configuration
        logger.info("Loading configuration...")
        if args.config is None:
            # Try to find the config file in multiple expected locations
            script_dir = Path(__file__).parent
            possible_configs = [
                script_dir / "src" / "config.yaml",
                script_dir / "config.yaml",
            ]
            config_path = None
            for path in possible_configs:
                if path.exists():
                    config_path = str(path)
                    break
            if config_path is None:
                raise FileNotFoundError("No config.yaml found in expected locations")
        else:
            config_path = args.config
        config = Config(config_path)
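
        # NOTE: Config is used below for `gemini_model` and `google_api_key`.
        # A minimal config.yaml might look like the sketch here; the exact
        # YAML key names are an assumption, not confirmed by this file:
        #
        #   gemini_model: gemini-1.5-flash   # illustrative model name
        #   google_api_key: <your-api-key>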

        # Log evaluation start
        logger_setup.log_evaluation_start(args.dataset, config.gemini_model)

        # Load dataset
        logger.info("Loading dataset...")
        dataset_loader = DatasetLoader()
        dataset = dataset_loader.load_from_csv(args.dataset)

        # Get dataset statistics
        stats = dataset_loader.get_dataset_stats()
        logger.info(f"Dataset loaded: {stats}")

        # Initialize evaluator
        logger.info("Initializing evaluator...")
        evaluator = KoreanQAEvaluator(
            model_name=config.gemini_model,
            api_key=config.google_api_key,
            threshold=args.threshold,
            verbose_mode=args.verbose,
        )
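
        # Assumption: the evaluator counts a case as passed when its score is
        # at or above `threshold`; the summary printed below reports passed and
        # failed counts and the pass rate on that basis.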

        # Run evaluation
        logger.info("Running evaluation...")
        results = evaluator.evaluate_dataset(dataset)

        # Save results
        if args.output:
            output_path = evaluator.save_results(results, args.output)
        else:
            output_path = evaluator.save_results(results)

        # Log evaluation end
        logger_setup.log_evaluation_end(results)
        logger.info(f"Evaluation completed successfully. Results saved to: {output_path}")

        # Print summary to console
        print("\n" + "=" * 60)
        print("EVALUATION SUMMARY")
        print("=" * 60)
        print(f"Total test cases: {results['total_cases']}")
        print(f"Passed cases: {results['passed_cases']}")
        print(f"Failed cases: {results['failed_cases']}")
        print(f"Pass rate: {results['pass_rate']}%")
        print(f"Average score: {results['average_score']}")
        print(f"Threshold: {results['threshold']}")
        print(f"Model: {results['model_name']}")
        print(f"Results saved to: {output_path}")
        print("=" * 60)
    except Exception as e:
        logger.error(f"Evaluation failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()