import argparse
import re
import random
from typing import Dict, List, Optional, Any

from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage

from ..evaluation.questions import EXAM_QUESTIONS


def get_model_response(llm, question):
    """Get a response from the model for a single question."""
    prompt = f"""Question: {question['statement']}

Options:
A) {question['options']['A']}
B) {question['options']['B']}
C) {question['options']['C']}
D) {question['options']['D']}

Respond with only the letter of the correct option (A, B, C, or D).

Answer:"""

    message = HumanMessage(content=prompt)
    response = llm.invoke([message])
    generated_response = response.content.strip().upper()

    # Clean the response to get just the letter
    match = re.search(r'[ABCD]', generated_response)
    if match:
        return match.group(0)
    return "No valid response"


def evaluate_theme(llm, theme_questions: List[Dict], max_questions: int = 50,
                   progress=None, theme_index: int = 0, total_themes: int = 1) -> Dict:
    """
    Evaluate a model on a specific theme.

    Args:
        llm: The language model to use
        theme_questions: List of questions for this theme
        max_questions: Maximum number of questions to evaluate per theme
        progress: Progress tracker object
        theme_index: Current theme index (for progress calculation)
        total_themes: Total number of themes (for progress calculation)
    """
    # Randomly sample questions if we have more than max_questions
    if len(theme_questions) > max_questions:
        eval_questions = random.sample(theme_questions, max_questions)
    else:
        eval_questions = theme_questions

    correct = 0
    total = len(eval_questions)
    responses = []
    versions_seen = set()

    for i, question in enumerate(eval_questions):
        # Update progress if available
        if progress is not None:
            # Calculate progress: 10% start + 80% for questions across all themes
            question_progress = i / len(eval_questions)
            theme_progress = theme_index / total_themes
            theme_portion = 1 / total_themes
            current_progress = 0.1 + 0.8 * (theme_progress + question_progress * theme_portion)
            progress(current_progress, f"Evaluando pregunta {i+1}/{len(eval_questions)}")

        response = get_model_response(llm, question)
        is_correct = response == question['real_answer']
        if is_correct:
            correct += 1

        responses.append({
            'question': question['statement'],
            'model_answer': response,
            'correct_answer': question['real_answer'],
            'is_correct': is_correct,
            'version': question['version']
        })
        versions_seen.add(question['version'])

    return {
        'accuracy': correct / total if total > 0 else 0,
        'responses': responses,
        'versions_evaluated': list(versions_seen),
        'num_questions': total
    }


def run_exam(model_name: str, openai_api_key: str, openrouter_base_url: str,
             exam_theme: Optional[str] = None, progress: Optional[Any] = None) -> Dict:
    """
    Run the exam on all themes or a specific theme.

    Args:
        model_name: The model ID to use in OpenRouter
        openai_api_key: OpenRouter API key
        openrouter_base_url: Base URL for the OpenRouter API
        exam_theme: Optional specific theme to evaluate (if None, evaluates all themes)
        progress: Optional progress tracker function
    """
    # Update progress if available
    if progress is not None:
        progress(0.05, "Inicializando modelo...")

    # For OpenRouter, set the authorization header explicitly
    headers = {
        "Authorization": f"Bearer {openai_api_key}"
    }

    llm = ChatOpenAI(
        openai_api_key=openai_api_key,
        openai_api_base=openrouter_base_url,
        model_name=model_name,
        default_headers=headers
    )

    results = {}
    total_correct = 0
    total_questions = 0

    if progress is not None:
        progress(0.1, "Preparando preguntas del examen...")

    # If a specific theme is provided, only evaluate that theme
    if exam_theme and exam_theme in EXAM_QUESTIONS:
        if progress is not None:
            progress(0.1, f"Evaluando tema: {exam_theme}...")

        theme_results = evaluate_theme(llm, EXAM_QUESTIONS[exam_theme], progress=progress)
        results[exam_theme] = theme_results

        # Add to totals for overall score calculation
        correct_count = sum(1 for r in theme_results['responses'] if r['is_correct'])
        total_correct += correct_count
        total_questions += theme_results['num_questions']
    else:
        # Get all themes with questions
        valid_themes = [(theme, questions) for theme, questions in EXAM_QUESTIONS.items() if questions]
        total_theme_count = len(valid_themes)

        # Evaluate each theme
        for i, (theme, questions) in enumerate(valid_themes):
            if progress is not None:
                progress(0.1 + (0.8 * i / total_theme_count), f"Evaluando tema: {theme}...")

            theme_results = evaluate_theme(
                llm,
                questions,
                progress=progress,
                theme_index=i,
                total_themes=total_theme_count
            )
            results[theme] = theme_results

            # Update totals for overall score
            correct_count = sum(1 for r in theme_results['responses'] if r['is_correct'])
            total_correct += correct_count
            total_questions += theme_results['num_questions']

    # Calculate overall score
    results['overall'] = {
        'accuracy': total_correct / total_questions if total_questions > 0 else 0,
        'total_questions': total_questions,
    }

    if progress is not None:
        progress(0.9, "Finalizando evaluación...")

    return results


if __name__ == "__main__":
    # Set up argparse to receive parameters from the command line
    parser = argparse.ArgumentParser(description="Take an exam using OpenRouter and ChatOpenAI")
    parser.add_argument('--openai_api_key', required=True, help="OpenRouter API key")
    parser.add_argument('--openrouter_base_url', required=True, help="Base URL of the OpenRouter API")
    parser.add_argument('--model', required=True, help="The OpenRouter model ID to use (e.g., 'openai/gpt-3.5-turbo')")
    parser.add_argument('--exam_theme', required=False, help="Specific theme to evaluate (optional)")

    args = parser.parse_args()

    # Run the exam
    results = run_exam(
        model_name=args.model,
        openai_api_key=args.openai_api_key,
        openrouter_base_url=args.openrouter_base_url,
        exam_theme=args.exam_theme
    )

    # Print the results
    print("Exam Results:")
    for theme, theme_results in results.items():
        if theme != 'overall':
            print(f"Theme: {theme}, Accuracy: {theme_results['accuracy']:.2f}")

    print(f"Overall Accuracy: {results['overall']['accuracy']:.2f}")
    print(f"Total Questions Evaluated: {results['overall']['total_questions']}")