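"""
Run a multiple-choice exam against a language model served through OpenRouter.

Each theme in EXAM_QUESTIONS is evaluated by asking the model one question at a
time and comparing its single-letter answer (A-D) with the expected answer;
per-theme and overall accuracy are then reported.

Because EXAM_QUESTIONS is pulled in through a relative import, this file must be
run as a module from its package; the package path below is illustrative only:

    python -m <package>.<subpackage>.run_exam \\
        --openai_api_key <OPENROUTER_API_KEY> \\
        --openrouter_base_url https://openrouter.ai/api/v1 \\
        --model openai/gpt-3.5-turbo
"""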
import argparse
import re
import random
from typing import Dict, List, Optional, Any
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from ..evaluation.questions import EXAM_QUESTIONS

def get_model_response(llm, question):
    """Get response from the model for a single question"""
    prompt = f"""Question: {question['statement']}
Options:
A) {question['options']['A']}
B) {question['options']['B']}
C) {question['options']['C']}
D) {question['options']['D']}

Respond with only the letter of the correct option (A, B, C, or D).
Answer:"""
    
    message = HumanMessage(content=prompt)
    response = llm.invoke([message])
    generated_response = response.content.strip().upper()
    
    # Clean the response to get just the letter
    match = re.search(r'[ABCD]', generated_response)
    if match:
        return match.group(0)
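    # Fallback sentinel: it can never equal 'A'-'D', so it is scored as incorrect downstream.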
    return "No valid response"

def evaluate_theme(llm, theme_questions: List[Dict], max_questions: int = 50, progress=None, theme_index=0, total_themes=1) -> Dict:
    """
    Evaluate a model on a specific theme
    Args:
        llm: The language model to use
        theme_questions: List of questions for this theme
        max_questions: Maximum number of questions to evaluate per theme
        progress: Progress tracker object
        theme_index: Current theme index (for progress calculation)
        total_themes: Total number of themes (for progress calculation)
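    Returns:
        Dict with 'accuracy', 'responses', 'versions_evaluated' and 'num_questions'.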
    """
    # Randomly sample questions if we have more than max_questions
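    # (sampling is unseeded, so repeated runs may evaluate different question subsets)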
    if len(theme_questions) > max_questions:
        eval_questions = random.sample(theme_questions, max_questions)
    else:
        eval_questions = theme_questions

    correct = 0
    total = len(eval_questions)
    responses = []
    versions_seen = set()
    
    for i, question in enumerate(eval_questions):
        # Update progress if available
        if progress is not None:
            # Calculate progress: 10% start + 80% for questions across all themes
            question_progress = i / len(eval_questions)
            theme_progress = theme_index / total_themes
            theme_portion = 1 / total_themes
            current_progress = 0.1 + 0.8 * (theme_progress + question_progress * theme_portion)
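            # e.g. with total_themes=4, theme_index=1 and i=25 of 50 questions:
            # 0.1 + 0.8 * (1/4 + (25/50) * 1/4) = 0.1 + 0.8 * 0.375 = 0.4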
            progress(current_progress, f"Evaluating question {i+1}/{len(eval_questions)}")
            
        response = get_model_response(llm, question)
        is_correct = response == question['real_answer']
        if is_correct:
            correct += 1
            
        responses.append({
            'question': question['statement'],
            'model_answer': response,
            'correct_answer': question['real_answer'],
            'is_correct': is_correct,
            'version': question['version']
        })
        versions_seen.add(question['version'])
    
    return {
        'accuracy': correct / total if total > 0 else 0,
        'responses': responses,
        'versions_evaluated': list(versions_seen),
        'num_questions': total
    }

def run_exam(model_name: str, openai_api_key: str, openrouter_base_url: str, exam_theme: Optional[str] = None, progress: Optional[Any] = None) -> Dict:
    """
    Run the exam on all themes or a specific theme
    
    Args:
        model_name: The model ID to use in OpenRouter
        openai_api_key: OpenRouter API key
        openrouter_base_url: Base URL for OpenRouter API
        exam_theme: Optional specific theme to evaluate (if None, evaluates all themes)
        progress: Optional progress tracker function
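    Returns:
        Dict keyed by theme name (each with 'accuracy', 'responses',
        'versions_evaluated', 'num_questions'), plus an 'overall' entry
        with 'accuracy' and 'total_questions'.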
    """
    # Update progress if available
    if progress is not None:
        progress(0.05, "Initializing model...")
        
    # Explicit bearer-token header for OpenRouter (OpenAI-compatible auth scheme)
    headers = {
        "Authorization": f"Bearer {openai_api_key}"
    }
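    # Note: ChatOpenAI already sends the API key as a bearer token, so this explicit
    # Authorization header is most likely redundant (though harmless). The base URL is
    # expected to be OpenRouter's OpenAI-compatible endpoint, typically
    # https://openrouter.ai/api/v1.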
    
    llm = ChatOpenAI(
        openai_api_key=openai_api_key,
        openai_api_base=openrouter_base_url,
        model_name=model_name,
        default_headers=headers
    )
    
    results = {}
    total_correct = 0
    total_questions = 0
    
    if progress is not None:
        progress(0.1, "Preparing exam questions...")
    
    # If a specific theme is provided, only evaluate that theme
    if exam_theme and exam_theme in EXAM_QUESTIONS:
        if progress is not None:
            progress(0.1, f"Evaluating theme: {exam_theme}...")
            
        theme_results = evaluate_theme(llm, EXAM_QUESTIONS[exam_theme], progress=progress)
        results[exam_theme] = theme_results
        
        # Add to totals for overall score calculation
        correct_count = sum(1 for r in theme_results['responses'] if r['is_correct'])
        total_correct += correct_count
        total_questions += theme_results['num_questions']
    else:
        # Get all themes with questions
        valid_themes = [(theme, questions) for theme, questions in EXAM_QUESTIONS.items() if questions]
        total_theme_count = len(valid_themes)
        
        # Evaluate each theme
        for i, (theme, questions) in enumerate(valid_themes):
            if progress is not None:
                progress(0.1 + (0.8 * i/total_theme_count), f"Evaluating theme: {theme}...")
                
            theme_results = evaluate_theme(
                llm, 
                questions, 
                progress=progress, 
                theme_index=i, 
                total_themes=total_theme_count
            )
            results[theme] = theme_results
            
            # Update totals for overall score
            correct_count = sum(1 for r in theme_results['responses'] if r['is_correct'])
            total_correct += correct_count
            total_questions += theme_results['num_questions']
    
    # Calculate overall score
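    # (question-weighted across all themes, not a mean of per-theme accuracies)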
    results['overall'] = {
        'accuracy': total_correct / total_questions if total_questions > 0 else 0,
        'total_questions': total_questions,
    }
    
    if progress is not None:
        progress(0.9, "Finalizing evaluation...")
    
    return results

if __name__ == "__main__":
    # Setup argparse to receive parameters from the command line
    parser = argparse.ArgumentParser(description="Run an exam against a model via OpenRouter using ChatOpenAI")
    parser.add_argument('--openai_api_key', required=True, help="OpenRouter API key")
    parser.add_argument('--openrouter_base_url', required=True, help="Base URL of OpenRouter")
    parser.add_argument('--model', required=True, help="OpenRouter model ID to use (e.g., 'openai/gpt-3.5-turbo')")
    parser.add_argument('--exam_theme', required=False, help="Specific theme to evaluate (optional)")

    args = parser.parse_args()

    # Run the exam
    results = run_exam(
        model_name=args.model,
        openai_api_key=args.openai_api_key,
        openrouter_base_url=args.openrouter_base_url,
        exam_theme=args.exam_theme
    )

    # Print the results
    print("Exam Results:")
    for theme, theme_results in results.items():
        if theme != 'overall':
            print(f"Theme: {theme}, Accuracy: {theme_results['accuracy']:.2f}")
    print(f"Overall Accuracy: {results['overall']['accuracy']:.2f}")
    print(f"Total Questions Evaluated: {results['overall']['total_questions']}")