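"""
Run a multiple-choice exam against a language model served through OpenRouter.

Each theme in EXAM_QUESTIONS is evaluated by asking the model one question at a
time and comparing its single-letter answer (A-D) with the expected answer;
per-theme and overall accuracy are then reported.

Because EXAM_QUESTIONS is pulled in through a relative import, this file must be
run as a module from its package; the package path below is illustrative only:

    python -m <package>.<subpackage>.run_exam \\
        --openai_api_key <OPENROUTER_API_KEY> \\
        --openrouter_base_url https://openrouter.ai/api/v1 \\
        --model openai/gpt-3.5-turbo
"""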
import argparse
import re
import random
from typing import Dict, List, Optional, Any
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from ..evaluation.questions import EXAM_QUESTIONS

def get_model_response(llm, question):
    """Get response from the model for a single question"""
    prompt = f"""Question: {question['statement']}
Options:
A) {question['options']['A']}
B) {question['options']['B']}
C) {question['options']['C']}
D) {question['options']['D']}

Respond with only the letter of the correct option (A, B, C, or D).
Answer:"""
    
    message = HumanMessage(content=prompt)
    response = llm.invoke([message])
    generated_response = response.content.strip().upper()
    
    # Clean the response to get just the letter
    match = re.search(r'[ABCD]', generated_response)
    if match:
        return match.group(0)
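    # Fallback sentinel: it can never equal 'A'-'D', so it is scored as incorrect downstream.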
    return "No valid response"

def evaluate_theme(llm, theme_questions: List[Dict], max_questions: int = 50, progress=None, theme_index=0, total_themes=1) -> Dict:
    """
    Evaluate a model on a specific theme
    Args:
        llm: The language model to use
        theme_questions: List of questions for this theme
        max_questions: Maximum number of questions to evaluate per theme
        progress: Progress tracker object
        theme_index: Current theme index (for progress calculation)
        total_themes: Total number of themes (for progress calculation)
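    Returns:
        Dict with 'accuracy', 'responses', 'versions_evaluated' and 'num_questions'.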
    """
    # Randomly sample questions if we have more than max_questions
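    # (sampling is unseeded, so repeated runs may evaluate different question subsets)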
    if len(theme_questions) > max_questions:
        eval_questions = random.sample(theme_questions, max_questions)
    else:
        eval_questions = theme_questions

    correct = 0
    total = len(eval_questions)
    responses = []
    versions_seen = set()
    
    for i, question in enumerate(eval_questions):
        # Update progress if available
        if progress is not None:
            # Calculate progress: 10% start + 80% for questions across all themes
            question_progress = i / len(eval_questions)
            theme_progress = theme_index / total_themes
            theme_portion = 1 / total_themes
            current_progress = 0.1 + 0.8 * (theme_progress + question_progress * theme_portion)
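            # e.g. with total_themes=4, theme_index=1 and i=25 of 50 questions:
            # 0.1 + 0.8 * (1/4 + (25/50) * 1/4) = 0.1 + 0.8 * 0.375 = 0.4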
            progress(current_progress, f"Evaluating question {i+1}/{len(eval_questions)}")
            
        response = get_model_response(llm, question)
        is_correct = response == question['real_answer']
        if is_correct:
            correct += 1
            
        responses.append({
            'question': question['statement'],
            'model_answer': response,
            'correct_answer': question['real_answer'],
            'is_correct': is_correct,
            'version': question['version']
        })
        versions_seen.add(question['version'])
    
    return {
        'accuracy': correct / total if total > 0 else 0,
        'responses': responses,
        'versions_evaluated': list(versions_seen),
        'num_questions': total
    }

def run_exam(model_name: str, openai_api_key: str, openrouter_base_url: str, exam_theme: Optional[str] = None, progress: Optional[Any] = None) -> Dict:
    """
    Run the exam on all themes or a specific theme
    
    Args:
        model_name: The model ID to use in OpenRouter
        openai_api_key: OpenRouter API key
        openrouter_base_url: Base URL for OpenRouter API
        exam_theme: Optional specific theme to evaluate (if None, evaluates all themes)
        progress: Optional progress tracker function
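    Returns:
        Dict keyed by theme name (each with 'accuracy', 'responses',
        'versions_evaluated', 'num_questions'), plus an 'overall' entry
        with 'accuracy' and 'total_questions'.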
    """
    # Update progress if available
    if progress is not None:
        progress(0.05, "Initializing model...")
        
    # Explicit bearer-token header for OpenRouter (OpenAI-compatible auth scheme)
    headers = {
        "Authorization": f"Bearer {openai_api_key}"
    }
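    # Note: ChatOpenAI already sends the API key as a bearer token, so this explicit
    # Authorization header is most likely redundant (though harmless). The base URL is
    # expected to be OpenRouter's OpenAI-compatible endpoint, typically
    # https://openrouter.ai/api/v1.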
    
    llm = ChatOpenAI(
        openai_api_key=openai_api_key,
        openai_api_base=openrouter_base_url,
        model_name=model_name,
        default_headers=headers
    )
    
    results = {}
    total_correct = 0
    total_questions = 0
    
    if progress is not None:
        progress(0.1, "Preparing exam questions...")
    
    # If a specific theme is provided, only evaluate that theme
    if exam_theme and exam_theme in EXAM_QUESTIONS:
        if progress is not None:
            progress(0.1, f"Evaluating theme: {exam_theme}...")
            
        theme_results = evaluate_theme(llm, EXAM_QUESTIONS[exam_theme], progress=progress)
        results[exam_theme] = theme_results
        
        # Add to totals for overall score calculation
        correct_count = sum(1 for r in theme_results['responses'] if r['is_correct'])
        total_correct += correct_count
        total_questions += theme_results['num_questions']
    else:
        # Get all themes with questions
        valid_themes = [(theme, questions) for theme, questions in EXAM_QUESTIONS.items() if questions]
        total_theme_count = len(valid_themes)
        
        # Evaluate each theme
        for i, (theme, questions) in enumerate(valid_themes):
            if progress is not None:
                progress(0.1 + (0.8 * i/total_theme_count), f"Evaluating theme: {theme}...")
                
            theme_results = evaluate_theme(
                llm, 
                questions, 
                progress=progress, 
                theme_index=i, 
                total_themes=total_theme_count
            )
            results[theme] = theme_results
            
            # Update totals for overall score
            correct_count = sum(1 for r in theme_results['responses'] if r['is_correct'])
            total_correct += correct_count
            total_questions += theme_results['num_questions']
    
    # Calculate overall score
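    # (question-weighted across all themes, not a mean of per-theme accuracies)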
    results['overall'] = {
        'accuracy': total_correct / total_questions if total_questions > 0 else 0,
        'total_questions': total_questions,
    }
    
    if progress is not None:
        progress(0.9, "Finalizing evaluation...")
    
    return results

if __name__ == "__main__":
    # Setup argparse to receive parameters from the command line
    parser = argparse.ArgumentParser(description="Run an exam against a model via OpenRouter using ChatOpenAI")
    parser.add_argument('--openai_api_key', required=True, help="OpenRouter API key")
    parser.add_argument('--openrouter_base_url', required=True, help="Base URL of OpenRouter")
    parser.add_argument('--model', required=True, help="OpenRouter model ID to use (e.g., 'openai/gpt-3.5-turbo')")
    parser.add_argument('--exam_theme', required=False, help="Specific theme to evaluate (optional)")

    args = parser.parse_args()

    # Run the exam
    results = run_exam(
        model_name=args.model,
        openai_api_key=args.openai_api_key,
        openrouter_base_url=args.openrouter_base_url,
        exam_theme=args.exam_theme
    )

    # Print the results
    print("Exam Results:")
    for theme, theme_results in results.items():
        if theme != 'overall':
            print(f"Theme: {theme}, Accuracy: {theme_results['accuracy']:.2f}")
    print(f"Overall Accuracy: {results['overall']['accuracy']:.2f}")
    print(f"Total Questions Evaluated: {results['overall']['total_questions']}")