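"""Run a multiple-choice exam against a chat model accessed through OpenRouter.

Questions are grouped by theme in EXAM_QUESTIONS; results include per-theme and
overall accuracy. The module can be imported (run_exam) or run from the command
line (see the example in the __main__ block at the bottom of the file).
"""
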
import argparse
import re
import random
from typing import Dict, List, Optional, Any
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage
from ..evaluation.questions import EXAM_QUESTIONS


def get_model_response(llm, question):
    """Ask the model a single multiple-choice question and return the chosen letter (A-D)."""
prompt = f"""Question: {question['statement']}
Options:
A) {question['options']['A']}
B) {question['options']['B']}
C) {question['options']['C']}
D) {question['options']['D']}
Respond with only the letter of the correct option (A, B, C, or D).
Answer:"""
message = HumanMessage(content=prompt)
response = llm.invoke([message])
generated_response = response.content.strip().upper()
# Clean the response to get just the letter
match = re.search(r'[ABCD]', generated_response)
if match:
return match.group(0)
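    # Fall back to a sentinel that never matches a real answer, so unparsable replies are scored as incorrect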
return "No valid response"


def evaluate_theme(llm, theme_questions: List[Dict], max_questions: int = 50, progress=None, theme_index=0, total_themes=1) -> Dict:
"""
Evaluate a model on a specific theme
Args:
llm: The language model to use
theme_questions: List of questions for this theme
max_questions: Maximum number of questions to evaluate per theme
progress: Progress tracker object
theme_index: Current theme index (for progress calculation)
        total_themes: Total number of themes (for progress calculation)

    Returns:
        Dict with 'accuracy', per-question 'responses', 'versions_evaluated', and 'num_questions'.
    """
# Randomly sample questions if we have more than max_questions
if len(theme_questions) > max_questions:
eval_questions = random.sample(theme_questions, max_questions)
else:
eval_questions = theme_questions
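    # Note: random.sample draws without replacement and is non-deterministic unless the caller seeds random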
correct = 0
total = len(eval_questions)
responses = []
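    # Track which exam versions appear in the evaluated sample (returned as 'versions_evaluated')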
versions_seen = set()
for i, question in enumerate(eval_questions):
# Update progress if available
if progress is not None:
# Calculate progress: 10% start + 80% for questions across all themes
question_progress = i / len(eval_questions)
theme_progress = theme_index / total_themes
theme_portion = 1 / total_themes
current_progress = 0.1 + 0.8 * (theme_progress + question_progress * theme_portion)
            progress(current_progress, f"Evaluating question {i+1}/{len(eval_questions)}")
response = get_model_response(llm, question)
is_correct = response == question['real_answer']
if is_correct:
correct += 1
responses.append({
'question': question['statement'],
'model_answer': response,
'correct_answer': question['real_answer'],
'is_correct': is_correct,
'version': question['version']
})
versions_seen.add(question['version'])
return {
'accuracy': correct / total if total > 0 else 0,
'responses': responses,
'versions_evaluated': list(versions_seen),
'num_questions': total
}


def run_exam(model_name: str, openai_api_key: str, openrouter_base_url: str, exam_theme: Optional[str] = None, progress: Optional[Any] = None) -> Dict:
"""
Run the exam on all themes or a specific theme
Args:
model_name: The model ID to use in OpenRouter
openai_api_key: OpenRouter API key
openrouter_base_url: Base URL for OpenRouter API
exam_theme: Optional specific theme to evaluate (if None, evaluates all themes)
        progress: Optional progress tracker function

    Returns:
        Dict mapping each evaluated theme to its results, plus an 'overall' entry
        with question-weighted accuracy and the total question count.
    """
# Update progress if available
if progress is not None:
        progress(0.05, "Initializing model...")
    # Pass the OpenRouter key as an explicit bearer-token header (it is also supplied as the client API key below)
headers = {
"Authorization": f"Bearer {openai_api_key}"
}
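    # ChatOpenAI from langchain_openai works against any OpenAI-compatible endpoint,
    # so pointing it at the OpenRouter base URL with an OpenRouter model name is enough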
llm = ChatOpenAI(
openai_api_key=openai_api_key,
openai_api_base=openrouter_base_url,
model_name=model_name,
default_headers=headers
)
results = {}
total_correct = 0
total_questions = 0
if progress is not None:
progress(0.1, "Preparando preguntas del examen...")
    # If a specific, known theme was requested, evaluate only that theme; an unknown or empty theme falls through to evaluating every theme
if exam_theme and exam_theme in EXAM_QUESTIONS:
if progress is not None:
progress(0.1, f"Evaluando tema: {exam_theme}...")
theme_results = evaluate_theme(llm, EXAM_QUESTIONS[exam_theme], progress=progress)
results[exam_theme] = theme_results
# Add to totals for overall score calculation
correct_count = sum(1 for r in theme_results['responses'] if r['is_correct'])
total_correct += correct_count
total_questions += theme_results['num_questions']
else:
# Get all themes with questions
valid_themes = [(theme, questions) for theme, questions in EXAM_QUESTIONS.items() if questions]
total_theme_count = len(valid_themes)
# Evaluate each theme
for i, (theme, questions) in enumerate(valid_themes):
if progress is not None:
                progress(0.1 + (0.8 * i/total_theme_count), f"Evaluating theme: {theme}...")
theme_results = evaluate_theme(
llm,
questions,
progress=progress,
theme_index=i,
total_themes=total_theme_count
)
results[theme] = theme_results
# Update totals for overall score
correct_count = sum(1 for r in theme_results['responses'] if r['is_correct'])
total_correct += correct_count
total_questions += theme_results['num_questions']
# Calculate overall score
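    # The overall figure is question-weighted: themes that contributed more sampled questions count proportionally more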
results['overall'] = {
'accuracy': total_correct / total_questions if total_questions > 0 else 0,
'total_questions': total_questions,
}
if progress is not None:
        progress(0.9, "Finalizing evaluation...")
return results


if __name__ == "__main__":
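    # Example invocation (module path and key are placeholders; the relative import of
    # EXAM_QUESTIONS means this file must be run as a module from the package root):
    #   python -m <package>.<this_module> --openai_api_key sk-or-... \
    #       --openrouter_base_url https://openrouter.ai/api/v1 --model gpt-3.5-turbo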
# Setup argparse to receive parameters from the command line
parser = argparse.ArgumentParser(description="Take an exam using OpenRouter and ChatOpenAI")
parser.add_argument('--openai_api_key', required=True, help="OpenRouter API key")
parser.add_argument('--openrouter_base_url', required=True, help="Base URL of OpenRouter")
parser.add_argument('--model', required=True, help="The model to use (e.g., 'gpt-3.5-turbo')")
parser.add_argument('--exam_theme', required=False, help="Specific theme to evaluate (optional)")
args = parser.parse_args()
# Run the exam
results = run_exam(
model_name=args.model,
openai_api_key=args.openai_api_key,
openrouter_base_url=args.openrouter_base_url,
exam_theme=args.exam_theme
)
# Print the results
print("Exam Results:")
for theme, theme_results in results.items():
if theme != 'overall':
print(f"Theme: {theme}, Accuracy: {theme_results['accuracy']:.2f}")
print(f"Overall Accuracy: {results['overall']['accuracy']:.2f}")
print(f"Total Questions Evaluated: {results['overall']['total_questions']}")