import argparse
import re
import random
from typing import Dict, List, Optional, Any
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage
from ..evaluation.questions import EXAM_QUESTIONS
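
# EXAM_QUESTIONS is expected to map a theme name (str) to a list of question
# dicts; the per-question keys used below are sketched after get_model_response.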


def get_model_response(llm: ChatOpenAI, question: Dict) -> str:
    """Get the response from the model for a single question."""
    prompt = f"""Question: {question['statement']}
Options:
A) {question['options']['A']}
B) {question['options']['B']}
C) {question['options']['C']}
D) {question['options']['D']}
Respond with only the letter of the correct option (A, B, C, or D).
Answer:"""
    message = HumanMessage(content=prompt)
    response = llm.invoke([message])
    generated_response = response.content.strip().upper()
    # Extract just the answer letter from the raw response
    match = re.search(r'[ABCD]', generated_response)
    if match:
        return match.group(0)
    return "No valid response"


def evaluate_theme(llm, theme_questions: List[Dict], max_questions: int = 50, progress=None, theme_index=0, total_themes=1) -> Dict:
    """
    Evaluate a model on a specific theme.

    Args:
        llm: The language model to use
        theme_questions: List of questions for this theme
        max_questions: Maximum number of questions to evaluate per theme
        progress: Progress tracker object
        theme_index: Current theme index (for progress calculation)
        total_themes: Total number of themes (for progress calculation)

    Returns:
        A dict with the theme accuracy, the per-question responses, the exam
        versions seen, and the number of questions evaluated.
    """
    # Randomly sample questions if there are more than max_questions
    if len(theme_questions) > max_questions:
        eval_questions = random.sample(theme_questions, max_questions)
    else:
        eval_questions = theme_questions

    correct = 0
    total = len(eval_questions)
    responses = []
    versions_seen = set()

    for i, question in enumerate(eval_questions):
        # Update progress if available
        if progress is not None:
            # Overall progress: 10% setup + 80% spread across all themes' questions
            question_progress = i / len(eval_questions)
            theme_progress = theme_index / total_themes
            theme_portion = 1 / total_themes
            current_progress = 0.1 + 0.8 * (theme_progress + question_progress * theme_portion)
            progress(current_progress, f"Evaluando pregunta {i+1}/{len(eval_questions)}")

        response = get_model_response(llm, question)
        is_correct = response == question['real_answer']
        if is_correct:
            correct += 1

        responses.append({
            'question': question['statement'],
            'model_answer': response,
            'correct_answer': question['real_answer'],
            'is_correct': is_correct,
            'version': question['version']
        })
        versions_seen.add(question['version'])

    return {
        'accuracy': correct / total if total > 0 else 0,
        'responses': responses,
        'versions_evaluated': list(versions_seen),
        'num_questions': total
    }
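
# Minimal usage sketch (hypothetical theme name; assumes an `llm` built as in
# run_exam below):
#     stats = evaluate_theme(llm, EXAM_QUESTIONS["some_theme"], max_questions=10)
#     print(stats["accuracy"], stats["num_questions"])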


def run_exam(model_name: str, openai_api_key: str, openrouter_base_url: str, exam_theme: Optional[str] = None, progress: Optional[Any] = None) -> Dict:
    """
    Run the exam on all themes or on a specific theme.

    Args:
        model_name: The model ID to use on OpenRouter
        openai_api_key: OpenRouter API key
        openrouter_base_url: Base URL for the OpenRouter API
        exam_theme: Optional specific theme to evaluate (if None, evaluates all themes)
        progress: Optional progress tracker function
    """
    # Update progress if available
    if progress is not None:
        progress(0.05, "Inicializando modelo...")

    # OpenRouter uses the standard OpenAI-style bearer token; ChatOpenAI already
    # sends it from openai_api_key, so default_headers simply repeats it explicitly.
    headers = {
        "Authorization": f"Bearer {openai_api_key}"
    }

    llm = ChatOpenAI(
        openai_api_key=openai_api_key,
        openai_api_base=openrouter_base_url,
        model_name=model_name,
        default_headers=headers
    )
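    # Note: OpenRouter also documents optional attribution headers (e.g.
    # "HTTP-Referer" and "X-Title"); if desired, they could be added to
    # default_headers above.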

    results = {}
    total_correct = 0
    total_questions = 0

    if progress is not None:
        progress(0.1, "Preparando preguntas del examen...")

    # If a specific (known) theme was requested, only evaluate that theme;
    # an unrecognised theme name falls through to evaluating all themes.
    if exam_theme and exam_theme in EXAM_QUESTIONS:
        if progress is not None:
            progress(0.1, f"Evaluando tema: {exam_theme}...")
        theme_results = evaluate_theme(llm, EXAM_QUESTIONS[exam_theme], progress=progress)
        results[exam_theme] = theme_results

        # Add to totals for the overall score calculation
        correct_count = sum(1 for r in theme_results['responses'] if r['is_correct'])
        total_correct += correct_count
        total_questions += theme_results['num_questions']
    else:
        # Get all themes that actually have questions
        valid_themes = [(theme, questions) for theme, questions in EXAM_QUESTIONS.items() if questions]
        total_theme_count = len(valid_themes)

        # Evaluate each theme
        for i, (theme, questions) in enumerate(valid_themes):
            if progress is not None:
                progress(0.1 + (0.8 * i / total_theme_count), f"Evaluando tema: {theme}...")
            theme_results = evaluate_theme(
                llm,
                questions,
                progress=progress,
                theme_index=i,
                total_themes=total_theme_count
            )
            results[theme] = theme_results

            # Update totals for the overall score
            correct_count = sum(1 for r in theme_results['responses'] if r['is_correct'])
            total_correct += correct_count
            total_questions += theme_results['num_questions']

    # Calculate the overall score
    results['overall'] = {
        'accuracy': total_correct / total_questions if total_questions > 0 else 0,
        'total_questions': total_questions,
    }

    if progress is not None:
        progress(0.9, "Finalizando evaluación...")

    return results


if __name__ == "__main__":
    # Set up argparse to receive parameters from the command line
    parser = argparse.ArgumentParser(description="Take an exam using OpenRouter and ChatOpenAI")
    parser.add_argument('--openai_api_key', required=True, help="OpenRouter API key")
    parser.add_argument('--openrouter_base_url', required=True, help="Base URL of the OpenRouter API")
    parser.add_argument('--model', required=True, help="The OpenRouter model ID to use (e.g., 'openai/gpt-3.5-turbo')")
    parser.add_argument('--exam_theme', required=False, help="Specific theme to evaluate (optional)")
    args = parser.parse_args()

    # Run the exam
    results = run_exam(
        model_name=args.model,
        openai_api_key=args.openai_api_key,
        openrouter_base_url=args.openrouter_base_url,
        exam_theme=args.exam_theme
    )

    # Print the results
    print("Exam Results:")
    for theme, theme_results in results.items():
        if theme != 'overall':
            print(f"Theme: {theme}, Accuracy: {theme_results['accuracy']:.2f}")
    print(f"Overall Accuracy: {results['overall']['accuracy']:.2f}")
    print(f"Total Questions Evaluated: {results['overall']['total_questions']}")