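"""Run a multiple-choice exam against a chat model served through OpenRouter.

For each theme in EXAM_QUESTIONS the model is asked to answer every question with a
single letter (A, B, C, or D); its answers are compared with the reference answers
and per-theme and overall accuracy are reported.
"""
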
import argparse
import re
import random
from typing import Dict, List, Optional, Any

from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage
from datasets import load_dataset
import pandas as pd

from ..evaluation.questions import EXAM_QUESTIONS


def get_model_response(llm, question):
    """Get response from the model for a single question"""
    prompt = f"""Question: {question['statement']}
Options:
A) {question['options']['A']}
B) {question['options']['B']}
C) {question['options']['C']}
D) {question['options']['D']}
Respond with only the letter of the correct option (A, B, C, or D).
Answer:"""
    message = HumanMessage(content=prompt)
    response = llm.invoke([message])
    generated_response = response.content.strip().upper()

    # Clean the response to get just the letter
    match = re.search(r'[ABCD]', generated_response)
    if match:
        return match.group(0)
    return "No valid response"


def evaluate_theme(llm, theme_questions: List[Dict], max_questions: int = 50, progress=None, theme_index=0, total_themes=1) -> Dict:
    """
    Evaluate a model on a specific theme

    Args:
        llm: The language model to use
        theme_questions: List of questions for this theme
        max_questions: Maximum number of questions to evaluate per theme
        progress: Progress tracker object
        theme_index: Current theme index (for progress calculation)
        total_themes: Total number of themes (for progress calculation)
    """
    # Randomly sample questions if we have more than max_questions
    if len(theme_questions) > max_questions:
        eval_questions = random.sample(theme_questions, max_questions)
    else:
        eval_questions = theme_questions

    correct = 0
    total = len(eval_questions)
    responses = []
    versions_seen = set()

    for i, question in enumerate(eval_questions):
        # Update progress if available
        if progress is not None:
            # Calculate progress: 10% start + 80% for questions across all themes
            question_progress = i / len(eval_questions)
            theme_progress = theme_index / total_themes
            theme_portion = 1 / total_themes
            current_progress = 0.1 + 0.8 * (theme_progress + question_progress * theme_portion)
            progress(current_progress, f"Evaluating question {i+1}/{len(eval_questions)}")

        response = get_model_response(llm, question)
        is_correct = response == question['real_answer']
        if is_correct:
            correct += 1

        responses.append({
            'question': question['statement'],
            'model_answer': response,
            'correct_answer': question['real_answer'],
            'is_correct': is_correct,
            'version': question['version']
        })
        versions_seen.add(question['version'])

    return {
        'accuracy': correct / total if total > 0 else 0,
        'responses': responses,
        'versions_evaluated': list(versions_seen),
        'num_questions': total
    }


def run_exam(model_name: str, openai_api_key: str, openrouter_base_url: str, exam_theme: Optional[str] = None, progress: Optional[Any] = None) -> Dict:
    """
    Run the exam on all themes or a specific theme

    Args:
        model_name: The model ID to use in OpenRouter
        openai_api_key: OpenRouter API key
        openrouter_base_url: Base URL for OpenRouter API
        exam_theme: Optional specific theme to evaluate (if None, evaluates all themes)
        progress: Optional progress tracker function
    """
    # Update progress if available
    if progress is not None:
        progress(0.05, "Initializing model...")

    # For OpenRouter, we need to set up the proper HTTP headers
    headers = {
        "Authorization": f"Bearer {openai_api_key}"
    }
    llm = ChatOpenAI(
        openai_api_key=openai_api_key,
        openai_api_base=openrouter_base_url,
        model_name=model_name,
        default_headers=headers
    )

    results = {}
    total_correct = 0
    total_questions = 0

    if progress is not None:
        progress(0.1, "Preparing exam questions...")

    # If a specific theme is provided, only evaluate that theme
    if exam_theme and exam_theme in EXAM_QUESTIONS:
        if progress is not None:
            progress(0.1, f"Evaluating theme: {exam_theme}...")
        theme_results = evaluate_theme(llm, EXAM_QUESTIONS[exam_theme], progress=progress)
        results[exam_theme] = theme_results

        # Add to totals for overall score calculation
        correct_count = sum(1 for r in theme_results['responses'] if r['is_correct'])
        total_correct += correct_count
        total_questions += theme_results['num_questions']
    else:
        # Get all themes with questions
        valid_themes = [(theme, questions) for theme, questions in EXAM_QUESTIONS.items() if questions]
        total_theme_count = len(valid_themes)

        # Evaluate each theme
        for i, (theme, questions) in enumerate(valid_themes):
            if progress is not None:
                progress(0.1 + (0.8 * i / total_theme_count), f"Evaluating theme: {theme}...")
            theme_results = evaluate_theme(
                llm,
                questions,
                progress=progress,
                theme_index=i,
                total_themes=total_theme_count
            )
            results[theme] = theme_results

            # Update totals for overall score
            correct_count = sum(1 for r in theme_results['responses'] if r['is_correct'])
            total_correct += correct_count
            total_questions += theme_results['num_questions']

    # Calculate overall score
    results['overall'] = {
        'accuracy': total_correct / total_questions if total_questions > 0 else 0,
        'total_questions': total_questions,
    }

    if progress is not None:
        progress(0.9, "Finalizing evaluation...")

    return results


if __name__ == "__main__":
    # Set up argparse to receive parameters from the command line
    parser = argparse.ArgumentParser(description="Take an exam using OpenRouter and ChatOpenAI")
    parser.add_argument('--openai_api_key', required=True, help="OpenRouter API key")
    parser.add_argument('--openrouter_base_url', required=True, help="Base URL of OpenRouter")
    parser.add_argument('--model', required=True, help="The model to use (e.g., 'gpt-3.5-turbo')")
    parser.add_argument('--exam_theme', required=False, help="Specific theme to evaluate (optional)")
    args = parser.parse_args()

    # Run the exam
    results = run_exam(
        model_name=args.model,
        openai_api_key=args.openai_api_key,
        openrouter_base_url=args.openrouter_base_url,
        exam_theme=args.exam_theme
    )

    # Print the results
    print("Exam Results:")
    for theme, theme_results in results.items():
        if theme != 'overall':
            print(f"Theme: {theme}, Accuracy: {theme_results['accuracy']:.2f}")
    print(f"Overall Accuracy: {results['overall']['accuracy']:.2f}")
    print(f"Total Questions Evaluated: {results['overall']['total_questions']}")