import argparse
import re
import random
from typing import Dict, List, Optional, Any
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage
from ..evaluation.questions import EXAM_QUESTIONS
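
# EXAM_QUESTIONS is expected to map a theme name (str) to a list of question
# dicts; the per-question keys used below are sketched after get_model_response.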


def get_model_response(llm: ChatOpenAI, question: Dict) -> str:
    """Get the response from the model for a single question."""
    prompt = f"""Question: {question['statement']}
Options:
A) {question['options']['A']}
B) {question['options']['B']}
C) {question['options']['C']}
D) {question['options']['D']}
Respond with only the letter of the correct option (A, B, C, or D).
Answer:"""
    message = HumanMessage(content=prompt)
    response = llm.invoke([message])
    generated_response = response.content.strip().upper()
    # Extract just the answer letter from the raw response
    match = re.search(r'[ABCD]', generated_response)
    if match:
        return match.group(0)
    return "No valid response"


def evaluate_theme(llm, theme_questions: List[Dict], max_questions: int = 50, progress=None, theme_index=0, total_themes=1) -> Dict:
    """
    Evaluate a model on a specific theme.

    Args:
        llm: The language model to use
        theme_questions: List of questions for this theme
        max_questions: Maximum number of questions to evaluate per theme
        progress: Progress tracker object
        theme_index: Current theme index (for progress calculation)
        total_themes: Total number of themes (for progress calculation)

    Returns:
        A dict with the theme accuracy, the per-question responses, the exam
        versions seen, and the number of questions evaluated.
    """
    # Randomly sample questions if there are more than max_questions
    if len(theme_questions) > max_questions:
        eval_questions = random.sample(theme_questions, max_questions)
    else:
        eval_questions = theme_questions

    correct = 0
    total = len(eval_questions)
    responses = []
    versions_seen = set()

    for i, question in enumerate(eval_questions):
        # Update progress if available
        if progress is not None:
            # Overall progress: 10% setup + 80% spread across all themes' questions
            question_progress = i / len(eval_questions)
            theme_progress = theme_index / total_themes
            theme_portion = 1 / total_themes
            current_progress = 0.1 + 0.8 * (theme_progress + question_progress * theme_portion)
            progress(current_progress, f"Evaluando pregunta {i+1}/{len(eval_questions)}")

        response = get_model_response(llm, question)
        is_correct = response == question['real_answer']
        if is_correct:
            correct += 1

        responses.append({
            'question': question['statement'],
            'model_answer': response,
            'correct_answer': question['real_answer'],
            'is_correct': is_correct,
            'version': question['version']
        })
        versions_seen.add(question['version'])

    return {
        'accuracy': correct / total if total > 0 else 0,
        'responses': responses,
        'versions_evaluated': list(versions_seen),
        'num_questions': total
    }
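
# Minimal usage sketch (hypothetical theme name; assumes an `llm` built as in
# run_exam below):
#     stats = evaluate_theme(llm, EXAM_QUESTIONS["some_theme"], max_questions=10)
#     print(stats["accuracy"], stats["num_questions"])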


def run_exam(model_name: str, openai_api_key: str, openrouter_base_url: str, exam_theme: Optional[str] = None, progress: Optional[Any] = None) -> Dict:
    """
    Run the exam on all themes or on a specific theme.

    Args:
        model_name: The model ID to use on OpenRouter
        openai_api_key: OpenRouter API key
        openrouter_base_url: Base URL for the OpenRouter API
        exam_theme: Optional specific theme to evaluate (if None, evaluates all themes)
        progress: Optional progress tracker function
    """
    # Update progress if available
    if progress is not None:
        progress(0.05, "Inicializando modelo...")

    # OpenRouter uses the standard OpenAI-style bearer token; ChatOpenAI already
    # sends it from openai_api_key, so default_headers simply repeats it explicitly.
    headers = {
        "Authorization": f"Bearer {openai_api_key}"
    }

    llm = ChatOpenAI(
        openai_api_key=openai_api_key,
        openai_api_base=openrouter_base_url,
        model_name=model_name,
        default_headers=headers
    )
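    # Note: OpenRouter also documents optional attribution headers (e.g.
    # "HTTP-Referer" and "X-Title"); if desired, they could be added to
    # default_headers above.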

    results = {}
    total_correct = 0
    total_questions = 0

    if progress is not None:
        progress(0.1, "Preparando preguntas del examen...")

    # If a specific (known) theme was requested, only evaluate that theme;
    # an unrecognised theme name falls through to evaluating all themes.
    if exam_theme and exam_theme in EXAM_QUESTIONS:
        if progress is not None:
            progress(0.1, f"Evaluando tema: {exam_theme}...")
        theme_results = evaluate_theme(llm, EXAM_QUESTIONS[exam_theme], progress=progress)
        results[exam_theme] = theme_results

        # Add to totals for the overall score calculation
        correct_count = sum(1 for r in theme_results['responses'] if r['is_correct'])
        total_correct += correct_count
        total_questions += theme_results['num_questions']
    else:
        # Get all themes that actually have questions
        valid_themes = [(theme, questions) for theme, questions in EXAM_QUESTIONS.items() if questions]
        total_theme_count = len(valid_themes)

        # Evaluate each theme
        for i, (theme, questions) in enumerate(valid_themes):
            if progress is not None:
                progress(0.1 + (0.8 * i / total_theme_count), f"Evaluando tema: {theme}...")
            theme_results = evaluate_theme(
                llm,
                questions,
                progress=progress,
                theme_index=i,
                total_themes=total_theme_count
            )
            results[theme] = theme_results

            # Update totals for the overall score
            correct_count = sum(1 for r in theme_results['responses'] if r['is_correct'])
            total_correct += correct_count
            total_questions += theme_results['num_questions']

    # Calculate the overall score
    results['overall'] = {
        'accuracy': total_correct / total_questions if total_questions > 0 else 0,
        'total_questions': total_questions,
    }

    if progress is not None:
        progress(0.9, "Finalizando evaluación...")

    return results


if __name__ == "__main__":
    # Set up argparse to receive parameters from the command line
    parser = argparse.ArgumentParser(description="Take an exam using OpenRouter and ChatOpenAI")
    parser.add_argument('--openai_api_key', required=True, help="OpenRouter API key")
    parser.add_argument('--openrouter_base_url', required=True, help="Base URL of the OpenRouter API")
    parser.add_argument('--model', required=True, help="The OpenRouter model ID to use (e.g., 'openai/gpt-3.5-turbo')")
    parser.add_argument('--exam_theme', required=False, help="Specific theme to evaluate (optional)")
    args = parser.parse_args()

    # Run the exam
    results = run_exam(
        model_name=args.model,
        openai_api_key=args.openai_api_key,
        openrouter_base_url=args.openrouter_base_url,
        exam_theme=args.exam_theme
    )

    # Print the results
    print("Exam Results:")
    for theme, theme_results in results.items():
        if theme != 'overall':
            print(f"Theme: {theme}, Accuracy: {theme_results['accuracy']:.2f}")
    print(f"Overall Accuracy: {results['overall']['accuracy']:.2f}")
    print(f"Total Questions Evaluated: {results['overall']['total_questions']}")