"""Automated urology exam runner: sends every question from the
SASLeaderboard/sas_opposition_exam_data dataset to an OpenRouter model and pushes
the graded results to the SASLeaderboard/results dataset on the Hugging Face Hub."""

import gradio as gr
import requests
import random
from datasets import load_dataset, Dataset
from typing import Dict, List
import re
import datetime
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv()


def sanitize_theme_name(theme: str) -> str:
    """Normalize a theme name into a lowercase, underscore-separated key."""
    sanitized = re.sub(r'[^\w\s-]', '', theme)
    sanitized = re.sub(r'[-\s]+', '_', sanitized)
    return sanitized.lower().strip('_')
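
# Example of the expected mapping (note: Python's \w is Unicode-aware, so accented
# characters are preserved rather than stripped):
#   sanitize_theme_name("FEA Urología") -> "fea_urología"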


def load_questions_from_dataset() -> Dict[str, List[Dict]]:
    """Load the urology questions from the Hub dataset, grouped by sanitized theme."""
    dataset = load_dataset("SASLeaderboard/sas_opposition_exam_data")
    dataset = dataset['train'].filter(lambda x: x['theme'] == 'FEA Urología')

    questions_by_theme = {}
    skipped = 0
    loaded = 0

    for item in dataset:
        theme = item['theme']
        answers = item.get('answers', [])
        correct_answer = item.get('correct_answer', '')

        # Skip malformed items; pad to four options when only three are present.
        if not answers or not correct_answer or len(answers) < 3:
            skipped += 1
            continue
        while len(answers) < 4:
            answers.append(answers[-1])

        sanitized_theme = sanitize_theme_name(theme)
        if sanitized_theme not in questions_by_theme:
            questions_by_theme[sanitized_theme] = []

        try:
            question = {
                "statement": item['statement'],
                "options": {
                    "A": answers[0],
                    "B": answers[1],
                    "C": answers[2],
                    "D": answers[3]
                },
                "real_answer": correct_answer,
                "theme": theme,
                "sanitized_theme": sanitized_theme,
                "version": item.get('version', 'Default')
            }
            questions_by_theme[sanitized_theme].append(question)
            loaded += 1
        except Exception:
            skipped += 1
            continue

    print(f"Loaded {loaded} questions, skipped {skipped} invalid questions")
    return questions_by_theme
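
# Hypothetical quick check of the loader's output shape:
#   questions = load_questions_from_dataset()
#   print(sum(len(qs) for qs in questions.values()), "questions in", len(questions), "theme(s)")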


def ask_ai_model(api_key: str, model: str, question: Dict) -> tuple:
    """Send one exam question to an OpenRouter model; return (raw response, parsed answer)."""
    prompt = f"""You are a medical expert taking a urology examination. Please analyze this question carefully and provide your answer.
Question: {question['statement']}
Options:
A) {question['options']['A']}
B) {question['options']['B']}
C) {question['options']['C']}
D) {question['options']['D']}
Please provide your answer in this exact format:
Answer: [A/B/C/D]
Then provide your reasoning."""

    try:
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        data = {
            "model": model,
            "messages": [
                {"role": "user", "content": prompt}
            ]
        }
        # Generous timeout so a slow generation fails cleanly instead of hanging the exam.
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers, json=data, timeout=120
        )
        if response.status_code == 200:
            result = response.json()
            ai_response = result["choices"][0]["message"]["content"]
            ai_answer = extract_answer_from_response(ai_response)
            return ai_response, ai_answer
        else:
            error_msg = f"API Error {response.status_code}: {response.text}"
            return error_msg, "API_ERROR"
    except Exception as e:
        error_msg = f"Request Error: {str(e)}"
        return error_msg, "REQUEST_ERROR"
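
# The success branch assumes an OpenAI-style chat completions payload from OpenRouter,
# roughly {"choices": [{"message": {"content": "Answer: B\n..."}}]}; non-200 responses
# and network failures come back with the API_ERROR / REQUEST_ERROR sentinels instead.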


def extract_answer_from_response(ai_response: str) -> str:
    """Parse the model's reply and return 'A'-'D', or a sentinel string on failure."""
    if not ai_response:
        return "EMPTY_RESPONSE"

    lines = ai_response.split('\n')

    # 1) Preferred format: a line starting with "Answer: X".
    for line in lines:
        line_clean = line.strip().lower()
        if line_clean.startswith('answer:'):
            answer_part = line.split(':')[1].strip().upper()
            for char in answer_part:
                if char in ['A', 'B', 'C', 'D']:
                    return char

    # 2) Phrases like "the answer is B".
    for line in lines:
        line_clean = line.strip().lower()
        if 'answer is' in line_clean:
            for char in ['A', 'B', 'C', 'D']:
                if char.lower() in line_clean.split('answer is')[1][:5]:
                    return char

    # 3) Option markers such as "B)", "(B)" or "Option B" near the top of the reply.
    for line in lines[:5]:
        line_upper = line.upper()
        for char in ['A', 'B', 'C', 'D']:
            patterns = [f"{char})", f"{char}.", f"OPTION {char}", f"({char})", f"CHOICE {char}"]
            for pattern in patterns:
                if pattern in line_upper:
                    return char

    # 4) Last-resort fallbacks: any bare A-D in the first lines, then anywhere at all.
    for line in lines[:3]:
        for char in ['A', 'B', 'C', 'D']:
            if char in line.upper():
                return char
    for char in ['A', 'B', 'C', 'D']:
        if char in ai_response.upper():
            return char

    return "NO_ANSWER_FOUND"


def save_results_to_dataset(results: List[Dict], hf_token: str = None) -> str:
    """Append exam results to the SASLeaderboard/results dataset on the Hub."""
    if not results:
        return "No results to save"

    if not hf_token:
        hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        return "❌ HuggingFace token not found. Please provide it in the interface or set the HF_TOKEN environment variable"

    try:
        # Load any existing results so new rows are appended rather than replacing them.
        try:
            existing_dataset = load_dataset("SASLeaderboard/results", token=hf_token)
            existing_data = existing_dataset['train'].to_pandas()
        except Exception:
            existing_data = None

        new_data = pd.DataFrame(results)
        if existing_data is not None:
            combined_data = pd.concat([existing_data, new_data], ignore_index=True)
        else:
            combined_data = new_data

        new_dataset = Dataset.from_pandas(combined_data)
        new_dataset.push_to_hub(
            "SASLeaderboard/results",
            token=hf_token,
            commit_message=f"Automated exam results for {results[0]['model']} - {len(results)} questions"
        )
        return f"✅ Successfully saved {len(results)} results to the SASLeaderboard/results dataset"
    except Exception as e:
        return f"❌ Error saving results: {str(e)}"


def run_automated_exam(api_key: str, model: str, hf_token: str = ""):
    """Run the full exam against one model, yielding progress messages for the UI."""
    if not api_key:
        yield "❌ Please provide OpenRouter API key"
        return
    if not model:
        yield "❌ Please provide model name"
        return

    yield "🔄 Loading questions from dataset..."

    try:
        all_questions_by_theme = load_questions_from_dataset()
        all_questions = []
        for theme_questions in all_questions_by_theme.values():
            all_questions.extend(theme_questions)

        total_questions = len(all_questions)
        yield f"✅ Loaded {total_questions} questions from dataset"
        yield f"🚀 Starting automated exam with ALL {total_questions} questions for model: {model}"

        session_id = f"{model}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
        results = []
        correct_count = 0

        for i, question in enumerate(all_questions):
            ai_response, ai_answer = ask_ai_model(api_key, model, question)

            if ai_answer in ["API_ERROR", "REQUEST_ERROR", "EMPTY_RESPONSE", "NO_ANSWER_FOUND"]:
                yield f"⚠️ Question {i+1}: Error getting answer - {ai_answer}. Response: {ai_response[:100]}..."

            # Errors count as incorrect; the raw response is still recorded for review.
            is_correct = ai_answer == question['real_answer']
            if is_correct:
                correct_count += 1

            result = {
                "session_id": session_id,
                "model": model,
                "question": question['statement'],
                "theme": question['theme'],
                "correct_answer": question['real_answer'],
                "ai_answer": ai_answer,
                "ai_response": ai_response,
                "is_correct": is_correct,
                "timestamp": datetime.datetime.now().isoformat(),
                "options_a": question['options']['A'],
                "options_b": question['options']['B'],
                "options_c": question['options']['C'],
                "options_d": question['options']['D']
            }
            results.append(result)

            current_accuracy = (correct_count / (i + 1)) * 100
            status_emoji = "✅" if is_correct else "❌"
            yield f"{status_emoji} Q{i+1}/{total_questions}: Accuracy: {correct_count}/{i+1} ({current_accuracy:.1f}%) | AI: {ai_answer} vs Correct: {question['real_answer']} | {question['statement'][:80]}..."

        yield "💾 Saving results to HuggingFace dataset..."
        save_result = save_results_to_dataset(results, hf_token)

        final_accuracy = (correct_count / len(results)) * 100 if results else 0.0
        yield f"""
## 🎯 Exam Complete!
**Final Results:**
- Model: {model}
- Total Questions: {len(results)}
- Correct Answers: {correct_count}
- Final Accuracy: {final_accuracy:.1f}%
- Session ID: {session_id}
**Save Status:** {save_result}
The automated exam has been completed successfully!
"""
    except Exception as e:
        yield f"❌ Error during automated exam: {str(e)}"


with gr.Blocks(title="Automated Urology Exam System") as demo:
    gr.Markdown("# Automated Urology Exam System")
    gr.Markdown("This system automatically runs a complete urology exam for an AI model using ALL available questions (~150) and saves the results to the dataset.")

    with gr.Row():
        with gr.Column():
            gr.Markdown("**Get your API key:** [OpenRouter Keys](https://openrouter.ai/settings/keys)")
            api_key_input = gr.Textbox(
                label="OpenRouter API Key",
                type="password",
                placeholder="Enter your OpenRouter API key"
            )
        with gr.Column():
            gr.Markdown("**Find models:** [OpenRouter Models](https://openrouter.ai/models)")
            model_input = gr.Textbox(
                label="Model Name",
                placeholder="e.g., anthropic/claude-3-sonnet",
                value="anthropic/claude-3-sonnet"
            )

    with gr.Row():
        start_exam_btn = gr.Button("Start Automated Exam", variant="primary", size="lg")

    with gr.Row():
        progress_output = gr.Textbox(
            label="Exam Progress - Don't close this window",
            placeholder="Exam progress will be displayed here...",
            lines=15,
            max_lines=20,
            interactive=False
        )

    start_exam_btn.click(
        run_automated_exam,
        inputs=[api_key_input, model_input],
        outputs=[progress_output]
    )

if __name__ == "__main__":
    demo.launch()