# app.py — Automated Urology Exam System (Hugging Face Space by RafaelJaime, commit 7a9b69c)
import gradio as gr
import requests
import random  # NOTE(review): appears unused in this file
from datasets import load_dataset, Dataset
from typing import Dict, List
import re
import datetime
import pandas as pd
import os
from dotenv import load_dotenv
# Load environment variables (e.g. HF_TOKEN) from a local .env file, if present.
load_dotenv()
def sanitize_theme_name(theme: str) -> str:
    """Normalize a theme label into a lowercase snake_case key.

    Removes every character that is not a word character, whitespace or a
    hyphen, collapses each run of hyphens/whitespace into a single
    underscore, then lowercases and trims leading/trailing underscores.
    """
    collapsed = re.sub(r'[-\s]+', '_', re.sub(r'[^\w\s-]', '', theme))
    return collapsed.lower().strip('_')
def load_questions_from_dataset(theme_filter: str = 'FEA Urología') -> Dict[str, List[Dict]]:
    """Download exam questions from the HF dataset and group them by theme.

    Args:
        theme_filter: Keep only rows whose 'theme' column equals this value;
            pass None to load every theme. Defaults to 'FEA Urología' for
            backward compatibility with the original hard-coded filter.

    Returns:
        Mapping of sanitized theme name -> list of question dicts with keys
        "statement", "options" (letters "A"-"D"), "real_answer", "theme",
        "sanitized_theme" and "version".
    """
    dataset = load_dataset("SASLeaderboard/sas_opposition_exam_data")
    dataset = dataset['train']
    if theme_filter is not None:
        dataset = dataset.filter(lambda x: x['theme'] == theme_filter)
    questions_by_theme = {}
    skipped = 0
    loaded = 0
    for item in dataset:
        theme = item['theme']
        answers = item.get('answers', [])
        correct_answer = item.get('correct_answer', '')
        # A usable question needs a correct answer and at least 3 options.
        if not answers or not correct_answer or len(answers) < 3:
            skipped += 1
            continue
        # Pad 3-option questions to 4 by repeating the last option, so the
        # A-D mapping below always has four entries.
        while len(answers) < 4:
            answers.append(answers[-1])
        sanitized_theme = sanitize_theme_name(theme)
        if sanitized_theme not in questions_by_theme:
            questions_by_theme[sanitized_theme] = []
        try:
            question = {
                "statement": item['statement'],
                "options": {
                    "A": answers[0],
                    "B": answers[1],
                    "C": answers[2],
                    "D": answers[3]
                },
                "real_answer": correct_answer,
                "theme": theme,
                "sanitized_theme": sanitized_theme,
                "version": item.get('version', 'Default')
            }
            questions_by_theme[sanitized_theme].append(question)
            loaded += 1
        except Exception:
            # Malformed rows (e.g. missing 'statement') are counted and skipped.
            skipped += 1
            continue
    print(f"Loaded {loaded} questions, skipped {skipped} invalid questions")
    return questions_by_theme
def ask_ai_model(api_key: str, model: str, question: Dict) -> tuple:
    """Send one exam question to an OpenRouter-hosted chat model.

    Args:
        api_key: OpenRouter API key, sent as a Bearer token.
        model: OpenRouter model identifier, e.g. "anthropic/claude-3-sonnet".
        question: Question dict with "statement" and an "options" mapping
            holding keys "A"-"D" (shape produced by load_questions_from_dataset).

    Returns:
        (raw_response_text, extracted_letter). On failure the second element
        is the sentinel "API_ERROR" (non-200 status) or "REQUEST_ERROR"
        (network failure / timeout), and the first holds the error message.
    """
    prompt = f"""You are a medical expert taking a urology examination. Please analyze this question carefully and provide your answer.
Question: {question['statement']}
Options:
A) {question['options']['A']}
B) {question['options']['B']}
C) {question['options']['C']}
D) {question['options']['D']}
Please provide your answer in this exact format:
Answer: [A/B/C/D]
Then provide your reasoning."""
    try:
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        data = {
            "model": model,
            "messages": [
                {"role": "user", "content": prompt}
            ]
        }
        # Bounded (connect, read) timeout so a stalled request cannot hang
        # the exam loop forever; a timeout surfaces as REQUEST_ERROR below.
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers, json=data, timeout=(10, 120)
        )
        if response.status_code == 200:
            result = response.json()
            ai_response = result["choices"][0]["message"]["content"]
            ai_answer = extract_answer_from_response(ai_response)
            return ai_response, ai_answer
        else:
            error_msg = f"API Error {response.status_code}: {response.text}"
            return error_msg, "API_ERROR"
    except Exception as e:
        error_msg = f"Request Error: {str(e)}"
        return error_msg, "REQUEST_ERROR"
def extract_answer_from_response(ai_response: str) -> str:
    """Best-effort extraction of the chosen option letter from a model reply.

    Applies a cascade of increasingly loose heuristics, returning the first
    match: (1) an explicit "Answer: X" line, (2) a "... answer is X" phrase,
    (3) option-style markers like "B)" in the first 5 lines, (4) any bare
    letter in the first 3 lines, (5) any letter anywhere in the reply.

    Returns:
        "A"/"B"/"C"/"D", or "EMPTY_RESPONSE" for falsy input, or
        "NO_ANSWER_FOUND" when no heuristic matches.
    """
    if not ai_response:
        return "EMPTY_RESPONSE"
    letters = ('A', 'B', 'C', 'D')
    lines = ai_response.split('\n')
    # Heuristic 1: a line of the exact requested form "Answer: X".
    for raw in lines:
        if raw.strip().lower().startswith('answer:'):
            tail = raw.split(':')[1].strip().upper()
            hit = next((ch for ch in tail if ch in letters), None)
            if hit:
                return hit
    # Heuristic 2: the phrase "answer is", checking only the 5 characters
    # that immediately follow it.
    for raw in lines:
        lowered = raw.strip().lower()
        if 'answer is' in lowered:
            window = lowered.split('answer is')[1][:5]
            for letter in letters:
                if letter.lower() in window:
                    return letter
    # Heuristic 3: option-style markers in the first five lines.
    for raw in lines[:5]:
        upper = raw.upper()
        for letter in letters:
            markers = (f"{letter})", f"{letter}.", f"OPTION {letter}",
                       f"({letter})", f"CHOICE {letter}")
            if any(marker in upper for marker in markers):
                return letter
    # Heuristic 4 (very loose): any bare letter in the first three lines.
    for raw in lines[:3]:
        upper = raw.upper()
        for letter in letters:
            if letter in upper:
                return letter
    # Heuristic 5 (loosest): any letter anywhere in the whole reply.
    whole = ai_response.upper()
    for letter in letters:
        if letter in whole:
            return letter
    return "NO_ANSWER_FOUND"
def save_results_to_dataset(results: List[Dict], hf_token: str = None) -> str:
    """Append exam results to the SASLeaderboard/results HF dataset.

    Args:
        results: One dict per answered question (rows built by
            run_automated_exam); must be non-empty to trigger a push.
        hf_token: HuggingFace write token; falls back to the HF_TOKEN
            environment variable when not supplied.

    Returns:
        A human-readable status string ("✅ ..." on success, "❌ ..." on
        failure, "No results to save" for empty input).
    """
    if not results:
        return "No results to save"
    if not hf_token:
        hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        return "❌ HuggingFace token not found. Please provide it in the interface or set HF_TOKEN environment variable"
    try:
        # Best effort: merge with any existing rows so the push appends rather
        # than overwrites. A missing/unreadable dataset just means start fresh.
        try:
            # `token=` replaces the deprecated `use_auth_token=` keyword
            # (removed in datasets 3.x) and matches push_to_hub below.
            existing_dataset = load_dataset("SASLeaderboard/results", token=hf_token)
            existing_data = existing_dataset['train'].to_pandas()
        except Exception:
            existing_data = None
        new_data = pd.DataFrame(results)
        if existing_data is not None:
            combined_data = pd.concat([existing_data, new_data], ignore_index=True)
        else:
            combined_data = new_data
        new_dataset = Dataset.from_pandas(combined_data)
        new_dataset.push_to_hub(
            "SASLeaderboard/results",
            token=hf_token,
            commit_message=f"Automated exam results for {results[0]['model']} - {len(results)} questions"
        )
        return f"✅ Successfully saved {len(results)} results to SASLeaderboard/results dataset"
    except Exception as e:
        return f"❌ Error saving results: {str(e)}"
def run_automated_exam(api_key: str, model: str, hf_token: str = ""):
    """Run the full exam against one model, yielding progress strings.

    Generator intended for streaming into a Gradio textbox: each yield is a
    human-readable status line. Every question's outcome is accumulated and
    pushed to the SASLeaderboard/results dataset at the end.

    Args:
        api_key: OpenRouter API key (required).
        model: OpenRouter model identifier (required).
        hf_token: Optional HuggingFace token; save_results_to_dataset falls
            back to the HF_TOKEN environment variable when empty.
    """
    if not api_key:
        yield "❌ Please provide OpenRouter API key"
        return
    if not model:
        yield "❌ Please provide model name"
        return
    yield "🔄 Loading questions from dataset..."
    try:
        all_questions_by_theme = load_questions_from_dataset()
        all_questions = []
        for theme_questions in all_questions_by_theme.values():
            all_questions.extend(theme_questions)
        total_questions = len(all_questions)
        # Guard: an empty question set would otherwise crash with a
        # ZeroDivisionError when computing the final accuracy below.
        if total_questions == 0:
            yield "❌ No questions available in the dataset"
            return
        yield f"✅ Loaded {total_questions} questions from dataset"
        yield f"🚀 Starting automated exam with ALL {total_questions} questions for model: {model}"
        # Session id ties every row of this run together in the results dataset.
        session_id = f"{model}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
        results = []
        correct_count = 0
        for i, question in enumerate(all_questions):
            ai_response, ai_answer = ask_ai_model(api_key, model, question)
            if ai_answer in ["API_ERROR", "REQUEST_ERROR", "EMPTY_RESPONSE", "NO_ANSWER_FOUND"]:
                yield f"⚠️ Question {i+1}: Error getting answer - {ai_answer}. Response: {ai_response[:100]}..."
            # Errored questions are still recorded (and count as incorrect).
            is_correct = ai_answer == question['real_answer']
            if is_correct:
                correct_count += 1
            result = {
                "session_id": session_id,
                "model": model,
                "question": question['statement'],
                "theme": question['theme'],
                "correct_answer": question['real_answer'],
                "ai_answer": ai_answer,
                "ai_response": ai_response,
                "is_correct": is_correct,
                "timestamp": datetime.datetime.now().isoformat(),
                "options_a": question['options']['A'],
                "options_b": question['options']['B'],
                "options_c": question['options']['C'],
                "options_d": question['options']['D']
            }
            results.append(result)
            current_accuracy = (correct_count / (i + 1)) * 100
            status_emoji = "✅" if is_correct else "❌"
            yield f"{status_emoji} Q{i+1}/{total_questions}: Accuracy: {correct_count}/{i+1} ({current_accuracy:.1f}%) | AI: {ai_answer} vs Correct: {question['real_answer']} | {question['statement'][:80]}..."
        yield f"💾 Saving results to HuggingFace dataset..."
        save_result = save_results_to_dataset(results, hf_token)
        final_accuracy = (correct_count / len(results)) * 100
        yield f"""
## 🎯 Exam Complete!
**Final Results:**
- Model: {model}
- Total Questions: {len(results)}
- Correct Answers: {correct_count}
- Final Accuracy: {final_accuracy:.1f}%
- Session ID: {session_id}
**Save Status:** {save_result}
The automated exam has been completed successfully!
"""
    except Exception as e:
        yield f"❌ Error during automated exam: {str(e)}"
# --- Gradio UI -------------------------------------------------------------
# Declarative layout: the button streams run_automated_exam's yielded
# progress strings into the read-only progress textbox.
with gr.Blocks(title="Automated Urology Exam System") as demo:
    gr.Markdown("# Automated Urology Exam System")
    gr.Markdown("This system automatically runs a complete urology exam for AI models using ALL available questions (~150) and saves results to the dataset.")
    with gr.Row():
        with gr.Column():
            gr.Markdown("**Get your API key:** [OpenRouter Keys](https://openrouter.ai/settings/keys)")
            api_key_input = gr.Textbox(
                label="OpenRouter API Key",
                type="password",
                placeholder="Enter your OpenRouter API key"
            )
        with gr.Column():
            gr.Markdown("**Find models:** [OpenRouter Models](https://openrouter.ai/models)")
            model_input = gr.Textbox(
                label="Model Name",
                placeholder="e.g., anthropic/claude-3-sonnet",
                value="anthropic/claude-3-sonnet"
            )
    with gr.Row():
        start_exam_btn = gr.Button("Start Automated Exam", variant="primary", size="lg")
    with gr.Row():
        progress_output = gr.Textbox(
            label="Exam Progress - Dont close this window",
            placeholder="Exam progress will be displayed here...",
            lines=15,
            max_lines=20,
            interactive=False
        )
    # NOTE(review): run_automated_exam's optional hf_token parameter is not
    # wired here, so saving results relies on the HF_TOKEN environment
    # variable — confirm this is intended.
    start_exam_btn.click(
        run_automated_exam,
        inputs=[api_key_input, model_input],
        outputs=[progress_output]
    )
# Launch the Gradio server when the file is run as a script.
if __name__ == "__main__":
    demo.launch()