Spaces:

SASLeaderboard
/

leaderboard_backend

Sleeping

File size: 10,746 Bytes

7a9b69c

import gradio as gr
import requests
import random
from datasets import load_dataset, Dataset
from typing import Dict, List
import re
import datetime
import pandas as pd
import os
from dotenv import load_dotenv

# Load settings from a local .env file (if present) so that values such as
# HF_TOKEN, read later through os.getenv, can be supplied without exporting them.
load_dotenv()

def sanitize_theme_name(theme: str) -> str:
    """Convert a theme label into a lowercase, underscore-separated key.

    Characters that are not word characters, whitespace or hyphens are
    dropped; each run of hyphens/whitespace collapses into one underscore;
    the result is lowercased and has leading/trailing underscores removed.
    """
    without_punctuation = re.sub(r'[^\w\s-]', '', theme)
    underscored = re.sub(r'[-\s]+', '_', without_punctuation)
    lowered = underscored.lower()
    return lowered.strip('_')

def load_questions_from_dataset() -> Dict[str, List[Dict]]:
    """Load the 'FEA Urología' exam questions, grouped by sanitized theme.

    Rows of the ``train`` split of ``SASLeaderboard/sas_opposition_exam_data``
    are kept only when they have a statement, a correct answer and at least
    three answer options. Rows with exactly three options are padded to four
    by repeating the last option, so every question exposes options A-D.

    Returns:
        A mapping from sanitized theme name to a list of question dicts with
        the keys ``statement``, ``options`` (A-D), ``real_answer``, ``theme``,
        ``sanitized_theme`` and ``version``.
    """
    dataset = load_dataset("SASLeaderboard/sas_opposition_exam_data")
    dataset = dataset['train'].filter(lambda x: x['theme'] == 'FEA Urología')

    questions_by_theme: Dict[str, List[Dict]] = {}
    skipped = 0
    loaded = 0

    for item in dataset:
        theme = item['theme']
        statement = item.get('statement')
        raw_answers = item.get('answers') or []
        correct_answer = item.get('correct_answer') or ''

        # A question is unusable without a statement, a reference answer,
        # or enough options to build a multiple-choice item.
        if not statement or not correct_answer or len(raw_answers) < 3:
            skipped += 1
            continue

        # Work on a copy so padding never mutates the dataset row itself.
        answers = list(raw_answers)
        while len(answers) < 4:
            answers.append(answers[-1])

        sanitized_theme = sanitize_theme_name(theme)
        question = {
            "statement": statement,
            "options": {
                "A": answers[0],
                "B": answers[1],
                "C": answers[2],
                "D": answers[3]
            },
            "real_answer": correct_answer,
            "theme": theme,
            "sanitized_theme": sanitized_theme,
            "version": item.get('version', 'Default')
        }

        # The theme bucket is created only once a valid question exists for it.
        questions_by_theme.setdefault(sanitized_theme, []).append(question)
        loaded += 1

    print(f"Loaded {loaded} questions, skipped {skipped} invalid questions")
    return questions_by_theme

def ask_ai_model(api_key: str, model: str, question: Dict, timeout: float = 120.0) -> tuple:
    """Send one exam question to a model through the OpenRouter chat API.

    Args:
        api_key: OpenRouter API key, sent as a Bearer token.
        model: Model identifier passed in the request payload.
        question: Question dict with ``statement`` and ``options`` (A-D).
        timeout: Seconds to wait for the HTTP request before giving up, so a
            single stalled connection cannot hang the whole exam.

    Returns:
        A ``(text, answer)`` tuple. On success ``text`` is the model's reply
        (an empty string when the API returns no content) and ``answer`` is
        the value produced by ``extract_answer_from_response``. On a non-200
        status ``answer`` is ``"API_ERROR"``; on any exception while sending
        or decoding the request it is ``"REQUEST_ERROR"``. In both error
        cases ``text`` holds the error description.
    """
    prompt = f"""You are a medical expert taking a urology examination. Please analyze this question carefully and provide your answer.

Question: {question['statement']}

Options:
A) {question['options']['A']}
B) {question['options']['B']}
C) {question['options']['C']}
D) {question['options']['D']}

Please provide your answer in this exact format:
Answer: [A/B/C/D]

Then provide your reasoning."""

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ]
    }

    try:
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            json=data,
            timeout=timeout,
        )

        if response.status_code != 200:
            error_msg = f"API Error {response.status_code}: {response.text}"
            return error_msg, "API_ERROR"

        result = response.json()
        # The content field can be null; normalise it so callers always get a string.
        ai_response = result["choices"][0]["message"]["content"] or ""
        return ai_response, extract_answer_from_response(ai_response)

    except Exception as e:
        # Deliberately broad: any transport or decoding failure is reported
        # as a per-question error instead of aborting the exam.
        error_msg = f"Request Error: {str(e)}"
        return error_msg, "REQUEST_ERROR"

# A line of the form "Answer: ..." (optionally wrapped in markdown bold);
# group 1 is everything after the colon.
_ANSWER_LINE_RE = re.compile(r'\s*\**\s*answer\s*\**\s*:(.*)', re.IGNORECASE)
# An option letter at the very start of the answer text, ignoring decoration
# such as "**", "[" or "(". The letter must not be the first letter of a word.
_LEADING_CHOICE_RE = re.compile(r'[\s*\[(]*([A-Da-d])(?!\w)')
# "answer is X" where X is a capital option letter standing on its own.
_ANSWER_IS_RE = re.compile(r'(?i:answer\s+is)\s*:?[\s*\[(]*([A-D])(?!\w)')
# Option markers such as "option B", "choice B", "(B)", "B)" or "B.".
_OPTION_MARKER_RE = re.compile(
    r'(?<!\w)(?:option|choice)\s+([A-D])(?!\w)'
    r'|\(([A-D])\)'
    r'|(?<!\w)([A-D])[).]',
    re.IGNORECASE,
)
# A capital option letter that is not part of a longer word.
_STANDALONE_CHOICE_RE = re.compile(r'(?<!\w)([A-D])(?!\w)')


def extract_answer_from_response(ai_response: str) -> str:
    """Extract the chosen option letter (A-D) from a model reply.

    Strategies are tried from most to least explicit:

    1. a line starting with ``Answer:`` (the format requested in the prompt);
    2. the phrase ``answer is X``;
    3. an option marker (``option X``, ``(X)``, ``X)``, ``X.``) in the first
       five lines;
    4. as a last resort, the first capital A-D standing alone as a token.

    Letters are only accepted when they are not part of a longer word, so
    ordinary text such as "CORRECT" or "abdominal" is never read as an answer.

    Returns:
        ``"A"``-``"D"``, ``"EMPTY_RESPONSE"`` for an empty/None reply, or
        ``"NO_ANSWER_FOUND"`` when no strategy matches.
    """
    if not ai_response:
        return "EMPTY_RESPONSE"

    lines = ai_response.split('\n')

    for line in lines:
        answer_line = _ANSWER_LINE_RE.match(line)
        if not answer_line:
            continue
        answer_text = answer_line.group(1)
        # Prefer a letter right after the colon; otherwise accept a capital
        # letter later in the same line ("Answer: The correct option is B").
        choice = (_LEADING_CHOICE_RE.match(answer_text)
                  or _STANDALONE_CHOICE_RE.search(answer_text))
        if choice:
            return choice.group(1).upper()

    stated = _ANSWER_IS_RE.search(ai_response)
    if stated:
        return stated.group(1)

    for line in lines[:5]:
        marker = _OPTION_MARKER_RE.search(line)
        if marker:
            # Exactly one alternative of the pattern captured the letter.
            return next(group for group in marker.groups() if group).upper()

    standalone = _STANDALONE_CHOICE_RE.search(ai_response)
    if standalone:
        return standalone.group(1)

    return "NO_ANSWER_FOUND"

def save_results_to_dataset(results: List[Dict], hf_token: str = None) -> str:
    """Append exam results to the ``SASLeaderboard/results`` dataset on the Hub.

    Args:
        results: One dict per answered question; the first entry's ``model``
            value is used in the commit message.
        hf_token: HuggingFace token. When empty, the ``HF_TOKEN`` environment
            variable is used instead.

    Returns:
        A human-readable status string. Failures are reported in the returned
        text rather than raised.
    """
    if not results:
        return "No results to save"

    hf_token = hf_token or os.getenv("HF_TOKEN")

    if not hf_token:
        return "❌ HuggingFace token not found. Please provide it in the interface or set HF_TOKEN environment variable"

    try:
        try:
            # `token` replaces the deprecated `use_auth_token` argument and
            # matches the keyword already used for push_to_hub below.
            existing_dataset = load_dataset("SASLeaderboard/results", token=hf_token)
            existing_data = existing_dataset['train'].to_pandas()
        except Exception as load_error:
            # Best effort: a missing dataset means this is the first upload.
            # The reason is printed so an unexpected failure is not silent,
            # because in that case only the new rows are pushed.
            print(f"Could not load existing results ({load_error}); uploading new results only")
            existing_data = None

        new_data = pd.DataFrame(results)

        if existing_data is not None:
            combined_data = pd.concat([existing_data, new_data], ignore_index=True)
        else:
            combined_data = new_data

        new_dataset = Dataset.from_pandas(combined_data)

        new_dataset.push_to_hub(
            "SASLeaderboard/results",
            token=hf_token,
            commit_message=f"Automated exam results for {results[0]['model']} - {len(results)} questions"
        )

        return f"✅ Successfully saved {len(results)} results to SASLeaderboard/results dataset"

    except Exception as e:
        return f"❌ Error saving results: {str(e)}"

def run_automated_exam(api_key: str, model: str, hf_token: str = ""):
    """Run every loaded question against a model and stream progress messages.

    This is a generator: each yielded string is a status update (validation
    error, per-question result, final summary). After all questions have been
    asked, the results are passed to ``save_results_to_dataset``.

    Args:
        api_key: OpenRouter API key; required.
        model: Model identifier; required.
        hf_token: Optional HuggingFace token forwarded to the save step.
    """
    if not api_key:
        yield "❌ Please provide OpenRouter API key"
        return

    if not model:
        yield "❌ Please provide model name"
        return

    yield "🔄 Loading questions from dataset..."

    # Sentinel values that ask_ai_model / extract_answer_from_response return
    # in place of an option letter.
    error_statuses = ("API_ERROR", "REQUEST_ERROR", "EMPTY_RESPONSE", "NO_ANSWER_FOUND")

    try:
        all_questions_by_theme = load_questions_from_dataset()

        all_questions = []
        for theme_questions in all_questions_by_theme.values():
            all_questions.extend(theme_questions)

        total_questions = len(all_questions)

        # Without questions there is nothing to ask or save, and the final
        # accuracy would divide by zero.
        if total_questions == 0:
            yield "❌ No valid questions were loaded from the dataset"
            return

        yield f"✅ Loaded {total_questions} questions from dataset"
        yield f"🚀 Starting automated exam with ALL {total_questions} questions for model: {model}"

        session_id = f"{model}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
        results = []
        correct_count = 0

        for number, question in enumerate(all_questions, start=1):

            ai_response, ai_answer = ask_ai_model(api_key, model, question)

            if ai_answer in error_statuses:
                # The response may be None when the API returned no content;
                # coerce it so the preview cannot abort the whole exam.
                preview = str(ai_response or "")[:100]
                yield f"⚠️ Question {number}: Error getting answer - {ai_answer}. Response: {preview}..."

            is_correct = ai_answer == question['real_answer']
            if is_correct:
                correct_count += 1

            result = {
                "session_id": session_id,
                "model": model,
                "question": question['statement'],
                "theme": question['theme'],
                "correct_answer": question['real_answer'],
                "ai_answer": ai_answer,
                "ai_response": ai_response,
                "is_correct": is_correct,
                "timestamp": datetime.datetime.now().isoformat(),
                "options_a": question['options']['A'],
                "options_b": question['options']['B'],
                "options_c": question['options']['C'],
                "options_d": question['options']['D']
            }

            results.append(result)

            current_accuracy = (correct_count / number) * 100

            status_emoji = "✅" if is_correct else "❌"
            yield f"{status_emoji} Q{number}/{total_questions}: Accuracy: {correct_count}/{number} ({current_accuracy:.1f}%) | AI: {ai_answer} vs Correct: {question['real_answer']} | {question['statement'][:80]}..."

        yield f"💾 Saving results to HuggingFace dataset..."

        save_result = save_results_to_dataset(results, hf_token)

        final_accuracy = (correct_count / len(results)) * 100
        yield f"""
## 🎯 Exam Complete!

**Final Results:**
- Model: {model}
- Total Questions: {len(results)}
- Correct Answers: {correct_count}
- Final Accuracy: {final_accuracy:.1f}%
- Session ID: {session_id}

**Save Status:** {save_result}

The automated exam has been completed successfully!
"""

    except Exception as e:
        # Top-level boundary of the UI callback: report the failure to the
        # user instead of letting the generator raise.
        yield f"❌ Error during automated exam: {str(e)}"

# Gradio UI: credentials and model name in, streamed exam progress out.
with gr.Blocks(title="Automated Urology Exam System") as demo:
    gr.Markdown("# Automated Urology Exam System")
    gr.Markdown("This system automatically runs a complete urology exam for AI models using ALL available questions (~150) and saves results to the dataset.")
    
    with gr.Row():
        with gr.Column():
            gr.Markdown("**Get your API key:** [OpenRouter Keys](https://openrouter.ai/settings/keys)")
            api_key_input = gr.Textbox(
                label="OpenRouter API Key", 
                type="password",
                placeholder="Enter your OpenRouter API key"
            )
        with gr.Column():
            gr.Markdown("**Find models:** [OpenRouter Models](https://openrouter.ai/models)")
            model_input = gr.Textbox(
                label="Model Name",
                placeholder="e.g., anthropic/claude-3-sonnet",
                value="anthropic/claude-3-sonnet"
            )
    
    # run_automated_exam accepts an hf_token and the save step tells the user
    # to provide the token "in the interface", so expose a field for it.
    # Leaving it empty falls back to the HF_TOKEN environment variable.
    with gr.Row():
        hf_token_input = gr.Textbox(
            label="HuggingFace Token (optional)",
            type="password",
            placeholder="Leave empty to use the HF_TOKEN environment variable"
        )
    
    with gr.Row():
        start_exam_btn = gr.Button("Start Automated Exam", variant="primary", size="lg")
    
    with gr.Row():
        progress_output = gr.Textbox(
            label="Exam Progress - Dont close this window",
            placeholder="Exam progress will be displayed here...",
            lines=15,
            max_lines=20,
            interactive=False
        )
    
    # run_automated_exam is a generator; each yielded message is sent to the
    # progress textbox.
    start_exam_btn.click(
        run_automated_exam,
        inputs=[api_key_input, model_input, hf_token_input],
        outputs=[progress_output]
    )

if __name__ == "__main__":
    demo.launch()