leaderboard / src /evaluation /questions.py
Rafael Jaime Moreno
working on everything
7d9d48e
from datasets import load_dataset
from typing import Dict, List
import re
def sanitize_theme_name(theme: str) -> str:
    """Convert a theme name into a string usable as an identifier.

    Every character other than an ASCII letter, digit or underscore is
    replaced with an underscore. If the result is empty or does not start
    with a letter, it is prefixed with ``theme_``.

    Args:
        theme: Theme name to sanitize.

    Returns:
        The sanitized theme name.
    """
    # Replace non-alphanumeric chars with underscore
    sanitized = re.sub(r'[^a-zA-Z0-9_]', '_', theme)
    # Ensure it starts with a letter. An empty name has no first character
    # to inspect, so test for emptiness before indexing.
    if not sanitized or not sanitized[0].isalpha():
        sanitized = 'theme_' + sanitized
    return sanitized
def load_questions_from_dataset() -> Dict[str, List[Dict]]:
    """Load and format questions from the HuggingFace dataset.

    Reads the ``train`` split of ``RafaelJaime/sas_opposition_exam_data``
    and groups the questions by sanitized theme name. A row is counted as
    skipped when it has no correct answer, fewer than three answers, or no
    ``statement`` field. A row with exactly three answers gets its last
    answer repeated as option D; answers beyond the fourth are not used.
    A summary line with the loaded/skipped counts is printed.

    Returns:
        A dict mapping each sanitized theme name to a list of question
        dicts with the keys ``statement``, ``options`` (``A``-``D``),
        ``real_answer``, ``theme``, ``sanitized_theme`` and ``version``.
    """
    dataset = load_dataset("RafaelJaime/sas_opposition_exam_data")

    # Group questions by theme
    questions_by_theme: Dict[str, List[Dict]] = {}
    skipped = 0
    loaded = 0

    for item in dataset['train']:
        theme = item['theme']
        answers = item.get('answers', [])
        correct_answer = item.get('correct_answer', '')

        # Skip invalid questions
        if not answers or not correct_answer or len(answers) < 3:
            skipped += 1
            continue

        # Get sanitized theme name for use as identifier
        sanitized_theme = sanitize_theme_name(theme)

        # The statement lookup is the only step that can fail for a row that
        # passed the checks above, so handle just that error.
        try:
            statement = item['statement']
        except KeyError:
            skipped += 1
            continue

        # Ensure we have exactly 4 options. Work on a copy so the dataset
        # row's own list is never modified; pad by repeating the last answer.
        options = list(answers[:4])
        options += [options[-1]] * (4 - len(options))

        # Format question in our standard format
        question = {
            "statement": statement,
            "options": dict(zip("ABCD", options)),
            "real_answer": correct_answer,
            "theme": theme,  # Keep original theme name for display
            "sanitized_theme": sanitized_theme,  # Sanitized name for internal use
            "version": item.get('version', 'Default'),
        }

        # Register the theme only once a question exists for it, so no theme
        # ends up mapped to an empty list.
        questions_by_theme.setdefault(sanitized_theme, []).append(question)
        loaded += 1

    print(f"Loaded {loaded} questions, skipped {skipped} invalid questions")
    return questions_by_theme
# Populate EXAM_QUESTIONS at import time. Any failure while loading (or while
# reporting the result) is reported on stdout and replaced by a built-in
# single-question fallback so the module still exposes a usable mapping.
try:
    EXAM_QUESTIONS = load_questions_from_dataset()
    print(f"Successfully loaded questions for {len(EXAM_QUESTIONS)} themes")
except Exception as load_error:
    print(f"Error loading questions from dataset: {load_error!s}")
    # Minimal question set used when the dataset cannot be reached.
    EXAM_QUESTIONS = {
        "Administrativo_a": [
            {
                "statement": "El derecho a la protección de la salud se reconoce por la Constitución Española en el:",
                "options": {
                    "A": "Artículo 44.",
                    "B": "Artículo 46.",
                    "C": "Artículo 43.",
                    "D": "Artículo 41.",
                },
                "real_answer": "C",
                "theme": "Administrativo-a",
                "sanitized_theme": "Administrativo_a",
                "version": "Default",
            },
        ],
    }