Spaces:
Running
Running
from datasets import load_dataset | |
from typing import Dict, List | |
import re | |
def sanitize_theme_name(theme: str) -> str:
    """Convert a theme name into a string usable as a Python identifier.

    Every character that is not an ASCII letter, digit or underscore is
    replaced by an underscore. If the result does not start with a letter
    (it starts with a digit or underscore, or it is empty), the prefix
    ``theme_`` is prepended.

    Args:
        theme: Original, human-readable theme name.

    Returns:
        The sanitized name, e.g. ``"Administrativo-a"`` becomes
        ``"Administrativo_a"`` and ``""`` becomes ``"theme_"``.
    """
    sanitized = re.sub(r'[^a-zA-Z0-9_]', '_', theme)
    # Slice instead of indexing so an empty theme gets the prefix rather
    # than raising IndexError.
    if not sanitized[:1].isalpha():
        sanitized = 'theme_' + sanitized
    return sanitized
def load_questions_from_dataset() -> Dict[str, List[Dict]]:
    """Load the exam questions from the HuggingFace dataset, grouped by theme.

    Iterates over the ``train`` split of
    ``RafaelJaime/sas_opposition_exam_data``. A row is skipped when it has no
    ``correct_answer``, fewer than three ``answers``, or no ``statement``
    field. Rows with exactly three answers are padded to four options by
    repeating the last answer; when a row has more than four answers only the
    first four are used.

    Returns:
        A mapping from sanitized theme name to the list of question dicts for
        that theme. Each question has the keys ``statement``, ``options``
        (``A``-``D``), ``real_answer``, ``theme`` (original name),
        ``sanitized_theme`` and ``version``. Only themes with at least one
        loaded question appear in the mapping.
    """
    dataset = load_dataset("RafaelJaime/sas_opposition_exam_data")

    # Group questions by theme
    questions_by_theme: Dict[str, List[Dict]] = {}
    skipped = 0
    loaded = 0

    for item in dataset['train']:
        theme = item['theme']
        answers = item.get('answers', [])
        correct_answer = item.get('correct_answer', '')

        # Skip invalid questions (`not answers` also covers a None value)
        if not answers or not correct_answer or len(answers) < 3:
            skipped += 1
            continue

        # Pad to four options on a copy so the row's own list is not mutated
        options = list(answers)
        options.extend([options[-1]] * (4 - len(options)))

        try:
            statement = item['statement']
        except KeyError:
            # A row without a statement cannot be shown as a question
            skipped += 1
            continue

        # Sanitized theme name is used as the grouping key / identifier
        sanitized_theme = sanitize_theme_name(theme)

        # Format question in our standard format
        question = {
            "statement": statement,
            "options": {
                "A": options[0],
                "B": options[1],
                "C": options[2],
                "D": options[3],
            },
            "real_answer": correct_answer,
            "theme": theme,  # Keep original theme name for display
            "sanitized_theme": sanitized_theme,  # Sanitized name for internal use
            "version": item.get('version', 'Default'),
        }

        # Create the theme bucket only once a question exists for it, so the
        # result never contains empty themes.
        questions_by_theme.setdefault(sanitized_theme, []).append(question)
        loaded += 1

    print(f"Loaded {loaded} questions, skipped {skipped} invalid questions")
    return questions_by_theme
# Build the module-level question bank: try the remote dataset first and fall
# back to a single built-in question if loading fails for any reason.
try:
    EXAM_QUESTIONS = load_questions_from_dataset()
    print(f"Successfully loaded questions for {len(EXAM_QUESTIONS)} themes")
except Exception as e:
    print(f"Error loading questions from dataset: {e!s}")
    # Minimal built-in question bank used when the dataset is not accessible
    EXAM_QUESTIONS = dict(
        Administrativo_a=[
            dict(
                statement="El derecho a la protección de la salud se reconoce por la Constitución Española en el:",
                options=dict(
                    A="Artículo 44.",
                    B="Artículo 46.",
                    C="Artículo 43.",
                    D="Artículo 41.",
                ),
                real_answer="C",
                theme="Administrativo-a",
                sanitized_theme="Administrativo_a",
                version="Default",
            ),
        ],
    )