Spaces:

SASLeaderboard
/

leaderboard

Running

leaderboard/src/evaluation/questions.py

Rafael Jaime Moreno

working on everything

7d9d48e 27 days ago

3.22 kB

	from datasets import load_dataset
	from typing import Dict, List
	import re

	def sanitize_theme_name(theme: str) -> str:
	    """Convert a theme name into a valid Python identifier.

	    Every character outside ``[a-zA-Z0-9_]`` is replaced with an underscore.
	    If the result is empty or does not begin with a letter, it is prefixed
	    with ``theme_`` so the returned name always starts with a letter.

	    Args:
	        theme: Original theme name; may be empty.

	    Returns:
	        The sanitized name (``"theme_"`` for an empty input).
	    """
	    # Replace non-alphanumeric chars with underscore
	    sanitized = re.sub(r'[^a-zA-Z0-9_]', '_', theme)
	    # Check for emptiness first: indexing [0] on "" would raise IndexError.
	    if not sanitized or not sanitized[0].isalpha():
	        sanitized = 'theme_' + sanitized
	    return sanitized

	def load_questions_from_dataset() -> Dict[str, List[Dict]]:
	    """Load and format questions from the HuggingFace dataset, grouped by theme.

	    Iterates the ``train`` split of ``RafaelJaime/sas_opposition_exam_data``
	    and converts each usable row into a question dict with the keys
	    ``statement``, ``options`` (``A``-``D``), ``real_answer``, ``theme``,
	    ``sanitized_theme`` and ``version`` (``'Default'`` when the row has none).

	    A row is skipped (and counted as skipped) when its ``answers`` value is
	    not a list/tuple with at least 3 entries, when ``correct_answer`` is
	    missing or empty, or when ``theme``/``statement`` is missing or the theme
	    cannot be sanitized. Rows with exactly 3 answers are padded to 4 options
	    by repeating the last answer; only the first 4 answers are used.

	    Returns:
	        Mapping from sanitized theme name to the list of question dicts for
	        that theme. Only themes with at least one loaded question appear.

	    Raises:
	        Whatever ``load_dataset`` raises; that call is not guarded here.
	    """
	    dataset = load_dataset("RafaelJaime/sas_opposition_exam_data")

	    # Group questions by theme
	    questions_by_theme: Dict[str, List[Dict]] = {}
	    skipped = 0
	    loaded = 0

	    for item in dataset['train']:
	        raw_answers = item.get('answers')
	        correct_answer = item.get('correct_answer', '')

	        # Skip invalid questions
	        if (
	            not isinstance(raw_answers, (list, tuple))
	            or len(raw_answers) < 3
	            or not correct_answer
	        ):
	            skipped += 1
	            continue

	        # Work on a copy so the padding below never mutates the dataset row.
	        answers = list(raw_answers)

	        # Ensure we have at least 4 options (duplicates the last answer)
	        while len(answers) < 4:
	            answers.append(answers[-1])

	        # Format question in our standard format. Theme lookup and
	        # sanitizing live inside the try so one malformed row is skipped
	        # instead of aborting the whole load.
	        try:
	            theme = item['theme']
	            sanitized_theme = sanitize_theme_name(theme)
	            question = {
	                "statement": item['statement'],
	                "options": {
	                    "A": answers[0],
	                    "B": answers[1],
	                    "C": answers[2],
	                    "D": answers[3]
	                },
	                "real_answer": correct_answer,
	                "theme": theme,  # Keep original theme name for display
	                "sanitized_theme": sanitized_theme,  # Sanitized name for internal use
	                "version": item.get('version', 'Default')
	            }
	        except (KeyError, IndexError, TypeError):
	            skipped += 1
	            continue

	        # Register the theme only once a question exists for it, so a failed
	        # row cannot leave behind an empty theme entry.
	        questions_by_theme.setdefault(sanitized_theme, []).append(question)
	        loaded += 1

	    print(f"Loaded {loaded} questions, skipped {skipped} invalid questions")
	    return questions_by_theme

	# Populate EXAM_QUESTIONS when the module is imported, preferring the dataset.
	try:
	    EXAM_QUESTIONS = load_questions_from_dataset()
	    print(f"Successfully loaded questions for {len(EXAM_QUESTIONS)} themes")
	except Exception as load_error:
	    print(f"Error loading questions from dataset: {str(load_error)}")
	    # The dataset could not be loaded: fall back to one built-in question so
	    # EXAM_QUESTIONS is always defined.
	    EXAM_QUESTIONS = dict(
	        Administrativo_a=[
	            dict(
	                statement="El derecho a la protección de la salud se reconoce por la Constitución Española en el:",
	                options=dict(
	                    zip(
	                        "ABCD",
	                        (
	                            "Artículo 44.",
	                            "Artículo 46.",
	                            "Artículo 43.",
	                            "Artículo 41.",
	                        ),
	                    )
	                ),
	                real_answer="C",
	                theme="Administrativo-a",
	                sanitized_theme="Administrativo_a",
	                version="Default",
	            ),
	        ],
	    )