# File size: 3,224 Bytes
# Revision: 7d9d48e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from datasets import load_dataset
from typing import Dict, List
import re

def sanitize_theme_name(theme: str) -> str:
    """Convert a theme name into an identifier-safe string.

    Every character outside ``[a-zA-Z0-9_]`` is replaced with an underscore.
    If the result does not begin with an ASCII letter (including when it is
    empty), it is prefixed with ``theme_``.

    Args:
        theme: Original theme name, possibly containing spaces, hyphens or
            accented characters.

    Returns:
        The sanitized name; ``"theme_"`` for an empty input.
    """
    # Replace non-alphanumeric chars with underscore
    sanitized = re.sub(r'[^a-zA-Z0-9_]', '_', theme)
    # Slice instead of indexing: ''.isalpha() is False, so an empty name
    # gets the prefix rather than raising IndexError.
    if not sanitized[:1].isalpha():
        sanitized = 'theme_' + sanitized
    return sanitized

def load_questions_from_dataset() -> Dict[str, List[Dict]]:
    """Load the exam questions from the HuggingFace dataset, grouped by theme.

    Reads the ``train`` split of ``RafaelJaime/sas_opposition_exam_data`` and
    converts every usable row into a question dict with the keys
    ``statement``, ``options`` (``A``-``D``), ``real_answer``, ``theme``,
    ``sanitized_theme`` and ``version``.

    A row is skipped (and counted as invalid) when it has no theme, no
    ``statement`` key, no correct answer, or fewer than three answers. Rows
    with exactly three answers are padded to four by repeating the last one;
    answers beyond the fourth are ignored.

    Returns:
        Mapping from sanitized theme name to the list of questions for that
        theme. Only themes with at least one loaded question appear.

    Raises:
        Exception: whatever ``load_dataset`` raises when the dataset cannot
            be fetched or read; callers are expected to handle it.
    """
    dataset = load_dataset("RafaelJaime/sas_opposition_exam_data")

    # Group questions by theme
    questions_by_theme: Dict[str, List[Dict]] = {}
    skipped = 0
    loaded = 0

    for item in dataset['train']:
        theme = item.get('theme')
        answers = item.get('answers', [])
        correct_answer = item.get('correct_answer', '')

        # Skip invalid questions. Validating everything up front means a
        # single malformed row can no longer abort the whole load.
        if (
            not theme
            or 'statement' not in item
            or not answers
            or not correct_answer
            or len(answers) < 3
        ):
            skipped += 1
            continue

        # Ensure we have at least 4 options. Work on a copy so the padding
        # never mutates the list owned by the dataset row.
        options = list(answers)
        while len(options) < 4:
            options.append(options[-1])

        # Get sanitized theme name for use as identifier
        sanitized_theme = sanitize_theme_name(theme)

        # Format question in our standard format
        question = {
            "statement": item['statement'],
            "options": {
                "A": options[0],
                "B": options[1],
                "C": options[2],
                "D": options[3]
            },
            "real_answer": correct_answer,
            "theme": theme,  # Keep original theme name for display
            "sanitized_theme": sanitized_theme,  # Add sanitized name for internal use
            "version": item.get('version', 'Default')
        }

        # Create the theme bucket only once there is a question to store,
        # so skipped rows never leave an empty theme behind.
        questions_by_theme.setdefault(sanitized_theme, []).append(question)
        loaded += 1

    print(f"Loaded {loaded} questions, skipped {skipped} invalid questions")
    return questions_by_theme

# Build the module-level question bank, preferring the remote dataset
try:
    EXAM_QUESTIONS = load_questions_from_dataset()
    print(f"Successfully loaded questions for {len(EXAM_QUESTIONS)} themes")
except Exception as load_error:
    print(f"Error loading questions from dataset: {str(load_error)}")
    # The load failed: fall back to a single built-in question that uses
    # the same structure as the loaded ones, so EXAM_QUESTIONS is defined
    # on both paths.
    EXAM_QUESTIONS = {
        "Administrativo_a": [
            dict(
                statement="El derecho a la protección de la salud se reconoce por la Constitución Española en el:",
                options=dict(
                    A="Artículo 44.",
                    B="Artículo 46.",
                    C="Artículo 43.",
                    D="Artículo 41.",
                ),
                real_answer="C",
                theme="Administrativo-a",
                sanitized_theme="Administrativo_a",
                version="Default",
            )
        ]
    }