Update main.py

main.py (CHANGED)
Removed lines, by hunk (unchanged context omitted; several deletions were cut off in the original capture and are left as-is):

@@ -3,39 +3,141 @@ import json
- from google.genai import types
- # Directory where your PDFs live (e.g., ./syllabi/A/Physics.pdf)
- # --- GLOBAL STATE (IN-MEMORY) ---
- # Structure: { "A_9706": { "
- SYLLABUS_MAP = {}
- # Structure: [ { "
- VECTOR_MATRIX = None
- #
@@ -43,15 +145,12 @@
- # Expected: syllabi/A/Accounting_9706.pdf
- parts = filepath.split(os.sep)
- # Extract code if present (e.g., 9618)
- self.subject_name = self.filename.
@@ -64,57 +163,86 @@ class PDFParser:
- if not font_sizes: return 10.0
- def
- - Bold text slightly larger than body = Subtopic
- - Body text = Content/Objectives
- # Regex to detect "Topic 1" or "1.1" or "Key Question"
- for page in self.doc:
- # Reconstruct line text and finding max font style
- if not text:
- if s["size"] > max_size:
- if len(block_text) < 3:
- # HEURISTIC 1: TOPIC (Large Header)
- # Usually 2pt+ larger than body
- # Save previous
@@ -123,15 +251,19 @@ class PDFParser:
- # HEURISTIC 2: SUBTOPIC (Bold,
- # If no topic exists yet, create a dummy one
- current_topic = {
@@ -145,11 +277,9 @@ class PDFParser:
- # Sometimes text appears directly under a topic
- # Create implicit subtopic
- "title": "
@@ -165,82 +295,308 @@ class PDFParser:
- "level": self.level
- #
- """Generates embeddings using Gemini API
- logger.warning("No Gemini API Key
- return [np.zeros(768) for _ in texts]
- batch_size = 10
- batch = texts[i:i+batch_size]
- resp =
- # Handle list of embeddings
- results.append(
- logger.error(f"Embedding failed: {e}")
- """
- # 1. Walk Directory
- if not os.path.exists(SYLLABI_DIR):
- logger.error(f"Directory {SYLLABI_DIR} not found.")
- return
- for
- for sub in topic
@@ -250,33 +606,149 @@ def build_index():
- # 3. Generate Embeddings
- # 4. Populate Global DB
- "vector":
- valid_vectors.append(
- #
- return jsonify({
@@ -286,82 +758,183 @@ def get_structure(subject_id):
- if VECTOR_MATRIX is None:
- data = request.json
- resp =
- # 2. Vector Search (Cosine Similarity)
- # scores shape: (1, N_chunks)
- # 3. Filter and Sort
- results = []
- # Get top 10 indices
- if scores[idx] < 0.3:
- # Apply Filter
- "content": meta["content"],
- "node_id": meta["subtopic_id"]
- if count >= 5:
- #
- # Use 7860 for HF Spaces

The resulting file (additions and surviving context, starting at line 3):
import logging
import re
import time
import threading
import numpy as np
import fitz  # PyMuPDF
from flask import Flask, request, jsonify
from flask_cors import CORS
from google import genai
from sklearn.metrics.pairwise import cosine_similarity

import firebase_admin
from firebase_admin import credentials, db as firebase_db

# --- CONFIGURATION ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

SYLLABI_DIR = "syllabi"
PAST_EXAMS_DIR = "past_exams"

# Google GenAI Config
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
EMBEDDING_MODEL = "models/text-embedding-004"

# --- GLOBAL STATE (IN-MEMORY CACHE) ---
# Structure: { "A_9706": { "meta": {...}, "tree": [...] }, ... }
SYLLABUS_MAP = {}

# Structure: [ { "vector": [...], "meta": {...} } ]
VECTOR_DB = []
VECTOR_MATRIX = None  # Numpy array for fast math

# Past exam index: { "A_9706": { safe_paper_id: { "meta": {...}, "pages": [...], "questions": [...] } } }
EXAM_MAP = {}

app = Flask(__name__)
CORS(app)
# -----------------------------------------------------------------------------
# 0. FIREBASE INITIALIZATION
# -----------------------------------------------------------------------------

firebase_db_ref = None

def init_firebase():
    global firebase_db_ref
    try:
        credentials_json_string = os.environ.get("FIREBASE")
        if not credentials_json_string:
            logger.warning("FIREBASE env var not set. Firebase caching disabled.")
            return False

        credentials_json = json.loads(credentials_json_string)
        firebase_db_url = os.environ.get("Firebase_DB")

        if not firebase_db_url:
            logger.warning("Firebase_DB env var not set. Firebase caching disabled.")
            return False

        if not firebase_admin._apps:
            cred = credentials.Certificate(credentials_json)
            firebase_admin.initialize_app(cred, {"databaseURL": firebase_db_url})

        firebase_db_ref = firebase_db.reference()
        logger.info("Firebase initialized successfully in Data API.")
        return True
    except Exception as e:
        logger.error(f"Firebase init failed: {e}")
        return False

FIREBASE_AVAILABLE = init_firebase()

def fb_set(path: str, data):
    """Write to Firebase, silently fail if unavailable."""
    if not FIREBASE_AVAILABLE or firebase_db_ref is None:
        return
    try:
        firebase_db_ref.child(path).set(data)
    except Exception as e:
        logger.error(f"Firebase write failed [{path}]: {e}")

def fb_get(path: str):
    """Read from Firebase, return None if unavailable."""
    if not FIREBASE_AVAILABLE or firebase_db_ref is None:
        return None
    try:
        return firebase_db_ref.child(path).get()
    except Exception as e:
        logger.error(f"Firebase read failed [{path}]: {e}")
        return None
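To show how the two helpers are meant to be called, a minimal sketch; the path mirrors the data_api/syllabi layout used by the save functions later in the file, and the payload is illustrative:

# Illustrative use of fb_set/fb_get; both are safe no-ops when Firebase is off.
fb_set("data_api/syllabi/A_9706", {"meta": {"id": "A_9706"}, "tree": []})
cached = fb_get("data_api/syllabi/A_9706")   # dict if cached, else None
if cached is None:
    logger.info("Cache miss; a fresh build would be needed.")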
# -----------------------------------------------------------------------------
# 1. BOILERPLATE PAGE DETECTION
# -----------------------------------------------------------------------------

# Keywords that identify non-content pages to skip
BOILERPLATE_TITLE_PATTERNS = re.compile(
    r'^\s*(about\s+(this\s+)?syllabus|foreword|acknowledgements?|introduction\s+to\s+(cambridge|zimsec)|'
    r'how\s+to\s+use\s+this\s+syllabus|why\s+choose\s+cambridge|support\s+for\s+teachers|'
    r'teacher\s+support|resource\s+list|list\s+of\s+resources|further\s+information|'
    r'copyright|legal\s+notice|syllabus\s+overview\s+at\s+a\s+glance|'
    r'assessment\s+at\s+a\s+glance|grade\s+descriptions|mathematical\s+notation|'
    r'command\s+words|glossary\s+of\s+command|changes\s+to\s+this\s+syllabus|'
    r'other\s+cambridge|university\s+of\s+cambridge|cambridge\s+assessment|'
    r'published\s+by|contents\s*$|table\s+of\s+contents)\s*$',
    re.IGNORECASE
)

# Keywords that signal content has actually started
CONTENT_START_PATTERNS = re.compile(
    r'^\s*((syllabus\s+)?content|subject\s+content|unit\s+\d|topic\s+\d|'
    r'section\s+\d|module\s+\d|\d+\s+[A-Z]|component\s+\d|paper\s+\d|'
    r'scheme\s+of\s+work|learning\s+objectives|knowledge.*understanding)',
    re.IGNORECASE
)

def is_boilerplate_block(text: str) -> bool:
    """Returns True if this block is boilerplate/admin content to skip."""
    return bool(BOILERPLATE_TITLE_PATTERNS.match(text.strip()))

def page_is_boilerplate(page_text: str) -> bool:
    """Returns True if the entire page appears to be admin/front-matter."""
    lines = [l.strip() for l in page_text.splitlines() if l.strip()]
    if not lines:
        return True
    # Check first substantive line
    first = lines[0]
    if BOILERPLATE_TITLE_PATTERNS.match(first):
        return True
    # Check if page is very short (< 5 lines) with no numbered items (likely a divider)
    if len(lines) < 5 and not re.search(r'\d+\.\d+|\d+\s+[A-Z]', page_text):
        # Could be a section divider page: not boilerplate, but also empty
        pass
    return False
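A quick sanity check of the two patterns on illustrative strings (titles taken from the keyword lists above):

# Sample inputs; all three assertions hold for the patterns above.
assert is_boilerplate_block("How to use this syllabus")
assert not is_boilerplate_block("1.2 Sources of finance")
assert CONTENT_START_PATTERNS.search("Subject content") is not None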
# -----------------------------------------------------------------------------
# 2. THE PARSER ENGINE (Extracts Structure from PDF)
# -----------------------------------------------------------------------------

class PDFParser:
    def __init__(self, filepath):
        self.filepath = filepath
        self.filename = os.path.basename(filepath)
        self.doc = fitz.open(filepath)

        parts = filepath.replace("\\", "/").split("/")
        self.level = parts[-2] if len(parts) > 1 else "General"
        self.subject_code = re.search(r'\d{4}', self.filename)
        self.subject_code = self.subject_code.group(0) if self.subject_code else "0000"
        # Strip the code suffix first, then swap underscores for spaces
        # (substituting after the underscore replacement would never match "_9706").
        self.subject_name = re.sub(r'[_\-]\d{4}.*', '', self.filename).replace('_', ' ').strip()
        self.unique_id = f"{self.level}_{self.subject_code}"

    def get_font_characteristics(self):
        # ... (opening lines unchanged and not shown in the diff)
                for s in l.get("spans", []):
                    size = round(s["size"], 1)
                    font_sizes[size] = font_sizes.get(size, 0) + len(s["text"])
        if not font_sizes:
            return 10.0
        return max(font_sizes, key=font_sizes.get)

    def _find_content_start_page(self) -> int:
        """
        Scans pages to find where actual syllabus content begins.
        Returns the 0-based page index.
        """
        for page_num, page in enumerate(self.doc):
            text = page.get_text("text")
            # Skip empty pages
            if len(text.strip()) < 30:
                continue
            # Skip boilerplate pages
            if page_is_boilerplate(text):
                continue
            # Look for numbered content sections
            if CONTENT_START_PATTERNS.search(text):
                logger.info(f"Content starts at page {page_num + 1} for {self.filename}")
                return page_num
            # Also check for numbered topic headers (e.g. "1 Number" or "1.1 ...")
            if re.search(r'\n\s*\d+\.?\d*\s+[A-Z][a-z]', text):
                logger.info(f"Content (numbered) starts at page {page_num + 1} for {self.filename}")
                return page_num

        # Fallback: skip the first 10% of pages (usually all front-matter)
        fallback = max(1, len(self.doc) // 10)
        logger.info(f"Using fallback content start page {fallback + 1} for {self.filename}")
        return fallback

    def parse(self):
        body_size = self.get_font_characteristics()
        content_start = self._find_content_start_page()
        logger.info(f"Parsing {self.filename} (Body size ~{body_size}pt, content from page {content_start + 1})")

        syllabus_tree = []
        current_topic = None
        current_subtopic = None

        topic_pattern = re.compile(r'^(\d+\.?\s|Key Question\s)', re.IGNORECASE)

        for page_num, page in enumerate(self.doc):
            # Skip pre-content pages entirely
            if page_num < content_start:
                continue

            blocks = page.get_text("dict")["blocks"]
            for b in blocks:
                block_text = ""
                max_size = 0
                is_bold = False

                for l in b.get("lines", []):
                    for s in l.get("spans", []):
                        text = s["text"].strip()
                        if not text:
                            continue
                        block_text += text + " "
                        if s["size"] > max_size:
                            max_size = s["size"]
                        if "bold" in s["font"].lower():
                            is_bold = True

                block_text = block_text.strip()
                if len(block_text) < 3:
                    continue

                # Skip boilerplate blocks even within content pages
                if is_boilerplate_block(block_text):
                    continue

                # HEURISTIC 1: TOPIC (large header, 2pt+ above body size)
                if max_size > body_size + 2:
                    if current_subtopic and current_topic:
                        current_topic["children"].append(current_subtopic)
                        current_subtopic = None
                    if current_topic:
                        syllabus_tree.append(current_topic)

                    current_topic = {
                        "id": f"{self.unique_id}_{len(syllabus_tree)}",
                        "title": block_text,
                        "type": "topic",
                        "children": []
                    }
                    current_subtopic = None

                # HEURISTIC 2: SUBTOPIC (bold, numbered, or keyword-led)
                elif (is_bold and max_size >= body_size) or \
                     (topic_pattern.match(block_text) and max_size >= body_size):
                    if current_subtopic and current_topic:
                        current_topic["children"].append(current_subtopic)

                    if not current_topic:
                        current_topic = {
                            "id": f"{self.unique_id}_root",
                            "title": "Syllabus Content",
                            "type": "topic",
                            "children": []
                        }

                    current_subtopic = {
                        "id": f"{current_topic['id']}_{len(current_topic['children'])}",
                        "title": block_text,
                        "type": "subtopic",
                        "content": []
                    }

                # HEURISTIC 3: body text under the current heading
                else:
                    if current_subtopic:
                        current_subtopic["content"].append(block_text)
                    elif current_topic:
                        # Text directly under a topic: create an implicit subtopic
                        current_subtopic = {
                            "id": f"{current_topic['id']}_intro",
                            "title": "Overview",
                            "type": "subtopic",
                            "content": [block_text]
                        }

        # ... (flush of any trailing topic/subtopic: unchanged lines not shown in the diff)
        return {
            "meta": {
                "id": self.unique_id,
                "subject": self.subject_name,
                "code": self.subject_code,
                "level": self.level,
                "filename": self.filename,
                "indexed_at": int(time.time())
            },
            "tree": syllabus_tree
        }
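For orientation, the structure parse() returns looks like this; the values are invented for illustration, but the keys and nesting match the code above:

# Illustrative output of PDFParser.parse() (values are made up):
# {
#   "meta": {"id": "A_9706", "subject": "Accounting", "code": "9706",
#            "level": "A", "filename": "Accounting_9706.pdf", "indexed_at": 1700000000},
#   "tree": [
#     {"id": "A_9706_0", "title": "1 The accounting system", "type": "topic",
#      "children": [
#        {"id": "A_9706_0_0", "title": "1.1 Double entry", "type": "subtopic",
#         "content": ["Record transactions using double entry...", "..."]}
#      ]}
#   ]
# }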
# -----------------------------------------------------------------------------
# 3. PAST EXAM PAPER PARSER
# -----------------------------------------------------------------------------

class ExamPaperParser:
    """
    Extracts metadata and full text from past exam PDFs.
    Expected naming: <syllabus_code>_<year>_<session>_<paper>.pdf
    E.g.: 9702_2023_May_Paper1.pdf or 9702_2023_s1.pdf
    Fields that cannot be parsed from the filename fall back to defaults.
    """

    def __init__(self, filepath):
        self.filepath = filepath
        self.filename = os.path.basename(filepath)
        self.doc = fitz.open(filepath)

        parts = filepath.replace("\\", "/").split("/")
        self.level = parts[-2] if len(parts) > 1 else "General"

        # Parse subject code from filename. Lookarounds instead of \b:
        # "_" is a word character, so \b never fires inside "9702_2023".
        code_match = re.search(r'(?<!\d)(\d{4})(?!\d)', self.filename)
        self.subject_code = code_match.group(1) if code_match else "0000"
        self.unique_id = f"{self.level}_{self.subject_code}"

        # Parse year
        year_match = re.search(r'(?<!\d)(20\d{2}|19\d{2})(?!\d)', self.filename)
        self.year = year_match.group(1) if year_match else "Unknown"

        # Parse session (May/June, Oct/Nov, etc.)
        session_match = re.search(
            r'(may[_\-]?june|oct[_\-]?nov|feb[_\-]?mar|summer|winter|s\d|w\d|m\d)',
            self.filename, re.IGNORECASE
        )
        self.session = session_match.group(1).upper() if session_match else "Unknown"

        # Parse paper number
        paper_match = re.search(r'[_\-]p(\d)|paper[\s_\-]?(\d)', self.filename, re.IGNORECASE)
        if paper_match:
            self.paper_num = paper_match.group(1) or paper_match.group(2)
        else:
            self.paper_num = "1"

        self.paper_id = f"{self.unique_id}_{self.year}_{self.session}_P{self.paper_num}"

    def extract_pages(self):
        """Extract text per page."""
        pages = []
        for i, page in enumerate(self.doc):
            text = page.get_text("text").strip()
            if text:
                pages.append({
                    "page": i + 1,
                    "text": text[:3000]  # cap per page to avoid huge payloads
                })
        return pages

    def extract_questions(self):
        """
        Heuristic: questions usually start with a number followed by a
        period or bracket, e.g. "1." or "1)" at the start of a paragraph.
        Returns a list of { number, text }.
        """
        questions = []
        full_text = "\n".join(p["text"] for p in self.extract_pages())

        # Split by question numbers
        q_pattern = re.compile(
            r'(?:^|\n)\s*(\d{1,2})\s*[\.\)]\s+(.+?)(?=\n\s*\d{1,2}\s*[\.\)]|\Z)',
            re.DOTALL | re.MULTILINE
        )
        for m in q_pattern.finditer(full_text):
            q_num = int(m.group(1))
            q_text = m.group(2).strip()
            if len(q_text) > 20:  # filter noise
                questions.append({"number": q_num, "text": q_text[:2000]})

        return questions

    def parse(self):
        pages = self.extract_pages()
        questions = self.extract_questions()

        return {
            "meta": {
                "paperId": self.paper_id,
                "subjectId": self.unique_id,
                "subjectCode": self.subject_code,
                "level": self.level,
                "year": self.year,
                "session": self.session,
                "paperNumber": self.paper_num,
                "filename": self.filename,
                "totalPages": len(self.doc),
                "indexed_at": int(time.time())
            },
            "pages": pages,
            "questions": questions
        }
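Worked against the regexes above, a hypothetical upload past_exams/A/9702_2023_May_June_P2.pdf parses as follows:

# What ExamPaperParser would extract from "past_exams/A/9702_2023_May_June_P2.pdf":
fname = "9702_2023_May_June_P2.pdf"
assert re.search(r'(?<!\d)(\d{4})(?!\d)', fname).group(1) == "9702"            # subject code
assert re.search(r'(?<!\d)(20\d{2}|19\d{2})(?!\d)', fname).group(1) == "2023"  # year
m = re.search(r'(may[_\-]?june|oct[_\-]?nov|feb[_\-]?mar|summer|winter|s\d|w\d|m\d)',
              fname, re.IGNORECASE)
assert m.group(1).upper() == "MAY_JUNE"                                        # session
# level "A" comes from the parent directory, so paper_id == "A_9702_2023_MAY_JUNE_P2"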
# -----------------------------------------------------------------------------
# 4. THE VECTOR ENGINE (Embeddings & Search)
# -----------------------------------------------------------------------------

def generate_embeddings(texts):
    """Generates embeddings using the Gemini API."""
    if not GEMINI_API_KEY:
        logger.warning("No Gemini API Key. Using dummy vectors.")
        return [np.zeros(768).tolist() for _ in texts]

    client_g = genai.Client(api_key=GEMINI_API_KEY)
    results = []
    batch_size = 10

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        try:
            resp = client_g.models.embed_content(
                model=EMBEDDING_MODEL,
                contents=batch,
            )
            for embedding in resp.embeddings:
                results.append(embedding.values)
        except Exception as e:
            logger.error(f"Embedding batch {i} failed: {e}")
            for _ in batch:
                results.append(np.zeros(768).tolist())

    return results
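As a shape check for the retrieval math used below: text-embedding-004 vectors are 768-dimensional, so the corpus stacks to (N, 768) and a query reshapes to (1, 768). Random stand-ins keep the sketch self-contained:

# Self-contained shape check with random stand-ins for real embeddings:
rng = np.random.default_rng(0)
corpus = rng.normal(size=(12, 768))           # like VECTOR_MATRIX: one row per chunk
query = rng.normal(size=(1, 768))             # like query_vec after .reshape(1, -1)
scores = cosine_similarity(query, corpus)[0]  # shape (12,): one score per chunk
top5 = np.argsort(scores)[::-1][:5]           # indices of the five best chunks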
# -----------------------------------------------------------------------------
# 5. FIREBASE-BACKED INDEX BUILDER
# -----------------------------------------------------------------------------

def load_index_from_firebase():
    """
    Tries to load the full index from Firebase.
    Returns True if successfully loaded.
    """
    global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP

    if not FIREBASE_AVAILABLE:
        return False

    logger.info("Attempting to load index from Firebase...")

    try:
        # Load syllabus map
        fb_syllabi = fb_get("data_api/syllabi")
        if not fb_syllabi:
            logger.info("No syllabus data in Firebase yet.")
            return False

        SYLLABUS_MAP = fb_syllabi

        # Load vector DB
        fb_vectors = fb_get("data_api/vectors")
        if not fb_vectors:
            logger.info("No vector data in Firebase yet.")
            return False

        VECTOR_DB = []
        valid_vectors = []

        for entry in (fb_vectors.values() if isinstance(fb_vectors, dict) else fb_vectors):
            if not entry:
                continue
            vec = np.array(entry["vector"])
            VECTOR_DB.append({
                "vector": vec,
                "meta": entry["meta"]
            })
            valid_vectors.append(vec)

        if valid_vectors:
            VECTOR_MATRIX = np.vstack(valid_vectors)

        # Load exam map
        fb_exams = fb_get("data_api/exams")
        if fb_exams:
            EXAM_MAP = fb_exams

        logger.info(
            f"Loaded from Firebase: {len(SYLLABUS_MAP)} syllabi, "
            f"{len(VECTOR_DB)} vectors, {len(EXAM_MAP)} exam subjects."
        )
        return True

    except Exception as e:
        logger.error(f"Failed to load from Firebase: {e}")
        return False


def save_syllabus_to_firebase(subject_id: str, data: dict):
    """Save a single syllabus entry to Firebase."""
    # Store tree without numpy arrays (just plain dicts)
    fb_set(f"data_api/syllabi/{subject_id}", data)


def save_vectors_to_firebase(vector_entries: list):
    """Save vector entries to Firebase (store as lists, not numpy)."""
    fb_data = {}
    for i, entry in enumerate(vector_entries):
        key = f"v_{i:06d}"
        fb_data[key] = {
            "vector": entry["vector"].tolist() if isinstance(entry["vector"], np.ndarray) else entry["vector"],
            "meta": entry["meta"]
        }
    fb_set("data_api/vectors", fb_data)


def save_exam_to_firebase(subject_id: str, paper_data: dict):
    """Save a parsed exam paper under the subject's exam list."""
    paper_id = paper_data["meta"]["paperId"]
    # Sanitize key (Firebase keys cannot contain . [ ] # $ /)
    safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
    fb_set(f"data_api/exams/{subject_id}/{safe_key}", paper_data)


def build_index():
    """
    Walks directories, parses PDFs, builds the JSON tree and vector index,
    then persists everything to Firebase.
    """
    global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP

    logger.info("🚀 Starting Build Process...")

    # ---- SYLLABI ----
    parsed_data = []

    if os.path.exists(SYLLABI_DIR):
        for root, dirs, files in os.walk(SYLLABI_DIR):
            for file in sorted(files):
                if file.endswith(".pdf"):
                    path = os.path.join(root, file)
                    logger.info(f"Parsing syllabus: {path}")
                    try:
                        parser = PDFParser(path)
                        data = parser.parse()
                        parsed_data.append(data)
                        SYLLABUS_MAP[data["meta"]["id"]] = data
                        save_syllabus_to_firebase(data["meta"]["id"], data)
                    except Exception as e:
                        logger.error(f"Failed to parse {path}: {e}")
    else:
        logger.warning(f"Directory {SYLLABI_DIR} not found.")

    # ---- PAST EXAMS ----
    if os.path.exists(PAST_EXAMS_DIR):
        for root, dirs, files in os.walk(PAST_EXAMS_DIR):
            for file in sorted(files):
                if file.endswith(".pdf"):
                    path = os.path.join(root, file)
                    logger.info(f"Parsing exam paper: {path}")
                    try:
                        parser = ExamPaperParser(path)
                        exam_data = parser.parse()
                        subject_id = exam_data["meta"]["subjectId"]

                        if subject_id not in EXAM_MAP:
                            EXAM_MAP[subject_id] = {}

                        paper_id = exam_data["meta"]["paperId"]
                        safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
                        EXAM_MAP[subject_id][safe_key] = exam_data
                        save_exam_to_firebase(subject_id, exam_data)
                    except Exception as e:
                        logger.error(f"Failed to parse exam {path}: {e}")
    else:
        logger.info(f"No past_exams directory found at {PAST_EXAMS_DIR}. Skipping.")

    # ---- VECTORIZATION (syllabi only) ----
    if not parsed_data:
        logger.info("No new syllabus data to vectorize.")
        return

    chunks_to_embed = []
    chunk_metadata = []

    for item in parsed_data:
        meta_base = item["meta"]
        for topic in item["tree"]:
            for sub in topic.get("children", []):
                text_blob = "\n".join(sub.get("content", []))
                if len(text_blob) < 10:
                    continue

                rich_text = (
                    f"{meta_base['subject']} {meta_base['level']} "
                    f"- {topic['title']} - {sub['title']}:\n{text_blob}"
                )
                chunks_to_embed.append(rich_text)
                chunk_metadata.append({
                    "subject_id": meta_base["id"],
                    "topic_id": topic["id"],
                    "subtopic_id": sub["id"],
                    "title": sub["title"],
                    "content": text_blob
                })

    logger.info(f"🧮 Generating embeddings for {len(chunks_to_embed)} chunks...")
    vectors = generate_embeddings(chunks_to_embed)

    VECTOR_DB = []
    valid_vectors = []

    for i, vec in enumerate(vectors):
        np_vec = np.array(vec)
        VECTOR_DB.append({
            "vector": np_vec,
            "meta": chunk_metadata[i]
        })
        valid_vectors.append(np_vec)

    if valid_vectors:
        VECTOR_MATRIX = np.vstack(valid_vectors)

    # Persist to Firebase
    save_vectors_to_firebase(VECTOR_DB)

    logger.info(
        f"✅ Indexing Complete. "
        f"{len(SYLLABUS_MAP)} syllabi, {len(VECTOR_DB)} vectors, "
        f"{sum(len(v) for v in EXAM_MAP.values())} exam papers."
    )
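After a successful build, the Realtime Database holds three subtrees; the paths come from the fb_set calls above, and the values shown are illustrative:

# data_api/
#   syllabi/A_9706                        -> {"meta": {...}, "tree": [...]}
#   vectors/v_000000 ... v_NNNNNN         -> {"vector": [768 floats], "meta": {...}}
#   exams/A_9702/A_9702_2023_MAY_JUNE_P2  -> {"meta": {...}, "pages": [...], "questions": [...]}
example = fb_get("data_api/syllabi/A_9706")   # one cached syllabus back out, or None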
# -----------------------------------------------------------------------------
# 6. DIRECTORY WATCHER — Auto-index new PDFs
# -----------------------------------------------------------------------------

_indexed_files = set()

def _collect_existing_files():
    """Collect all currently-present PDFs to avoid re-indexing on boot."""
    for d in [SYLLABI_DIR, PAST_EXAMS_DIR]:
        if not os.path.exists(d):
            continue
        for root, _, files in os.walk(d):
            for f in files:
                if f.endswith(".pdf"):
                    _indexed_files.add(os.path.join(root, f))


def _watch_directories(interval=30):
    """Background thread: detect new PDFs and index them."""
    while True:
        time.sleep(interval)
        for directory, is_exam in [(SYLLABI_DIR, False), (PAST_EXAMS_DIR, True)]:
            if not os.path.exists(directory):
                continue
            for root, _, files in os.walk(directory):
                for file in files:
                    if not file.endswith(".pdf"):
                        continue
                    path = os.path.join(root, file)
                    if path in _indexed_files:
                        continue

                    logger.info(f"🆕 New PDF detected: {path}")
                    _indexed_files.add(path)

                    try:
                        if is_exam:
                            parser = ExamPaperParser(path)
                            exam_data = parser.parse()
                            subject_id = exam_data["meta"]["subjectId"]

                            if subject_id not in EXAM_MAP:
                                EXAM_MAP[subject_id] = {}
                            paper_id = exam_data["meta"]["paperId"]
                            safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
                            EXAM_MAP[subject_id][safe_key] = exam_data
                            save_exam_to_firebase(subject_id, exam_data)
                        else:
                            parser = PDFParser(path)
                            data = parser.parse()
                            SYLLABUS_MAP[data["meta"]["id"]] = data
                            save_syllabus_to_firebase(data["meta"]["id"], data)
                            # Re-vectorize just this document
                            _incremental_vectorize(data)

                    except Exception as e:
                        logger.error(f"Error indexing new file {path}: {e}")


def _incremental_vectorize(syllabus_data: dict):
    """Add vectors for a single newly-uploaded syllabus."""
    global VECTOR_DB, VECTOR_MATRIX

    meta_base = syllabus_data["meta"]
    chunks = []
    metas = []

    for topic in syllabus_data["tree"]:
        for sub in topic.get("children", []):
            text_blob = "\n".join(sub.get("content", []))
            if len(text_blob) < 10:
                continue
            rich_text = (
                f"{meta_base['subject']} {meta_base['level']} "
                f"- {topic['title']} - {sub['title']}:\n{text_blob}"
            )
            chunks.append(rich_text)
            metas.append({
                "subject_id": meta_base["id"],
                "topic_id": topic["id"],
                "subtopic_id": sub["id"],
                "title": sub["title"],
                "content": text_blob
            })

    if not chunks:
        return

    vectors = generate_embeddings(chunks)

    for i, vec in enumerate(vectors):
        np_vec = np.array(vec)
        VECTOR_DB.append({"vector": np_vec, "meta": metas[i]})

    if VECTOR_DB:
        VECTOR_MATRIX = np.vstack([e["vector"] for e in VECTOR_DB])

    # Persist the full updated vector set
    save_vectors_to_firebase(VECTOR_DB)
    logger.info(f"Incremental vectorize complete for {meta_base['id']}.")
# -----------------------------------------------------------------------------
# 7. API ENDPOINTS
# -----------------------------------------------------------------------------

@app.route('/health', methods=['GET'])
def health():
    return jsonify({
        "status": "online",
        "subjects_loaded": list(SYLLABUS_MAP.keys()),
        "vector_chunks": len(VECTOR_DB),
        "exam_subjects": list(EXAM_MAP.keys()),
        "firebase": FIREBASE_AVAILABLE
    })


@app.route('/v1/structure/<subject_id>', methods=['GET'])
def get_structure(subject_id):
    data = SYLLABUS_MAP.get(subject_id)
    if not data:
        return jsonify({"error": "Subject not found"}), 404
    return jsonify(data)


@app.route('/v1/subjects', methods=['GET'])
def list_subjects():
    """Returns metadata for all indexed syllabi."""
    result = []
    for sid, data in SYLLABUS_MAP.items():
        result.append(data.get("meta", {"id": sid}))
    return jsonify(result)
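A client-side sketch for the read endpoints, assuming the third-party requests package and a local run on the port configured at the bottom of the file; A_9706 is the example subject id used in the comments above:

import requests  # assumed to be installed; any HTTP client works

BASE = "http://localhost:7860"
print(requests.get(f"{BASE}/health").json())               # status + index counts
print(requests.get(f"{BASE}/v1/subjects").json())          # list of syllabus meta dicts
print(requests.get(f"{BASE}/v1/structure/A_9706").json())  # full tree for one subject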
@app.route('/v1/search', methods=['POST'])
def search():
    """
    Semantic Retrieval.
    Input: { "query": "...", "filter_subject_id": "..." (optional) }
    """
    if VECTOR_MATRIX is None or len(VECTOR_DB) == 0:
        return jsonify({"error": "Index not ready"}), 503

    data = request.json or {}
    query = data.get("query")
    subject_filter = data.get("filter_subject_id")

    if not query:
        return jsonify({"error": "Query required"}), 400

    if not GEMINI_API_KEY:
        return jsonify({"error": "Embedding API not configured"}), 503

    client_g = genai.Client(api_key=GEMINI_API_KEY)
    try:
        resp = client_g.models.embed_content(model=EMBEDDING_MODEL, contents=query)
        query_vec = np.array(resp.embeddings[0].values).reshape(1, -1)
    except Exception as e:
        return jsonify({"error": str(e)}), 500

    scores = cosine_similarity(query_vec, VECTOR_MATRIX)[0]
    top_indices = np.argsort(scores)[::-1]

    results = []
    count = 0
    for idx in top_indices:
        if scores[idx] < 0.3:
            break
        entry = VECTOR_DB[idx]
        meta = entry["meta"]

        if subject_filter and meta["subject_id"] != subject_filter:
            continue

        results.append({
            "score": float(scores[idx]),
            "subject_id": meta["subject_id"],
            "title": meta["title"],
            "content": meta["content"],
            "node_id": meta["subtopic_id"]
        })

        count += 1
        if count >= 5:
            break

    return jsonify({"results": results})
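And a matching sketch for the search endpoint (same assumptions as above):

import requests  # assumed to be installed

hits = requests.post(
    "http://localhost:7860/v1/search",
    json={"query": "double entry bookkeeping", "filter_subject_id": "A_9706"},
).json()
for hit in hits.get("results", []):   # at most 5 results, each with score >= 0.3
    print(round(hit["score"], 3), hit["title"], hit["node_id"])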
@app.route('/v1/exams', methods=['GET'])
def list_exams():
    """
    List past exam papers.
    Query param: subject_id (optional)
    """
    subject_id = request.args.get("subject_id")

    if subject_id:
        papers = EXAM_MAP.get(subject_id, {})
        result = [p["meta"] for p in papers.values() if isinstance(p, dict) and "meta" in p]
    else:
        result = []
        for sid, papers in EXAM_MAP.items():
            for p in papers.values():
                if isinstance(p, dict) and "meta" in p:
                    result.append(p["meta"])

    return jsonify(result)


@app.route('/v1/exams/<paper_id>', methods=['GET'])
def get_exam(paper_id):
    """
    Get a full exam paper (pages + questions).
    paper_id format: A_9702_2023_MAY_P1
    """
    safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)

    for sid, papers in EXAM_MAP.items():
        for key, paper in papers.items():
            if key == safe_key or (isinstance(paper, dict) and
                                   paper.get("meta", {}).get("paperId") == paper_id):
                return jsonify(paper)

    return jsonify({"error": "Exam paper not found"}), 404


@app.route('/v1/exams/<paper_id>/questions', methods=['GET'])
def get_exam_questions(paper_id):
    """Get just the extracted questions from a past paper."""
    safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)

    for sid, papers in EXAM_MAP.items():
        for key, paper in papers.items():
            if key == safe_key or (isinstance(paper, dict) and
                                   paper.get("meta", {}).get("paperId") == paper_id):
                return jsonify({
                    "paperId": paper_id,
                    "meta": paper.get("meta"),
                    "questions": paper.get("questions", [])
                })

    return jsonify({"error": "Exam paper not found"}), 404
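The exam endpoints compose the same way; the paper id here matches the hypothetical filename parsed in the ExamPaperParser example above:

import requests  # assumed to be installed

BASE = "http://localhost:7860"
papers = requests.get(f"{BASE}/v1/exams", params={"subject_id": "A_9702"}).json()
qs = requests.get(f"{BASE}/v1/exams/A_9702_2023_MAY_JUNE_P2/questions").json()
print(len(papers), "papers;", len(qs.get("questions", [])), "questions in P2")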
@app.route('/v1/rebuild', methods=['POST'])
def trigger_rebuild():
    """
    Trigger a full index rebuild (admin use).
    Optionally pass { "force": true } to bypass the Firebase cache
    (not yet read by the handler; every rebuild starts from scratch).
    """
    auth_header = request.headers.get("Authorization", "")
    rebuild_key = os.environ.get("REBUILD_SECRET", "")
    if rebuild_key and auth_header != f"Bearer {rebuild_key}":
        return jsonify({"error": "Unauthorized"}), 401

    def _rebuild_bg():
        global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP
        SYLLABUS_MAP = {}
        VECTOR_DB = []
        VECTOR_MATRIX = None
        EXAM_MAP = {}
        build_index()

    t = threading.Thread(target=_rebuild_bg, daemon=True)
    t.start()
    return jsonify({"status": "rebuild started"}), 202
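Rebuilds are guarded by REBUILD_SECRET when it is set; a sketch of an authorized call (assumes the env var is present and requests is installed):

import os
import requests  # assumed to be installed

resp = requests.post(
    "http://localhost:7860/v1/rebuild",
    headers={"Authorization": f"Bearer {os.environ['REBUILD_SECRET']}"},
)
print(resp.status_code, resp.json())  # expect 202 {"status": "rebuild started"}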
# -----------------------------------------------------------------------------
# 8. STARTUP BOOTSTRAP
# -----------------------------------------------------------------------------

def start_app():
    # Create directories if needed
    for d in [SYLLABI_DIR, PAST_EXAMS_DIR]:
        if not os.path.exists(d):
            os.makedirs(os.path.join(d, "A"), exist_ok=True)
            os.makedirs(os.path.join(d, "O"), exist_ok=True)
            logger.info(f"Created empty directory: {d}")

    # Try to load from Firebase first
    loaded = load_index_from_firebase()

    if not loaded:
        # Build from scratch
        build_index()
    else:
        logger.info("Serving from Firebase cache. Skipping full rebuild.")

    # Collect existing files so the watcher doesn't re-index them
    _collect_existing_files()

    # Start background watcher for new uploads
    watcher = threading.Thread(target=_watch_directories, daemon=True)
    watcher.start()
    logger.info("Directory watcher started.")


with app.app_context():
    start_app()

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)