rairo committed on
Commit
544ec28
·
verified ·
1 Parent(s): f762b45

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +710 -137
main.py CHANGED
@@ -3,39 +3,141 @@ import json
3
  import logging
4
  import re
5
  import time
 
6
  import numpy as np
7
  import fitz # PyMuPDF
8
  from flask import Flask, request, jsonify
9
  from flask_cors import CORS
10
  from google import genai
11
- from google.genai import types
12
  from sklearn.metrics.pairwise import cosine_similarity
13
 
 
 
 
14
  # --- CONFIGURATION ---
15
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
16
  logger = logging.getLogger(__name__)
17
 
18
- # Directory where your PDFs live (e.g., ./syllabi/A/Physics.pdf)
19
  SYLLABI_DIR = "syllabi"
20
- INDEX_FILE = "syllabus_index.json" # Local cache file
21
 
22
  # Google GenAI Config
23
  GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
24
  EMBEDDING_MODEL = "models/text-embedding-004"
25
 
26
- # --- GLOBAL STATE (IN-MEMORY) ---
27
- # Structure: { "A_9706": { "title": "Accounting", "tree": [...] }, ... }
28
- SYLLABUS_MAP = {}
29
 
30
- # Structure: [ { "id": "...", "vector": [...], "text": "...", "meta": {...} } ]
31
  VECTOR_DB = []
32
- VECTOR_MATRIX = None # Numpy array for fast math
 
 
 
33
 
34
  app = Flask(__name__)
35
  CORS(app)
36
 
37
  # -----------------------------------------------------------------------------
38
- # 1. THE PARSER ENGINE (Extracts Structure from PDF)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  # -----------------------------------------------------------------------------
40
 
41
  class PDFParser:
@@ -43,15 +145,12 @@ class PDFParser:
43
  self.filepath = filepath
44
  self.filename = os.path.basename(filepath)
45
  self.doc = fitz.open(filepath)
46
-
47
- # Determine Subject and Level from filename/path
48
- # Expected: syllabi/A/Accounting_9706.pdf
49
- parts = filepath.split(os.sep)
50
  self.level = parts[-2] if len(parts) > 1 else "General"
51
- # Extract code if present (e.g., 9618)
52
  self.subject_code = re.search(r'\d{4}', self.filename)
53
  self.subject_code = self.subject_code.group(0) if self.subject_code else "0000"
54
- self.subject_name = self.filename.split('_')[0]
55
  self.unique_id = f"{self.level}_{self.subject_code}"
56
 
57
  def get_font_characteristics(self):
@@ -64,57 +163,86 @@ class PDFParser:
64
  for s in l.get("spans", []):
65
  size = round(s["size"], 1)
66
  font_sizes[size] = font_sizes.get(size, 0) + len(s["text"])
67
-
68
- # The font size with the most characters is likely the "Body Text"
69
- if not font_sizes: return 10.0
70
  return max(font_sizes, key=font_sizes.get)
71
 
72
- def parse(self):
73
  """
74
- Heuristic parsing:
75
- - Text significantly larger than body = Topic
76
- - Bold text slightly larger than body = Subtopic
77
- - Body text = Content/Objectives
78
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  body_size = self.get_font_characteristics()
80
- logger.info(f"Parsing {self.filename} (Body size approx {body_size}pt)")
 
81
 
82
  syllabus_tree = []
83
  current_topic = None
84
  current_subtopic = None
85
-
86
- # Regex to detect "Topic 1" or "1.1" or "Key Question"
87
  topic_pattern = re.compile(r'^(\d+\.?\s|Key Question\s)', re.IGNORECASE)
88
 
89
- for page in self.doc:
 
 
 
 
90
  blocks = page.get_text("dict")["blocks"]
91
  for b in blocks:
92
  block_text = ""
93
  max_size = 0
94
  is_bold = False
95
-
96
- # Reconstruct line text and finding max font style
97
  for l in b.get("lines", []):
98
  for s in l.get("spans", []):
99
  text = s["text"].strip()
100
- if not text: continue
 
101
  block_text += text + " "
102
- if s["size"] > max_size: max_size = s["size"]
103
- if "bold" in s["font"].lower(): is_bold = True
104
-
 
 
105
  block_text = block_text.strip()
106
- if len(block_text) < 3: continue # Skip noise
 
 
 
 
 
107
 
108
- # HEURISTIC 1: TOPIC (Large Header)
109
- # Usually 2pt+ larger than body
110
  if max_size > body_size + 2:
111
- # Save previous
112
  if current_subtopic and current_topic:
113
  current_topic["children"].append(current_subtopic)
114
  current_subtopic = None
115
  if current_topic:
116
  syllabus_tree.append(current_topic)
117
-
118
  current_topic = {
119
  "id": f"{self.unique_id}_{len(syllabus_tree)}",
120
  "title": block_text,
@@ -123,15 +251,19 @@ class PDFParser:
123
  }
124
  current_subtopic = None
125
 
126
- # HEURISTIC 2: SUBTOPIC (Bold, slightly larger or same size as body)
127
- # Must start with number or specific keyword to reduce noise
128
- elif (is_bold and max_size >= body_size) or (topic_pattern.match(block_text) and max_size >= body_size):
129
  if current_subtopic and current_topic:
130
  current_topic["children"].append(current_subtopic)
131
-
132
- # If no topic exists yet, create a dummy one
133
  if not current_topic:
134
- current_topic = {"id": f"{self.unique_id}_root", "title": "Syllabus Overview", "type": "topic", "children": []}
 
 
 
 
 
135
 
136
  current_subtopic = {
137
  "id": f"{current_topic['id']}_{len(current_topic['children'])}",
@@ -145,11 +277,9 @@ class PDFParser:
145
  if current_subtopic:
146
  current_subtopic["content"].append(block_text)
147
  elif current_topic:
148
- # Sometimes text appears directly under a topic
149
- # Create implicit subtopic
150
  current_subtopic = {
151
  "id": f"{current_topic['id']}_intro",
152
- "title": "Introduction / Overview",
153
  "type": "subtopic",
154
  "content": [block_text]
155
  }
@@ -165,82 +295,308 @@ class PDFParser:
165
  "id": self.unique_id,
166
  "subject": self.subject_name,
167
  "code": self.subject_code,
168
- "level": self.level
 
 
169
  },
170
  "tree": syllabus_tree
171
  }
172
 
 
173
  # -----------------------------------------------------------------------------
174
- # 2. THE VECTOR ENGINE (Embeddings & Search)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  # -----------------------------------------------------------------------------
176
 
177
def generate_embeddings(texts):
    """Generates embeddings using Gemini API (Batching recommended for production).

    Returns one numpy vector per input text, in input order. When no API key
    is configured, or when a batch fails, zero vectors are returned for the
    affected texts so the output always lines up with ``texts``.
    """
    # Size of the zero-vector placeholders used when no real embedding exists.
    placeholder_dim = 768

    def _placeholders(count):
        return [np.zeros(placeholder_dim) for _ in range(count)]

    if not GEMINI_API_KEY:
        logger.warning("No Gemini API Key found. Skipping embeddings.")
        return _placeholders(len(texts))  # Dummy vectors

    client = genai.Client(api_key=GEMINI_API_KEY)
    results = []

    # Simple batching to avoid hitting limits
    batch_size = 10
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        try:
            resp = client.models.embed_content(
                model=EMBEDDING_MODEL,
                contents=batch,
            )
            # Build the whole batch before touching `results`, so a failure
            # part-way through cannot leave partial vectors behind.
            batch_vectors = [np.array(embedding.values) for embedding in resp.embeddings]
            if len(batch_vectors) != len(batch):
                raise ValueError(
                    f"expected {len(batch)} embeddings, got {len(batch_vectors)}"
                )
        except Exception as e:
            logger.error(f"Embedding failed: {e}")
            # Fallback for failed batch
            batch_vectors = _placeholders(len(batch))
        results.extend(batch_vectors)

    return results
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  def build_index():
206
- """Walks the directory, parses PDFs, builds JSON tree and Vector Index."""
207
- global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX
208
-
 
 
 
209
  logger.info("🚀 Starting Build Process...")
210
-
211
- # 1. Walk Directory
212
- if not os.path.exists(SYLLABI_DIR):
213
- logger.error(f"Directory {SYLLABI_DIR} not found.")
214
- return
215
 
 
216
  parsed_data = []
217
-
218
- for root, dirs, files in os.walk(SYLLABI_DIR):
219
- for file in files:
220
- if file.endswith(".pdf"):
221
- path = os.path.join(root, file)
222
- parser = PDFParser(path)
223
- data = parser.parse()
224
- parsed_data.append(data)
225
-
226
- # Store in Map
227
- SYLLABUS_MAP[data["meta"]["id"]] = data
228
-
229
- # 2. Flatten for Vectorization
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  chunks_to_embed = []
231
  chunk_metadata = []
232
 
233
  for item in parsed_data:
234
  meta_base = item["meta"]
235
  for topic in item["tree"]:
236
- for sub in topic["children"]:
237
- # Create a rich semantic chunk
238
- # Format: "Subject Level - Topic - Subtopic: Content"
239
- text_blob = "\n".join(sub["content"])
240
- if len(text_blob) < 10: continue # Skip empty chunks
241
-
242
- rich_text = f"{meta_base['subject']} {meta_base['level']} - {topic['title']} - {sub['title']}:\n{text_blob}"
243
-
 
244
  chunks_to_embed.append(rich_text)
245
  chunk_metadata.append({
246
  "subject_id": meta_base["id"],
@@ -250,33 +606,149 @@ def build_index():
250
  "content": text_blob
251
  })
252
 
253
- # 3. Generate Embeddings
254
  logger.info(f"🧮 Generating embeddings for {len(chunks_to_embed)} chunks...")
255
  vectors = generate_embeddings(chunks_to_embed)
256
 
257
- # 4. Populate Global DB
258
  VECTOR_DB = []
259
  valid_vectors = []
260
-
261
  for i, vec in enumerate(vectors):
 
262
  VECTOR_DB.append({
263
- "vector": vec, # Keep for debug/individual access
264
  "meta": chunk_metadata[i]
265
  })
266
- valid_vectors.append(vec)
267
 
268
  if valid_vectors:
269
  VECTOR_MATRIX = np.vstack(valid_vectors)
270
-
271
- logger.info("✅ Indexing Complete.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
  # -----------------------------------------------------------------------------
274
- # 3. API SERVER (The Retrieval Layer)
275
  # -----------------------------------------------------------------------------
276
 
277
@app.route('/health', methods=['GET'])
def health():
    """Liveness endpoint: reports status and the ids of the syllabi held in memory."""
    loaded_subject_ids = list(SYLLABUS_MAP.keys())
    payload = {"status": "online", "subjects_loaded": loaded_subject_ids}
    return jsonify(payload)
 
 
 
 
 
 
 
280
 
281
  @app.route('/v1/structure/<subject_id>', methods=['GET'])
282
  def get_structure(subject_id):
@@ -286,82 +758,183 @@ def get_structure(subject_id):
286
  return jsonify({"error": "Subject not found"}), 404
287
  return jsonify(data)
288
 
 
 
 
 
 
 
 
 
 
 
289
@app.route('/v1/search', methods=['POST'])
def search():
    """
    Semantic Retrieval.
    Input: { "query": "...", "filter_subject_id": "..." (optional) }

    Responds with {"results": [...]} holding at most 5 chunks, best score
    first, each with score, subject_id, title, content and node_id.
    Error responses: 503 when the index is not built, 400 when no query is
    supplied (including a missing or non-object JSON body), 500 when the
    query embedding call fails.
    """
    min_score = 0.3   # results below this cosine similarity are dropped
    max_results = 5   # cap on returned chunks

    if VECTOR_MATRIX is None:
        return jsonify({"error": "Index not ready"}), 503

    # A missing, malformed or non-object body is treated as an empty request
    # so the caller gets the 400 below instead of an unhandled AttributeError.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    query = data.get("query")
    subject_filter = data.get("filter_subject_id")

    if not query:
        return jsonify({"error": "Query required"}), 400

    # 1. Embed Query
    client = genai.Client(api_key=GEMINI_API_KEY)
    try:
        resp = client.models.embed_content(model=EMBEDDING_MODEL, contents=query)
        query_vec = np.array(resp.embeddings[0].values).reshape(1, -1)
    except Exception as e:
        return jsonify({"error": str(e)}), 500

    # 2. Vector Search (Cosine Similarity); scores has one entry per chunk
    scores = cosine_similarity(query_vec, VECTOR_MATRIX)[0]

    # 3. Filter and Sort: walk chunks from best to worst score
    results = []
    for idx in np.argsort(scores)[::-1]:
        if scores[idx] < min_score:
            break  # everything after this is below the threshold too

        meta = VECTOR_DB[idx]["meta"]
        if subject_filter and meta["subject_id"] != subject_filter:
            continue

        results.append({
            "score": float(scores[idx]),
            "subject_id": meta["subject_id"],
            "title": meta["title"],
            "content": meta["content"],      # Raw text chunk
            "node_id": meta["subtopic_id"],  # Pointer to the structure tree
        })
        if len(results) >= max_results:
            break

    return jsonify({"results": results})
345
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  # -----------------------------------------------------------------------------
347
- # 4. STARTUP BOOTSTRAP
348
  # -----------------------------------------------------------------------------
349
 
350
def start_app():
    """Prepare the syllabus directory if it is missing, then build the index."""
    # The index is rebuilt on every boot; nothing is loaded from disk here.
    syllabi_missing = not os.path.exists(SYLLABI_DIR)
    if syllabi_missing:
        for level_dir in ("A", "O"):
            os.makedirs(os.path.join(SYLLABI_DIR, level_dir), exist_ok=True)
        logger.warning(f"Created empty {SYLLABI_DIR}. Please add PDFs.")

    # Run Indexer
    build_index()
360
-
361
- # Run the builder once on import (or server start)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  with app.app_context():
363
  start_app()
364
 
365
  if __name__ == '__main__':
366
- # Use 7860 for HF Spaces
367
  app.run(host='0.0.0.0', port=7860)
 
3
  import logging
4
  import re
5
  import time
6
+ import threading
7
  import numpy as np
8
  import fitz # PyMuPDF
9
  from flask import Flask, request, jsonify
10
  from flask_cors import CORS
11
  from google import genai
 
12
  from sklearn.metrics.pairwise import cosine_similarity
13
 
14
+ import firebase_admin
15
+ from firebase_admin import credentials, db as firebase_db
16
+
17
  # --- CONFIGURATION ---
18
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
19
  logger = logging.getLogger(__name__)
20
 
 
21
  SYLLABI_DIR = "syllabi"
22
+ PAST_EXAMS_DIR = "past_exams"
23
 
24
  # Google GenAI Config
25
  GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
26
  EMBEDDING_MODEL = "models/text-embedding-004"
27
 
28
+ # --- GLOBAL STATE (IN-MEMORY CACHE) ---
29
+ # Structure: { "A_9706": { "meta": {...}, "tree": [...] }, ... }
30
+ SYLLABUS_MAP = {}
31
 
32
+ # Structure: [ { "vector": [...], "meta": {...} } ]
33
  VECTOR_DB = []
34
+ VECTOR_MATRIX = None # Numpy array for fast math
35
+
36
+ # Past exam index: { "A_9706": [ { paperId, year, session, fileUrl, pages: [...] }, ... ] }
37
+ EXAM_MAP = {}
38
 
39
  app = Flask(__name__)
40
  CORS(app)
41
 
42
  # -----------------------------------------------------------------------------
43
+ # 0. FIREBASE INITIALIZATION
44
+ # -----------------------------------------------------------------------------
45
+
46
+ firebase_db_ref = None
47
+
48
+ def init_firebase():
49
+ global firebase_db_ref
50
+ try:
51
+ credentials_json_string = os.environ.get("FIREBASE")
52
+ if not credentials_json_string:
53
+ logger.warning("FIREBASE env var not set. Firebase caching disabled.")
54
+ return False
55
+
56
+ credentials_json = json.loads(credentials_json_string)
57
+ firebase_db_url = os.environ.get("Firebase_DB")
58
+
59
+ if not firebase_db_url:
60
+ logger.warning("Firebase_DB env var not set. Firebase caching disabled.")
61
+ return False
62
+
63
+ if not firebase_admin._apps:
64
+ cred = credentials.Certificate(credentials_json)
65
+ firebase_admin.initialize_app(cred, {"databaseURL": firebase_db_url})
66
+
67
+ firebase_db_ref = firebase_db.reference()
68
+ logger.info("Firebase initialized successfully in Data API.")
69
+ return True
70
+ except Exception as e:
71
+ logger.error(f"Firebase init failed: {e}")
72
+ return False
73
+
74
+ FIREBASE_AVAILABLE = init_firebase()
75
+
76
+ def fb_set(path: str, data):
77
+ """Write to Firebase, silently fail if unavailable."""
78
+ if not FIREBASE_AVAILABLE or firebase_db_ref is None:
79
+ return
80
+ try:
81
+ firebase_db_ref.child(path).set(data)
82
+ except Exception as e:
83
+ logger.error(f"Firebase write failed [{path}]: {e}")
84
+
85
+ def fb_get(path: str):
86
+ """Read from Firebase, return None if unavailable."""
87
+ if not FIREBASE_AVAILABLE or firebase_db_ref is None:
88
+ return None
89
+ try:
90
+ return firebase_db_ref.child(path).get()
91
+ except Exception as e:
92
+ logger.error(f"Firebase read failed [{path}]: {e}")
93
+ return None
94
+
95
+ # -----------------------------------------------------------------------------
96
+ # 1. BOILERPLATE PAGE DETECTION
97
+ # -----------------------------------------------------------------------------
98
+
99
+ # Keywords that identify non-content pages to skip
100
+ BOILERPLATE_TITLE_PATTERNS = re.compile(
101
+ r'^\s*(about\s+(this\s+)?syllabus|foreword|acknowledgements?|introduction\s+to\s+(cambridge|zimsec)|'
102
+ r'how\s+to\s+use\s+this\s+syllabus|why\s+choose\s+cambridge|support\s+for\s+teachers|'
103
+ r'teacher\s+support|resource\s+list|list\s+of\s+resources|further\s+information|'
104
+ r'copyright|legal\s+notice|syllabus\s+overview\s+at\s+a\s+glance|'
105
+ r'assessment\s+at\s+a\s+glance|grade\s+descriptions|mathematical\s+notation|'
106
+ r'command\s+words|glossary\s+of\s+command|changes\s+to\s+this\s+syllabus|'
107
+ r'other\s+cambridge|university\s+of\s+cambridge|cambridge\s+assessment|'
108
+ r'published\s+by|contents\s*$|table\s+of\s+contents)\s*$',
109
+ re.IGNORECASE
110
+ )
111
+
112
+ # Keywords that signal content has actually started
113
+ CONTENT_START_PATTERNS = re.compile(
114
+ r'^\s*((syllabus\s+)?content|subject\s+content|unit\s+\d|topic\s+\d|'
115
+ r'section\s+\d|module\s+\d|\d+\s+[A-Z]|component\s+\d|paper\s+\d|'
116
+ r'scheme\s+of\s+work|learning\s+objectives|knowledge.*understanding)',
117
+ re.IGNORECASE
118
+ )
119
+
120
+ def is_boilerplate_block(text: str) -> bool:
121
+ """Returns True if this block is boilerplate/admin content to skip."""
122
+ return bool(BOILERPLATE_TITLE_PATTERNS.match(text.strip()))
123
+
124
+ def page_is_boilerplate(page_text: str) -> bool:
125
+ """Returns True if the entire page appears to be admin/front-matter."""
126
+ lines = [l.strip() for l in page_text.splitlines() if l.strip()]
127
+ if not lines:
128
+ return True
129
+ # Check first substantive line
130
+ first = lines[0]
131
+ if BOILERPLATE_TITLE_PATTERNS.match(first):
132
+ return True
133
+ # Check if page is very short (< 5 lines) with no numbered items — likely a divider
134
+ if len(lines) < 5 and not re.search(r'\d+\.\d+|\d+\s+[A-Z]', page_text):
135
+ # Could be a section divider page — not boilerplate but also empty
136
+ pass
137
+ return False
138
+
139
+ # -----------------------------------------------------------------------------
140
+ # 2. THE PARSER ENGINE (Extracts Structure from PDF)
141
  # -----------------------------------------------------------------------------
142
 
143
  class PDFParser:
 
145
  self.filepath = filepath
146
  self.filename = os.path.basename(filepath)
147
  self.doc = fitz.open(filepath)
148
+
149
+ parts = filepath.replace("\\", "/").split("/")
 
 
150
  self.level = parts[-2] if len(parts) > 1 else "General"
 
151
  self.subject_code = re.search(r'\d{4}', self.filename)
152
  self.subject_code = self.subject_code.group(0) if self.subject_code else "0000"
153
+ self.subject_name = re.sub(r'[_\-]\d{4}.*', '', self.filename.replace('_', ' ')).strip()
154
  self.unique_id = f"{self.level}_{self.subject_code}"
155
 
156
  def get_font_characteristics(self):
 
163
  for s in l.get("spans", []):
164
  size = round(s["size"], 1)
165
  font_sizes[size] = font_sizes.get(size, 0) + len(s["text"])
166
+ if not font_sizes:
167
+ return 10.0
 
168
  return max(font_sizes, key=font_sizes.get)
169
 
170
+ def _find_content_start_page(self) -> int:
171
  """
172
+ Scans pages to find where actual syllabus content begins.
173
+ Returns the 0-based page index.
 
 
174
  """
175
+ for page_num, page in enumerate(self.doc):
176
+ text = page.get_text("text")
177
+ # Skip empty pages
178
+ if len(text.strip()) < 30:
179
+ continue
180
+ # Skip boilerplate pages
181
+ if page_is_boilerplate(text):
182
+ continue
183
+ # Look for numbered content sections
184
+ if CONTENT_START_PATTERNS.search(text):
185
+ logger.info(f" Content starts at page {page_num + 1} for {self.filename}")
186
+ return page_num
187
+ # Also check if this page has numbered topic headers (e.g. "1 Number" or "1.1 ...")
188
+ if re.search(r'\n\s*\d+\.?\d*\s+[A-Z][a-z]', text):
189
+ logger.info(f" Content (numbered) starts at page {page_num + 1} for {self.filename}")
190
+ return page_num
191
+
192
+ # Fallback: skip first 10% of pages (usually all front-matter)
193
+ fallback = max(1, len(self.doc) // 10)
194
+ logger.info(f" Using fallback content start page {fallback + 1} for {self.filename}")
195
+ return fallback
196
+
197
+ def parse(self):
198
  body_size = self.get_font_characteristics()
199
+ content_start = self._find_content_start_page()
200
+ logger.info(f"Parsing {self.filename} (Body size ~{body_size}pt, content from page {content_start + 1})")
201
 
202
  syllabus_tree = []
203
  current_topic = None
204
  current_subtopic = None
205
+
 
206
  topic_pattern = re.compile(r'^(\d+\.?\s|Key Question\s)', re.IGNORECASE)
207
 
208
+ for page_num, page in enumerate(self.doc):
209
+ # Skip pre-content pages entirely
210
+ if page_num < content_start:
211
+ continue
212
+
213
  blocks = page.get_text("dict")["blocks"]
214
  for b in blocks:
215
  block_text = ""
216
  max_size = 0
217
  is_bold = False
218
+
 
219
  for l in b.get("lines", []):
220
  for s in l.get("spans", []):
221
  text = s["text"].strip()
222
+ if not text:
223
+ continue
224
  block_text += text + " "
225
+ if s["size"] > max_size:
226
+ max_size = s["size"]
227
+ if "bold" in s["font"].lower():
228
+ is_bold = True
229
+
230
  block_text = block_text.strip()
231
+ if len(block_text) < 3:
232
+ continue
233
+
234
+ # Skip boilerplate blocks even within content pages
235
+ if is_boilerplate_block(block_text):
236
+ continue
237
 
238
+ # HEURISTIC 1: TOPIC (Large Header — 2pt+ above body)
 
239
  if max_size > body_size + 2:
 
240
  if current_subtopic and current_topic:
241
  current_topic["children"].append(current_subtopic)
242
  current_subtopic = None
243
  if current_topic:
244
  syllabus_tree.append(current_topic)
245
+
246
  current_topic = {
247
  "id": f"{self.unique_id}_{len(syllabus_tree)}",
248
  "title": block_text,
 
251
  }
252
  current_subtopic = None
253
 
254
+ # HEURISTIC 2: SUBTOPIC (Bold, numbered, or keyword-led)
255
+ elif (is_bold and max_size >= body_size) or \
256
+ (topic_pattern.match(block_text) and max_size >= body_size):
257
  if current_subtopic and current_topic:
258
  current_topic["children"].append(current_subtopic)
259
+
 
260
  if not current_topic:
261
+ current_topic = {
262
+ "id": f"{self.unique_id}_root",
263
+ "title": "Syllabus Content",
264
+ "type": "topic",
265
+ "children": []
266
+ }
267
 
268
  current_subtopic = {
269
  "id": f"{current_topic['id']}_{len(current_topic['children'])}",
 
277
  if current_subtopic:
278
  current_subtopic["content"].append(block_text)
279
  elif current_topic:
 
 
280
  current_subtopic = {
281
  "id": f"{current_topic['id']}_intro",
282
+ "title": "Overview",
283
  "type": "subtopic",
284
  "content": [block_text]
285
  }
 
295
  "id": self.unique_id,
296
  "subject": self.subject_name,
297
  "code": self.subject_code,
298
+ "level": self.level,
299
+ "filename": self.filename,
300
+ "indexed_at": int(time.time())
301
  },
302
  "tree": syllabus_tree
303
  }
304
 
305
+
306
  # -----------------------------------------------------------------------------
307
+ # 3. PAST EXAM PAPER PARSER
308
+ # -----------------------------------------------------------------------------
309
+
310
class ExamPaperParser:
    """
    Extracts metadata and full text from past exam PDFs.
    Expected naming: syllabi_code_year_session_paper.pdf
    E.g.: 9702_2023_May_Paper1.pdf or 9702_2023_s1.pdf
    Fields that cannot be parsed from the filename fall back to defaults
    (code "0000", year "Unknown", session "Unknown", paper "1").
    """

    # A run of exactly four digits. Digit lookarounds are used instead of \b
    # because "_" is a word character: \b never matches between a digit and
    # an underscore, which is the separator in the expected filenames.
    _FOUR_DIGITS = re.compile(r'(?<!\d)(\d{4})(?!\d)')
    _YEAR = re.compile(r'(20\d{2}|19\d{2})')
    # Session markers (May/June, Oct/Nov, etc.). A bare month name such as
    # "May" on its own is not recognised by this pattern.
    _SESSION = re.compile(
        r'(may[_\-]?june|oct[_\-]?nov|feb[_\-]?mar|summer|winter|s\d|w\d|m\d)',
        re.IGNORECASE
    )
    _PAPER = re.compile(r'[_\-]p(\d)|paper[\s_\-]?(\d)', re.IGNORECASE)
    # A question starts with a 1-2 digit number followed by "." or ")" and
    # runs until the next such marker or the end of the text.
    _QUESTION = re.compile(
        r'(?:^|\n)\s*(\d{1,2})\s*[\.\)]\s+(.+?)(?=\n\s*\d{1,2}\s*[\.\)]|\Z)',
        re.DOTALL | re.MULTILINE
    )

    def __init__(self, filepath):
        self.filepath = filepath
        self.filename = os.path.basename(filepath)
        self.doc = fitz.open(filepath)

        # The parent directory name is used as the level (e.g. past_exams/A/...).
        parts = filepath.replace("\\", "/").split("/")
        self.level = parts[-2] if len(parts) > 1 else "General"

        fields = self._parse_filename(self.filename)
        self.subject_code = fields["subject_code"]
        self.unique_id = f"{self.level}_{self.subject_code}"
        self.year = fields["year"]
        self.session = fields["session"]
        self.paper_num = fields["paper_num"]

        self.paper_id = f"{self.unique_id}_{self.year}_{self.session}_P{self.paper_num}"

    @classmethod
    def _parse_filename(cls, filename):
        """Return subject_code, year, session and paper_num parsed from a filename."""
        tokens = cls._FOUR_DIGITS.findall(filename)

        # Subject code: the first four-digit run in the name.
        subject_code = tokens[0] if tokens else "0000"

        # Year: prefer a year-like run other than the subject code, so a code
        # such as 2058 is not mistaken for the year when a real one follows.
        later_years = [t for t in tokens[1:] if cls._YEAR.fullmatch(t)]
        if later_years:
            year = later_years[0]
        elif tokens and cls._YEAR.fullmatch(tokens[0]):
            year = tokens[0]
        else:
            year = "Unknown"

        session_match = cls._SESSION.search(filename)
        session = session_match.group(1).upper() if session_match else "Unknown"

        paper_match = cls._PAPER.search(filename)
        if paper_match:
            paper_num = paper_match.group(1) or paper_match.group(2)
        else:
            paper_num = "1"

        return {
            "subject_code": subject_code,
            "year": year,
            "session": session,
            "paper_num": paper_num,
        }

    def extract_pages(self):
        """Extract text per page; pages with no text are omitted."""
        pages = []
        for i, page in enumerate(self.doc):
            text = page.get_text("text").strip()
            if text:
                pages.append({
                    "page": i + 1,
                    "text": text[:3000]  # cap per page to avoid huge payloads
                })
        return pages

    def extract_questions(self, pages=None):
        """
        Heuristic: questions usually start with a number followed by a period/bracket.
        E.g. "1." or "2)" at the start of a line.
        Returns list of { number, text }.

        ``pages`` may be the result of a previous ``extract_pages()`` call to
        avoid re-reading the document; when omitted the pages are extracted
        here. Questions are found in the capped per-page text, and each
        question text is itself capped at 2000 characters.
        """
        if pages is None:
            pages = self.extract_pages()
        full_text = "\n".join(p["text"] for p in pages)

        questions = []
        for m in self._QUESTION.finditer(full_text):
            q_text = m.group(2).strip()
            if len(q_text) > 20:  # filter noise
                questions.append({"number": int(m.group(1)), "text": q_text[:2000]})

        return questions

    def parse(self):
        """Return the paper's metadata, per-page text and extracted questions."""
        pages = self.extract_pages()
        # Reuse the pages just extracted instead of reading the PDF twice.
        questions = self.extract_questions(pages)

        return {
            "meta": {
                "paperId": self.paper_id,
                "subjectId": self.unique_id,
                "subjectCode": self.subject_code,
                "level": self.level,
                "year": self.year,
                "session": self.session,
                "paperNumber": self.paper_num,
                "filename": self.filename,
                "totalPages": len(self.doc),
                "indexed_at": int(time.time())
            },
            "pages": pages,
            "questions": questions
        }
405
+
406
+
407
+ # -----------------------------------------------------------------------------
408
+ # 4. THE VECTOR ENGINE (Embeddings & Search)
409
  # -----------------------------------------------------------------------------
410
 
411
def generate_embeddings(texts):
    """Generates embeddings using Gemini API.

    Returns one vector (a plain list of floats) per input text, in input
    order. When no API key is configured, or when a batch fails, zero vectors
    are returned for the affected texts so the output always lines up with
    ``texts``.
    """
    # Size of the zero-vector placeholders used when no real embedding exists.
    placeholder_dim = 768

    def _placeholders(count):
        return [np.zeros(placeholder_dim).tolist() for _ in range(count)]

    if not GEMINI_API_KEY:
        logger.warning("No Gemini API Key. Using dummy vectors.")
        return _placeholders(len(texts))

    client_g = genai.Client(api_key=GEMINI_API_KEY)
    results = []
    batch_size = 10

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        try:
            resp = client_g.models.embed_content(
                model=EMBEDDING_MODEL,
                contents=batch,
            )
            # Build the whole batch before touching `results`, so a failure
            # part-way through cannot leave partial vectors behind.
            batch_vectors = [embedding.values for embedding in resp.embeddings]
            if len(batch_vectors) != len(batch):
                raise ValueError(
                    f"expected {len(batch)} embeddings, got {len(batch_vectors)}"
                )
        except Exception as e:
            logger.error(f"Embedding batch {i} failed: {e}")
            batch_vectors = _placeholders(len(batch))
        results.extend(batch_vectors)

    return results
436
 
437
+
438
+ # -----------------------------------------------------------------------------
439
+ # 5. FIREBASE-BACKED INDEX BUILDER
440
+ # -----------------------------------------------------------------------------
441
+
442
def load_index_from_firebase():
    """
    Tries to load the full index from Firebase.
    Returns True if successfully loaded.

    Syllabi and vectors are both required; exam data is optional. The module
    globals are only updated once everything has been read and converted, so
    a False return leaves the in-memory state untouched.
    """
    global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP

    if not FIREBASE_AVAILABLE:
        return False

    logger.info("Attempting to load index from Firebase...")

    try:
        # Load syllabus map
        fb_syllabi = fb_get("data_api/syllabi")
        if not fb_syllabi:
            logger.info("No syllabus data in Firebase yet.")
            return False

        # Load vector DB
        fb_vectors = fb_get("data_api/vectors")
        if not fb_vectors:
            logger.info("No vector data in Firebase yet.")
            return False

        # The stored collection may come back as a dict or as a list;
        # empty/None slots are skipped.
        raw_entries = fb_vectors.values() if isinstance(fb_vectors, dict) else fb_vectors
        loaded_db = [
            {"vector": np.array(entry["vector"]), "meta": entry["meta"]}
            for entry in raw_entries
            if entry
        ]
        loaded_matrix = (
            np.vstack([item["vector"] for item in loaded_db]) if loaded_db else None
        )

        # Load exam map
        fb_exams = fb_get("data_api/exams")

    except Exception as e:
        logger.error(f"Failed to load from Firebase: {e}")
        return False

    # Commit everything at once now that the load has succeeded.
    SYLLABUS_MAP = fb_syllabi
    VECTOR_DB = loaded_db
    if loaded_matrix is not None:
        VECTOR_MATRIX = loaded_matrix
    if fb_exams:
        EXAM_MAP = fb_exams

    logger.info(
        f"Loaded from Firebase: {len(SYLLABUS_MAP)} syllabi, "
        f"{len(VECTOR_DB)} vectors, {len(EXAM_MAP)} exam subjects."
    )
    return True
499
+
500
+
501
def save_syllabus_to_firebase(subject_id: str, data: dict):
    """Persist one parsed syllabus under its subject id in Firebase."""
    # The payload is the plain-dict syllabus structure (no numpy arrays).
    target_path = f"data_api/syllabi/{subject_id}"
    fb_set(target_path, data)
505
+
506
+
507
def save_vectors_to_firebase(vector_entries: list):
    """Write all vector entries to Firebase in one call.

    Entries are keyed "v_000000", "v_000001", ... in list order, and numpy
    vectors are converted to plain lists before being stored.
    """
    def _as_plain(vec):
        if isinstance(vec, np.ndarray):
            return vec.tolist()
        return vec

    payload = {
        f"v_{position:06d}": {
            "vector": _as_plain(item["vector"]),
            "meta": item["meta"],
        }
        for position, item in enumerate(vector_entries)
    }
    fb_set("data_api/vectors", payload)
517
+
518
+
519
def save_exam_to_firebase(subject_id: str, paper_data: dict):
    """Store a parsed exam paper beneath its subject in Firebase."""
    # Replace '.', '[', ']', '#', '$' and '/' in the paper id with '_' to form the key.
    safe_key = re.sub(r'[.\[\]#$/]', '_', paper_data["meta"]["paperId"])
    destination = f"data_api/exams/{subject_id}/{safe_key}"
    fb_set(destination, paper_data)
525
+
526
+
527
  def build_index():
528
+ """
529
+ Walks directories, parses PDFs, builds JSON tree and Vector Index,
530
+ then persists everything to Firebase.
531
+ """
532
+ global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP
533
+
534
  logger.info("🚀 Starting Build Process...")
 
 
 
 
 
535
 
536
+ # ---- SYLLABI ----
537
  parsed_data = []
538
+
539
+ if os.path.exists(SYLLABI_DIR):
540
+ for root, dirs, files in os.walk(SYLLABI_DIR):
541
+ for file in sorted(files):
542
+ if file.endswith(".pdf"):
543
+ path = os.path.join(root, file)
544
+ logger.info(f"Parsing syllabus: {path}")
545
+ try:
546
+ parser = PDFParser(path)
547
+ data = parser.parse()
548
+ parsed_data.append(data)
549
+ SYLLABUS_MAP[data["meta"]["id"]] = data
550
+ save_syllabus_to_firebase(data["meta"]["id"], data)
551
+ except Exception as e:
552
+ logger.error(f"Failed to parse {path}: {e}")
553
+ else:
554
+ logger.warning(f"Directory {SYLLABI_DIR} not found.")
555
+
556
+ # ---- PAST EXAMS ----
557
+ if os.path.exists(PAST_EXAMS_DIR):
558
+ for root, dirs, files in os.walk(PAST_EXAMS_DIR):
559
+ for file in sorted(files):
560
+ if file.endswith(".pdf"):
561
+ path = os.path.join(root, file)
562
+ logger.info(f"Parsing exam paper: {path}")
563
+ try:
564
+ parser = ExamPaperParser(path)
565
+ exam_data = parser.parse()
566
+ subject_id = exam_data["meta"]["subjectId"]
567
+
568
+ if subject_id not in EXAM_MAP:
569
+ EXAM_MAP[subject_id] = {}
570
+
571
+ paper_id = exam_data["meta"]["paperId"]
572
+ safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
573
+ EXAM_MAP[subject_id][safe_key] = exam_data
574
+ save_exam_to_firebase(subject_id, exam_data)
575
+ except Exception as e:
576
+ logger.error(f"Failed to parse exam {path}: {e}")
577
+ else:
578
+ logger.info(f"No past_exams directory found at {PAST_EXAMS_DIR}. Skipping.")
579
+
580
+ # ---- VECTORIZATION (syllabi only) ----
581
+ if not parsed_data:
582
+ logger.info("No new syllabus data to vectorize.")
583
+ return
584
+
585
  chunks_to_embed = []
586
  chunk_metadata = []
587
 
588
  for item in parsed_data:
589
  meta_base = item["meta"]
590
  for topic in item["tree"]:
591
+ for sub in topic.get("children", []):
592
+ text_blob = "\n".join(sub.get("content", []))
593
+ if len(text_blob) < 10:
594
+ continue
595
+
596
+ rich_text = (
597
+ f"{meta_base['subject']} {meta_base['level']} "
598
+ f"- {topic['title']} - {sub['title']}:\n{text_blob}"
599
+ )
600
  chunks_to_embed.append(rich_text)
601
  chunk_metadata.append({
602
  "subject_id": meta_base["id"],
 
606
  "content": text_blob
607
  })
608
 
 
609
  logger.info(f"🧮 Generating embeddings for {len(chunks_to_embed)} chunks...")
610
  vectors = generate_embeddings(chunks_to_embed)
611
 
 
612
  VECTOR_DB = []
613
  valid_vectors = []
614
+
615
  for i, vec in enumerate(vectors):
616
+ np_vec = np.array(vec)
617
  VECTOR_DB.append({
618
+ "vector": np_vec,
619
  "meta": chunk_metadata[i]
620
  })
621
+ valid_vectors.append(np_vec)
622
 
623
  if valid_vectors:
624
  VECTOR_MATRIX = np.vstack(valid_vectors)
625
+
626
+ # Persist to Firebase
627
+ save_vectors_to_firebase(VECTOR_DB)
628
+
629
+ logger.info(
630
+ f"✅ Indexing Complete. "
631
+ f"{len(SYLLABUS_MAP)} syllabi, {len(VECTOR_DB)} vectors, "
632
+ f"{sum(len(v) for v in EXAM_MAP.values())} exam papers."
633
+ )
634
+
635
+
636
+ # -----------------------------------------------------------------------------
637
+ # 6. DIRECTORY WATCHER — Auto-index new PDFs
638
+ # -----------------------------------------------------------------------------
639
+
640
# Paths of PDFs that have already been picked up; the directory watcher
# skips any path found in this set.
_indexed_files = set()
641
+
642
def _collect_existing_files():
    """Mark every PDF already on disk as indexed.

    Walks the syllabus and past-exam directories (when they exist) and adds
    the full path of each ``.pdf`` file to ``_indexed_files`` so the watcher
    does not treat it as a new upload.
    """
    for base_dir in (SYLLABI_DIR, PAST_EXAMS_DIR):
        if os.path.exists(base_dir):
            for folder, _, names in os.walk(base_dir):
                _indexed_files.update(
                    os.path.join(folder, name)
                    for name in names
                    if name.endswith(".pdf")
                )
651
+
652
+
653
def _index_new_pdf(path, is_exam):
    """Parse one newly detected PDF and publish it to memory and Firebase."""
    if not is_exam:
        syllabus = PDFParser(path).parse()
        SYLLABUS_MAP[syllabus["meta"]["id"]] = syllabus
        save_syllabus_to_firebase(syllabus["meta"]["id"], syllabus)
        # Only this document needs embedding; the rest of the index stays.
        _incremental_vectorize(syllabus)
        return

    exam = ExamPaperParser(path).parse()
    subject = exam["meta"]["subjectId"]
    if subject not in EXAM_MAP:
        EXAM_MAP[subject] = {}
    storage_key = re.sub(r'[.\[\]#$/]', '_', exam["meta"]["paperId"])
    EXAM_MAP[subject][storage_key] = exam
    save_exam_to_firebase(subject, exam)


def _watch_directories(interval=30):
    """Poll the syllabus and exam folders forever, indexing unseen PDFs.

    Intended to run in a background thread. Every ``interval`` seconds both
    directories are walked; each ``.pdf`` path not yet in ``_indexed_files``
    is recorded there and then parsed. A path is recorded before parsing, so
    a file that fails to index is logged once and not retried.
    """
    while True:
        time.sleep(interval)
        for directory, is_exam in [(SYLLABI_DIR, False), (PAST_EXAMS_DIR, True)]:
            if not os.path.exists(directory):
                continue
            for root, _, files in os.walk(directory):
                pdf_paths = [
                    os.path.join(root, name)
                    for name in files
                    if name.endswith(".pdf")
                ]
                for path in pdf_paths:
                    if path in _indexed_files:
                        continue

                    logger.info(f"🆕 New PDF detected: {path}")
                    _indexed_files.add(path)

                    try:
                        _index_new_pdf(path, is_exam)
                    except Exception as e:
                        logger.error(f"Error indexing new file {path}: {e}")
693
+
694
+
695
def _incremental_vectorize(syllabus_data: dict):
    """Embed a single newly-uploaded syllabus and append it to the index.

    One chunk is built per sub-topic (``tree[*].children[*]``) whose joined
    ``content`` is at least 10 characters long. The chunks are embedded with
    ``generate_embeddings``, appended to ``VECTOR_DB``, ``VECTOR_MATRIX`` is
    rebuilt from the full entry list, and the full vector set is persisted
    to Firebase.

    Nothing is modified or persisted when the syllabus yields no chunks or
    the embedding call returns no vectors.

    Args:
        syllabus_data: Parsed syllabus with ``meta`` (``id``, ``subject``,
            ``level``) and ``tree`` (topics with ``id``, ``title`` and
            optional ``children``).
    """
    global VECTOR_DB, VECTOR_MATRIX

    meta_base = syllabus_data["meta"]
    chunks = []
    metas = []

    for topic in syllabus_data["tree"]:
        for sub in topic.get("children", []):
            text_blob = "\n".join(sub.get("content", []))
            # Sub-topics with next to no text are not worth embedding.
            if len(text_blob) < 10:
                continue
            chunks.append(
                f"{meta_base['subject']} {meta_base['level']} "
                f"- {topic['title']} - {sub['title']}:\n{text_blob}"
            )
            metas.append({
                "subject_id": meta_base["id"],
                "topic_id": topic["id"],
                "subtopic_id": sub["id"],
                "title": sub["title"],
                "content": text_blob
            })

    if not chunks:
        return

    vectors = generate_embeddings(chunks)

    # Without new vectors there is nothing to add; avoid re-uploading the
    # unchanged vector set and reporting a success that did not happen.
    if vectors is None or len(vectors) == 0:
        logger.warning(
            f"No embeddings returned for {meta_base['id']}; index unchanged."
        )
        return

    if len(vectors) != len(metas):
        # zip() below pairs only the common prefix instead of failing with
        # an IndexError halfway through the append.
        logger.warning(
            f"Embedding count mismatch for {meta_base['id']}: "
            f"{len(vectors)} vectors for {len(metas)} chunks."
        )

    # In-place extend keeps existing positions stable for readers that
    # already hold a reference to the list.
    VECTOR_DB.extend(
        {"vector": np.array(vec), "meta": meta}
        for vec, meta in zip(vectors, metas)
    )
    VECTOR_MATRIX = np.vstack([e["vector"] for e in VECTOR_DB])

    # Persist full updated vector set
    save_vectors_to_firebase(VECTOR_DB)
    logger.info(f"Incremental vectorize complete for {meta_base['id']}.")
736
+
737
 
738
  # -----------------------------------------------------------------------------
739
+ # 7. API ENDPOINTS
740
  # -----------------------------------------------------------------------------
741
 
742
@app.route('/health', methods=['GET'])
def health():
    """Report service status plus a summary of what is loaded in memory."""
    snapshot = {
        "status": "online",
        "subjects_loaded": list(SYLLABUS_MAP),
        "vector_chunks": len(VECTOR_DB),
        "exam_subjects": list(EXAM_MAP),
        "firebase": FIREBASE_AVAILABLE,
    }
    return jsonify(snapshot)
751
+
752
 
753
  @app.route('/v1/structure/<subject_id>', methods=['GET'])
754
  def get_structure(subject_id):
 
758
  return jsonify({"error": "Subject not found"}), 404
759
  return jsonify(data)
760
 
761
+
762
@app.route('/v1/subjects', methods=['GET'])
def list_subjects():
    """Return the ``meta`` record of every indexed syllabus.

    An entry without a ``meta`` key is represented as ``{"id": <subject id>}``.
    """
    return jsonify([
        entry.get("meta", {"id": subject_id})
        for subject_id, entry in SYLLABUS_MAP.items()
    ])
769
+
770
+
771
@app.route('/v1/search', methods=['POST'])
def search():
    """
    Semantic Retrieval.
    Input: { "query": "...", "filter_subject_id": "..." (optional) }

    Embeds the query, ranks all indexed chunks by cosine similarity and
    returns at most 5 hits scoring 0.3 or higher, optionally restricted to
    one subject.

    Responses:
        200 {"results": [{score, subject_id, title, content, node_id}, ...]}
        400 when no query is supplied (including a missing or non-object body)
        500 when the embedding request fails
        503 when the index is empty or no embedding API key is configured
    """
    # Take one snapshot of the shared index: background threads (rebuild,
    # directory watcher) rebind these globals, and re-reading them later
    # could yield None or a matrix that no longer matches the entry list.
    matrix = VECTOR_MATRIX
    entries = VECTOR_DB
    if matrix is None or len(entries) == 0:
        return jsonify({"error": "Index not ready"}), 503

    # A body that is absent, malformed, or valid JSON but not an object is
    # treated as an empty request instead of failing on `.get`.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    query = data.get("query")
    subject_filter = data.get("filter_subject_id")

    if not query:
        return jsonify({"error": "Query required"}), 400

    if not GEMINI_API_KEY:
        return jsonify({"error": "Embedding API not configured"}), 503

    client_g = genai.Client(api_key=GEMINI_API_KEY)
    try:
        resp = client_g.models.embed_content(model=EMBEDDING_MODEL, contents=query)
        query_vec = np.array(resp.embeddings[0].values).reshape(1, -1)
    except Exception as e:
        return jsonify({"error": str(e)}), 500

    scores = cosine_similarity(query_vec, matrix)[0]

    results = []
    # Best score first; everything after the first sub-threshold score is
    # lower still, so the loop can stop there.
    for idx in np.argsort(scores)[::-1]:
        if scores[idx] < 0.3:
            break
        meta = entries[idx]["meta"]

        if subject_filter and meta["subject_id"] != subject_filter:
            continue

        results.append({
            "score": float(scores[idx]),
            "subject_id": meta["subject_id"],
            "title": meta["title"],
            "content": meta["content"],
            "node_id": meta["subtopic_id"]
        })

        if len(results) >= 5:
            break

    return jsonify({"results": results})
824
 
825
+
826
@app.route('/v1/exams', methods=['GET'])
def list_exams():
    """
    Return the ``meta`` record of stored past exam papers.

    Query param ``subject_id`` (optional) limits the listing to one subject;
    without it, papers of every subject are returned. Entries that are not
    dicts or lack a ``meta`` key are left out.
    """
    wanted_subject = request.args.get("subject_id")

    if wanted_subject:
        paper_groups = [EXAM_MAP.get(wanted_subject, {})]
    else:
        paper_groups = list(EXAM_MAP.values())

    metas = [
        paper["meta"]
        for group in paper_groups
        for paper in group.values()
        if isinstance(paper, dict) and "meta" in paper
    ]
    return jsonify(metas)
845
+
846
+
847
@app.route('/v1/exams/<paper_id>', methods=['GET'])
def get_exam(paper_id):
    """
    Return one complete stored exam paper.

    A paper matches when its storage key equals the sanitized ``paper_id``
    or its ``meta.paperId`` equals ``paper_id`` as given
    (example id: A_9702_2023_MAY_P1). Responds 404 when nothing matches.
    """
    wanted_key = re.sub(r'[.\[\]#$/]', '_', paper_id)

    for subject_papers in EXAM_MAP.values():
        for stored_key, stored in subject_papers.items():
            if stored_key == wanted_key:
                return jsonify(stored)
            if isinstance(stored, dict) and stored.get("meta", {}).get("paperId") == paper_id:
                return jsonify(stored)

    return jsonify({"error": "Exam paper not found"}), 404
862
+
863
+
864
@app.route('/v1/exams/<paper_id>/questions', methods=['GET'])
def get_exam_questions(paper_id):
    """Get just the extracted questions from a past paper.

    A paper matches when its storage key equals the sanitized ``paper_id``
    or its ``meta.paperId`` equals ``paper_id``. Returns ``paperId`` (as
    requested), the paper's ``meta`` and its ``questions`` list (empty when
    absent), or 404 when no paper matches.
    """
    safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)

    for papers in EXAM_MAP.values():
        for key, paper in papers.items():
            # Entries that are not dicts cannot supply meta/questions; a key
            # match alone used to reach `paper.get` and raise AttributeError.
            if not isinstance(paper, dict):
                continue
            stored_id = (paper.get("meta") or {}).get("paperId")
            if key == safe_key or stored_id == paper_id:
                return jsonify({
                    "paperId": paper_id,
                    "meta": paper.get("meta"),
                    "questions": paper.get("questions", [])
                })

    return jsonify({"error": "Exam paper not found"}), 404
880
+
881
+
882
@app.route('/v1/rebuild', methods=['POST'])
def trigger_rebuild():
    """
    Trigger a full index rebuild (admin use).

    When the REBUILD_SECRET environment variable is set, the request must
    carry ``Authorization: Bearer <secret>``; otherwise 401 is returned.
    The request body is not read: every rebuild clears the in-memory state
    and runs ``build_index()`` in a background thread.

    Returns 202 as soon as the background thread has been started.
    """
    import hmac

    auth_header = request.headers.get("Authorization", "")
    rebuild_key = os.environ.get("REBUILD_SECRET", "")
    # NOTE(review): with REBUILD_SECRET unset this endpoint accepts any
    # caller. Confirm that is acceptable for the deployment.
    if rebuild_key:
        expected = f"Bearer {rebuild_key}"
        # Constant-time comparison; bytes are used because compare_digest
        # rejects non-ASCII str arguments.
        if not hmac.compare_digest(auth_header.encode("utf-8"),
                                   expected.encode("utf-8")):
            return jsonify({"error": "Unauthorized"}), 401

    def _rebuild_bg():
        global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP
        SYLLABUS_MAP = {}
        VECTOR_DB = []
        VECTOR_MATRIX = None
        EXAM_MAP = {}
        # Record the PDFs the rebuild is about to index so the directory
        # watcher does not index them a second time and append duplicate
        # vectors.
        _collect_existing_files()
        build_index()

    t = threading.Thread(target=_rebuild_bg, daemon=True)
    t.start()
    return jsonify({"status": "rebuild started"}), 202
904
+
905
+
906
  # -----------------------------------------------------------------------------
907
+ # 8. STARTUP BOOTSTRAP
908
  # -----------------------------------------------------------------------------
909
 
910
def start_app():
    """Prepare data directories, load or build the index, start the watcher.

    Steps, in order:
      1. For each data directory that does not exist, create it with ``A``
         and ``O`` sub-folders.
      2. Load the index from Firebase; if that does not succeed, build it
         from the PDFs on disk.
      3. Mark the PDFs already on disk as indexed.
      4. Start the directory watcher in a daemon thread.
    """
    for base in (SYLLABI_DIR, PAST_EXAMS_DIR):
        if os.path.exists(base):
            continue
        for level in ("A", "O"):
            os.makedirs(os.path.join(base, level), exist_ok=True)
        logger.info(f"Created empty directory: {base}")

    if load_index_from_firebase():
        logger.info("Served from Firebase cache. Skipping full rebuild.")
    else:
        # Nothing usable came back from Firebase: parse everything locally.
        build_index()

    # Existing files must be recorded before the watcher's first pass.
    _collect_existing_files()

    threading.Thread(target=_watch_directories, daemon=True).start()
    logger.info("Directory watcher started.")
934
+
935
+
936
# Runs at import time (module level), so the index is loaded and the watcher
# started whether this module is executed directly or imported by a server.
with app.app_context():
    start_app()

# Direct execution: serve with Flask's built-in server on all interfaces,
# port 7860.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)