# TRIAL / app.py
# (Hugging Face Space page header retained as comments: author atz21,
#  commit "Update app.py", 6295c4b verified)
import os
import re
import json
import subprocess
import time
import shutil
import img2pdf
import gradio as gr
from google import genai # NEW SDK
from pdf2image import convert_from_path
from PIL import Image, ImageDraw, ImageFont
import cv2
import numpy as np
from PyPDF2 import PdfReader, PdfWriter
from prompts import QP_MS_TRANSCRIPTION_PROMPT, get_grading_prompt
from supabase import create_client, Client
# ---------------- CONFIG ----------------
# Multi-API Key Configuration for handling RESOURCE_EXHAUSTED errors
class GeminiClientManager:
    """Pool of Gemini API clients with automatic rotation on quota exhaustion.

    Reads up to three keys (GEMINI_API_KEY_1..3) from the environment and
    keeps one `genai.Client` per key. Index 0 is always the primary key.
    """

    def __init__(self):
        # Gather the configured key slots; unset slots are dropped.
        candidates = [
            os.getenv("GEMINI_API_KEY_1"),
            os.getenv("GEMINI_API_KEY_2"),
            os.getenv("GEMINI_API_KEY_3"),
        ]
        self.api_keys = [key for key in candidates if key]
        if not self.api_keys:
            raise ValueError("❌ No API keys found! Please set at least GEMINI_API_KEY_1")
        print(f"βœ… Loaded {len(self.api_keys)} Gemini API key(s)")
        # Index of the key currently in use (0 == primary).
        self.current_key_index = 0
        # One pre-built client per key, in the same order as api_keys.
        self.clients = [genai.Client(api_key=key) for key in self.api_keys]

    def get_current_client(self):
        """Return the client bound to the currently active API key."""
        return self.clients[self.current_key_index]

    def rotate_to_next_key(self):
        """Advance to the next key (wrapping around); False if only one key exists."""
        if len(self.api_keys) == 1:
            print("⚠️ Only one API key available, cannot rotate")
            return False
        old_index = self.current_key_index
        self.current_key_index = (old_index + 1) % len(self.api_keys)
        print(f"πŸ”„ Rotating from API key #{old_index + 1} to API key #{self.current_key_index + 1}")
        return True

    def reset_to_primary(self):
        """Switch back to the first key (called after a successful request)."""
        if self.current_key_index != 0:
            print(f"πŸ”™ Resetting to primary API key #1")
            self.current_key_index = 0
# Initialize the client manager at import time.
# NOTE(review): GeminiClientManager() raises ValueError here if no
# GEMINI_API_KEY_* environment variable is set, so the whole app fails fast.
client_manager = GeminiClientManager()
client = client_manager.get_current_client()  # For backward compatibility
# Grid geometry constant — presumably used by page-overlay/imprinting code
# elsewhere in this file; confirm against the consumers (not visible here).
GRID_ROWS, GRID_COLS = 20, 14
# Supabase configuration (service key grants full storage access).
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY")
SUPABASE_BUCKET = "examfiles"
# Initialize Supabase client (only if credentials are available).
# Uploads degrade gracefully: helpers below check `supabase_client` for None.
supabase_client = None
if SUPABASE_URL and SUPABASE_SERVICE_KEY:
    try:
        supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
        print("βœ… Supabase client initialized successfully")
    except Exception as e:
        print(f"⚠️ Supabase initialization failed: {e}")
else:
    print("⚠️ Supabase credentials not found - file upload to storage disabled")
# ---------------- PROMPTS ----------------
# Prompts are now imported from prompts.py
# ---------------- SUPABASE HELPERS ----------------
def upload_file_to_supabase(local_path, file_type="unknown", timestamp=None):
    """
    Push a local file into the Supabase Storage bucket.

    Args:
        local_path (str): Path of the file on disk.
        file_type (str): Label used only for logging (qp, ms, ans, graded, imprinted).
        timestamp (str): Folder name in the bucket; defaults to the current unix time.

    Returns:
        str | None: Public URL of the stored object, or None when Supabase is
        unconfigured or the upload raised.
    """
    if not supabase_client:
        print("⚠️ Supabase not configured - skipping upload")
        return None
    try:
        folder = str(int(time.time())) if timestamp is None else timestamp
        # Keep the original basename so stored files stay recognizable.
        remote_path = f"{folder}/{os.path.basename(local_path)}"
        print(f"πŸ“€ Uploading {file_type} to Supabase: {remote_path}")
        with open(local_path, "rb") as handle:
            supabase_client.storage.from_(SUPABASE_BUCKET).upload(
                remote_path,
                handle,
                file_options={"upsert": "true"},
            )
        public_url = f"{SUPABASE_URL}/storage/v1/object/public/{SUPABASE_BUCKET}/{remote_path}"
        print(f"βœ… Uploaded successfully: {public_url}")
        return public_url
    except Exception as e:
        print(f"❌ Supabase upload failed for {file_type}: {e}")
        return None
def process_and_upload_input_files(qp_file_obj, ms_file_obj, ans_file_obj):
    """
    Resolve local paths for the three Gradio uploads and mirror them to Supabase.

    All three files share one run timestamp so they land in the same folder.

    Args:
        qp_file_obj: Gradio file object for the Question Paper.
        ms_file_obj: Gradio file object for the Markscheme.
        ans_file_obj: Gradio file object for the Answer Sheet.

    Returns:
        tuple: (qp_path, ms_path, ans_path, upload_urls_dict, timestamp)
    """
    print("\n" + "="*60)
    print("πŸ“ PROCESSING INPUT FILES")
    print("="*60)
    # One timestamp for the whole run keeps all artifacts grouped together.
    run_timestamp = str(int(time.time()))
    print(f"πŸ• Run timestamp: {run_timestamp}")
    upload_urls = {"qp_url": None, "ms_url": None, "ans_url": None}
    qp_path = qp_file_obj.name if qp_file_obj else None
    ms_path = ms_file_obj.name if ms_file_obj else None
    ans_path = ans_file_obj.name if ans_file_obj else None
    if supabase_client:
        # Upload in a fixed order: qp, ms, ans.
        for label, path in (("qp", qp_path), ("ms", ms_path), ("ans", ans_path)):
            if path:
                upload_urls[f"{label}_url"] = upload_file_to_supabase(path, label, run_timestamp)
    print("="*60 + "\n")
    return qp_path, ms_path, ans_path, upload_urls, run_timestamp
# ---------------- HELPERS ----------------
def parse_md_table(md):
    """Parse the body of a Markdown table into a list of stripped cell rows.

    The first two non-blank lines (header + separator) are discarded; empty
    cells are dropped from each row.
    """
    nonblank = [line for line in md.split("\n") if line.strip()]
    if len(nonblank) < 3:
        return []
    rows = []
    for line in nonblank[2:]:  # skip header + separator
        cells = [cell.strip() for cell in line.strip("|").split("|")]
        cells = [cell for cell in cells if cell]
        if cells:
            rows.append(cells)
    return rows
def convert_html_color_spans(md_text):
    """Rewrite inline HTML color spans as LaTeX \\textcolor commands."""
    span_pattern = r'<span\s+style="color:\s*([^"]+)">\s*(.*?)\s*</span>'

    def to_textcolor(match):
        color_name = match.group(1).strip()
        inner_text = match.group(2)
        return fr'\textcolor{{{color_name}}}{{{inner_text}}}'

    return re.sub(span_pattern, to_textcolor, md_text, flags=re.IGNORECASE)
def cleanup_markdown_for_latex(md_text):
    """Normalize markdown before Pandoc: fix table spacing and wrap common
    unicode math symbols in inline-math LaTeX equivalents (safety net)."""
    # Guarantee a blank line between the bold table header and the table itself.
    md_text = re.sub(r'(\*\*Markscheme vs Student Answer\*\*)\s*(\|)', r'\1\n\n\2', md_text)
    # Unicode symbol -> LaTeX command table (each result is wrapped in $...$).
    replacements = {
        '∫': r'\int ',
        'Β²': '^2',
        'Β³': '^3',
        'Β½': r'\frac{1}{2}',
        'ΒΌ': r'\frac{1}{4}',
        '∞': r'\infty',
        '≀': r'\leq',
        'β‰₯': r'\geq',
        'β‰ ': r'\neq',
        'Β±': r'\pm',
        'Γ—': r'\times',
        'Γ·': r'\div',
        '√': r'\sqrt',
        'βˆ‘': r'\sum',
        '∏': r'\prod',
        'βˆ‚': r'\partial',
        'Ο€': r'\pi',
        'ΞΈ': r'\theta',
        'Ξ±': r'\alpha',
        'Ξ²': r'\beta',
        'Ξ³': r'\gamma',
        'Ξ΄': r'\delta',
        'Ξ΅': r'\epsilon',
        'Ξ»': r'\lambda',
        'ΞΌ': r'\mu',
        'Οƒ': r'\sigma',
        'Ξ”': r'\Delta',
        'Ξ£': r'\Sigma',
        'Ξ©': r'\Omega'
    }
    for symbol, latex in replacements.items():
        md_text = md_text.replace(symbol, '$' + latex + '$')
    return md_text
def escape_latex_special_chars(text):
    """Escape LaTeX-special characters in plain text.

    Text that already contains '$' or a backslash is assumed to be math mode
    or an existing LaTeX command and is returned untouched.
    """
    if '$' in text or '\\' in text:
        return text
    # Ordered pairs: braces are escaped before '~'/'^' so the braces inserted
    # by \textasciitilde{} / \textasciicircum{} are left alone.
    escapes = (
        ('%', r'\%'),
        ('&', r'\&'),
        ('#', r'\#'),
        ('_', r'\_'),
        ('{', r'\{'),
        ('}', r'\}'),
        ('~', r'\textasciitilde{}'),
        ('^', r'\textasciicircum{}'),
    )
    for raw, escaped in escapes:
        text = text.replace(raw, escaped)
    return text
def save_as_pdf(text, filename="output.pdf"):
    """
    Convert Markdown text to PDF using Pandoc with pdflatex.

    Extracts the Examiner's Summary Report and re-injects it at the top of the
    document with enhanced formatting (zebra-striped table), and converts HTML
    color spans to LaTeX textcolor commands before compiling.

    Args:
        text (str): Markdown content to convert.
        filename (str): Output PDF filename (sanitized before use).

    Returns:
        str: Path to the generated PDF file.

    Raises:
        Exception: If Pandoc or pdflatex is not available, or conversion fails.
    """
    # Sanitize filename - replace spaces and special characters with underscores.
    # This prevents issues with pdflatex and file operations.
    import string
    valid_chars = f"-_.() {string.ascii_letters}{string.digits}"
    sanitized_filename = ''.join(c if c in valid_chars else '_' for c in filename)
    # Collapse whitespace runs, then underscore runs, to single underscores.
    sanitized_filename = re.sub(r'\s+', '_', sanitized_filename)
    sanitized_filename = re.sub(r'_+', '_', sanitized_filename)
    if sanitized_filename != filename:
        # BUGFIX: this log line previously printed a literal placeholder
        # instead of the original filename.
        print(f"ℹ️ Sanitized filename: '{filename}' β†’ '{sanitized_filename}'")
        filename = sanitized_filename
    base_name = os.path.splitext(filename)[0]
    temp_md_file = f"{base_name}_input.md"
    temp_tex_file = f"{base_name}_temp.tex"
    print("\n" + "="*60)
    print("πŸ“„ MARKDOWN TO PDF CONVERSION PROCESS")
    print("="*60)
    try:
        # Step 1: Extract Summary Report Table so it can be moved to the front.
        print("\n[STEP 1/6] Extracting Examiner's Summary Report...")
        summary_pattern = re.compile(
            r"### Examiner's Summary Report\s*\n\n(\|.*?\|)\s*\n\n\*\*Total:\s*(.*?)\*\*",
            re.DOTALL
        )
        summary_match = summary_pattern.search(text)
        if summary_match:
            summary_table_md = summary_match.group(1)
            summary_total = summary_match.group(2)
            # Remove the summary from the body; it is re-injected in Step 5.
            text = summary_pattern.sub("", text)
            print(f" βœ… SUCCESS: Extracted summary report with total: {summary_total}")
        else:
            summary_table_md = ""
            summary_total = ""
            print(" ⚠️ WARNING: No Examiner's Summary Report found in markdown")
        # Step 2: Clean up markdown (unicode math + color spans -> LaTeX).
        print("\n[STEP 2/6] Cleaning markdown and converting HTML to LaTeX...")
        text = cleanup_markdown_for_latex(text)
        text = convert_html_color_spans(text)
        print(" βœ… SUCCESS: Markdown cleaned and HTML color spans converted")
        # Save cleaned markdown for Pandoc.
        with open(temp_md_file, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f" πŸ“ Saved cleaned markdown to: {temp_md_file}")
        # Step 3: Convert MD to LaTeX via Pandoc.
        print("\n[STEP 3/6] Converting markdown to LaTeX using Pandoc...")
        pandoc_cmd = [
            "pandoc",
            "--from=markdown",
            "--to=latex",
            "--standalone",
            temp_md_file,
            "-o", temp_tex_file
        ]
        print(f" πŸ”§ Running: {' '.join(pandoc_cmd)}")
        result = subprocess.run(pandoc_cmd, capture_output=True, check=False)
        if result.returncode != 0:
            try:
                stderr = result.stderr.decode('utf-8', errors='replace')
            except Exception:  # BUGFIX: was a bare except
                stderr = str(result.stderr)
            print(f" ❌ FAILED: Pandoc returned error code {result.returncode}")
            print(f" Error details: {stderr[:500]}")
            raise Exception(f"Pandoc conversion failed: {stderr}")
        if not os.path.exists(temp_tex_file):
            print(f" ❌ FAILED: LaTeX file not created at {temp_tex_file}")
            raise Exception("Pandoc did not create the expected LaTeX file")
        print(f" βœ… SUCCESS: LaTeX file created at {temp_tex_file}")
        # Step 4: Modify the generated LaTeX (12pt class, color/table packages).
        print("\n[STEP 4/6] Enhancing LaTeX document...")
        with open(temp_tex_file, "r", encoding="utf-8") as f:
            tex = f.read()
        tex = tex.replace(
            r"\documentclass{article}",
            r"\documentclass[12pt]{extarticle}"
        )
        insert_packages = r"""\usepackage[a4paper, margin=1in]{geometry}
\usepackage{xcolor}
\usepackage{colortbl}
\usepackage{booktabs}
\usepackage{array}
\usepackage{longtable}
\renewcommand{\arraystretch}{1.4}
\newcolumntype{L}[1]{>{\raggedright\arraybackslash}p{#1}}"""
        tex = tex.replace(r"\begin{document}", insert_packages + "\n\\begin{document}")
        print(" βœ… SUCCESS: Enhanced document class and added packages")
        # Step 5: Build enhanced LaTeX table for the summary and inject it
        # right after \begin{document}.
        if summary_table_md:
            print("\n[STEP 5/6] Building enhanced summary table...")
            summary_rows = parse_md_table(summary_table_md)
            print(f" πŸ“Š Parsed {len(summary_rows)} rows from summary table")
            summary_latex = r"""\section*{Examiner's Summary Report}
\begin{center}
\rowcolors{2}{gray!10}{white}
\begin{tabular}{|c|c|c|L{8cm}|}
\hline
\rowcolor{gray!30}
\textbf{Question} & \textbf{Marks} & \textbf{Remark} & \textbf{Feedback} \\ \hline
"""
            for row in summary_rows:
                if len(row) >= 4:
                    feedback = row[3]
                    # Escape only if the cell isn't already LaTeX/math.
                    if not ('$' in feedback or '\\textcolor' in feedback):
                        feedback = feedback.replace('%', r'\%').replace('&', r'\&').replace('#', r'\#')
                    summary_latex += f"{row[0]} & {row[1]} & {row[2]} & {feedback} \\\\ \\hline\n"
            summary_latex += r"\end{tabular}"
            summary_latex += "\n\\end{center}\n\n"
            summary_latex += f"\\vspace{{0.5cm}}\\noindent\\textbf{{\\Large Overall Score: {summary_total}}}\n\n"
            summary_latex += "\\hrulefill\n\\vspace{1cm}\n\n"
            summary_latex += "\\newpage\n\n"
            tex = tex.replace(
                r"\begin{document}",
                r"\begin{document}" + "\n\n" + summary_latex
            )
            print(" βœ… SUCCESS: Summary table with zebra striping injected at document top")
        else:
            print("\n[STEP 5/6] Skipping summary table (not found)")
        with open(temp_tex_file, "w", encoding="utf-8") as f:
            f.write(tex)
        # Step 6: Compile PDF with pdflatex (two passes so tables/refs settle).
        print("\n[STEP 6/6] Compiling PDF with pdflatex...")
        pdflatex_cmd = [
            "pdflatex",
            "-interaction=nonstopmode",
            f"-output-directory={os.path.dirname(os.path.abspath(temp_tex_file)) or '.'}",
            temp_tex_file
        ]
        print(" πŸ”§ Running pdflatex (pass 1/2)...")
        result1 = subprocess.run(pdflatex_cmd, capture_output=True, check=False)
        print(" πŸ”§ Running pdflatex (pass 2/2)...")
        result2 = subprocess.run(pdflatex_cmd, capture_output=True, check=False)
        temp_pdf = temp_tex_file.replace(".tex", ".pdf")
        if not os.path.exists(temp_pdf):
            print(f" ❌ FAILED: PDF not created at {temp_pdf}")
            try:
                stderr = result2.stderr.decode('utf-8', errors='replace')
            except Exception:  # BUGFIX: was a bare except
                stderr = str(result2.stderr)
            # Surface LaTeX errors from the .log file for easier debugging.
            log_file = temp_tex_file.replace(".tex", ".log")
            if os.path.exists(log_file):
                print(f" πŸ“‹ Checking LaTeX log file: {log_file}")
                try:
                    with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
                        log_content = f.read()
                    error_lines = [line for line in log_content.split('\n') if '!' in line]
                    if error_lines:
                        print(f" ❌ LaTeX Errors found ({len(error_lines)} lines):")
                        for err_line in error_lines[:10]:
                            print(f" {err_line}")
                        stderr += "\n\nLaTeX Errors:\n" + "\n".join(error_lines[:10])
                except Exception as log_err:
                    print(f" ⚠️ Could not read log file: {log_err}")
            raise Exception(f"pdflatex failed to create PDF. Error: {stderr[:1000]}")
        print(f" βœ… SUCCESS: PDF compiled at {temp_pdf}")
        # Move output PDF to final filename.
        if os.path.exists(filename):
            os.remove(filename)
        os.rename(temp_pdf, filename)
        # BUGFIX: this log line previously printed a literal placeholder.
        print(f" πŸ“¦ Moved to final location: {filename}")
        # Clean up temporary files (plain and _input/_temp variants).
        print("\n[CLEANUP] Removing temporary files...")
        cleaned_count = 0
        for ext in [".md", ".tex", ".aux", ".log", ".out"]:
            temp_file = base_name + ext
            if os.path.exists(temp_file):
                os.remove(temp_file)
                cleaned_count += 1
            for prefix in ["_input", "_temp"]:
                temp_file = base_name + prefix + ext
                if os.path.exists(temp_file):
                    os.remove(temp_file)
                    cleaned_count += 1
        print(f" 🧹 Cleaned up {cleaned_count} temporary files")
        print("\n" + "="*60)
        print("βœ… PDF CONVERSION COMPLETED SUCCESSFULLY")
        # BUGFIX: this log line previously printed a literal placeholder.
        print(f"πŸ“„ Output file: {filename}")
        print("="*60 + "\n")
        return filename
    except subprocess.CalledProcessError as e:
        print(f"\n❌ SUBPROCESS ERROR: {e}")
        print(f" STDOUT: {e.stdout}")
        print(f" STDERR: {e.stderr}")
        print("="*60 + "\n")
        raise Exception(f"PDF conversion failed: {e.stderr}")
    except FileNotFoundError as e:
        print(f"\n❌ FILE NOT FOUND ERROR: {e}")
        print("="*60)
        print("⚠️ REQUIRED TOOLS MISSING")
        print("Please install the following:")
        print(" β€’ pandoc")
        print(" β€’ texlive (or MiKTeX on Windows)")
        print(" β€’ texlive-latex-extra (for extarticle class)")
        print("="*60 + "\n")
        raise Exception(
            "Pandoc or pdflatex not found. Please install:\n"
            " - pandoc\n"
            " - texlive (or MiKTeX on Windows)\n"
            " - texlive-latex-extra (for extarticle class)"
        )
    except Exception as e:
        print(f"\n❌ UNEXPECTED ERROR: {e}")
        import traceback
        traceback.print_exc()
        print("="*60 + "\n")
        raise
def compress_pdf(input_path, output_path=None, max_size=20*1024*1024):
    """Compress a PDF with Ghostscript when it exceeds max_size bytes.

    Returns the path that should be used afterwards: the compressed file when
    compression succeeded and brought the size under the threshold, otherwise
    the original path (including when the input is missing/unreadable).
    """
    if output_path is None:
        stem, suffix = os.path.splitext(input_path)
        output_path = f"{stem}_compressed{suffix}"
    try:
        size = os.path.getsize(input_path)
    except Exception:
        # Unreadable/missing input: hand back the original path unchanged.
        return input_path
    if size <= max_size:
        print(f"ℹ️ Not compressing {input_path} ({size/1024/1024:.2f} MB <= {max_size/1024/1024} MB)")
        return input_path
    print(f"πŸ”Ž Compressing {input_path} ({size/1024/1024:.2f} MB) -> {output_path}")
    try:
        subprocess.run(
            [
                "gs", "-sDEVICE=pdfwrite",
                "-dCompatibilityLevel=1.4",
                "-dPDFSETTINGS=/ebook",
                "-dNOPAUSE", "-dQUIET", "-dBATCH",
                f"-sOutputFile={output_path}", input_path,
            ],
            check=True,
        )
        new_size = os.path.getsize(output_path)
        print(f"βœ… Compression done. New size: {new_size/1024/1024:.2f} MB")
        if new_size <= max_size:
            return output_path
        print("⚠️ Compressed file still larger than threshold; returning original")
        return input_path
    except Exception as e:
        print("❌ Compression error:", e)
        return input_path
def upload_to_gemini(path, display_name=None):
    """
    Upload a local file to Gemini via the new google-genai SDK and block until
    server-side processing finishes.

    Uses whichever API key is currently active in client_manager. Raises on
    upload failure or when processing ends in the FAILED state.
    """
    print(f"πŸ“€ Uploading {path} to Gemini...")
    try:
        active_client = client_manager.get_current_client()
        handle = active_client.files.upload(file=path)
        print(f"⏳ Waiting for file processing: {handle.name}")
        # Poll every 2s until the file leaves the PROCESSING state.
        while handle.state.name == "PROCESSING":
            time.sleep(2)
            handle = active_client.files.get(name=handle.name)
        if handle.state.name == "FAILED":
            raise Exception(f"File processing failed: {handle.name}")
        print(f"βœ… Uploaded and processed: {handle.name}")
        return handle
    except Exception as e:
        print(f"❌ Upload failed for {path}: {e}")
        raise
def merge_pdfs(paths, output_path):
    """Concatenate the pages of the given PDFs (in order) into output_path."""
    writer = PdfWriter()
    for source in paths:
        for page in PdfReader(source).pages:
            writer.add_page(page)
    with open(output_path, "wb") as out_file:
        writer.write(out_file)
    return output_path
def gemini_generate_content(prompt_text, file_upload_obj=None, image_obj=None, model_name="gemini-2.5-pro", fallback_model="gemini-2.5-flash", fallback_model_2="gemini-2.5-flash-lite", file_path=None):
    """
    Send prompt_text and optionally an uploaded file (or an image object/list) to the model using NEW SDK.
    Automatically rotates through available API keys on RESOURCE_EXHAUSTED errors.
    When rotating keys with file uploads, re-uploads the file with the new API key.

    Retry strategy per API key: primary model -> fallback_model -> fallback_model_2;
    only when all three are exhausted (HTTP 429) does it rotate to the next key.

    Args:
        prompt_text: The prompt to send
        file_upload_obj: Previously uploaded file object (optional)
        image_obj: Image or list of images (optional; str entries are treated as paths)
        model_name: Primary model to use
        fallback_model: First fallback model if primary fails
        fallback_model_2: Second fallback model if first fallback fails
        file_path: Local file path (needed for re-upload when rotating keys)
    Returns textual response and prints progress.
    """
    # NOTE(review): this pre-loop contents build is immediately rebuilt at the
    # top of the retry loop below; it is effectively redundant.
    contents = [prompt_text]
    current_file_obj = file_upload_obj
    if current_file_obj:
        contents.append(current_file_obj)
    if image_obj:
        # image_obj may be a single item or a list; string entries are file
        # paths opened as PIL images, everything else is passed through as-is.
        if isinstance(image_obj, list):
            for img_path in image_obj:
                if isinstance(img_path, str):
                    pil_img = Image.open(img_path)
                    contents.append(pil_img)
                else:
                    contents.append(img_path)
        else:
            if isinstance(image_obj, str):
                pil_img = Image.open(image_obj)
                contents.append(pil_img)
            else:
                contents.append(image_obj)
    print("πŸ“‘ Sending request to Gemini (prompt length:", len(prompt_text), "chars )")
    # Try with all available API keys
    max_attempts = len(client_manager.api_keys)
    attempt = 0
    while attempt < max_attempts:
        current_client = client_manager.get_current_client()
        current_key_num = client_manager.current_key_index + 1
        # Update contents with current file object (current_file_obj may have
        # been re-uploaded under a different key on a previous iteration).
        contents = [prompt_text]
        if current_file_obj:
            contents.append(current_file_obj)
        if image_obj:
            if isinstance(image_obj, list):
                for img_path in image_obj:
                    if isinstance(img_path, str):
                        pil_img = Image.open(img_path)
                        contents.append(pil_img)
                    else:
                        contents.append(img_path)
            else:
                if isinstance(image_obj, str):
                    pil_img = Image.open(image_obj)
                    contents.append(pil_img)
                else:
                    contents.append(image_obj)
        # Try primary model first
        try:
            print(f"πŸ”‘ Using API key #{current_key_num} with model {model_name}")
            response = current_client.models.generate_content(
                model=model_name,
                contents=contents
            )
            raw_text = response.text
            print(f"πŸ“₯ Received response (chars): {len(raw_text)}")
            # Success! Reset to primary key for next request
            client_manager.reset_to_primary()
            return raw_text
        except Exception as e:
            error_str = str(e)
            print(f"❌ Generation failed with API key #{current_key_num} and model {model_name}: {e}")
            # Check if it's a RESOURCE_EXHAUSTED error (matched by substring
            # on the exception text — "429" or "RESOURCE_EXHAUSTED").
            if "429" in error_str or "RESOURCE_EXHAUSTED" in error_str:
                print(f"⚠️ Quota exhausted for API key #{current_key_num} with model {model_name}")
                # Try first fallback model with SAME API key
                print(f"⚑ Trying fallback model {fallback_model} with same API key #{current_key_num}")
                try:
                    response = current_client.models.generate_content(
                        model=fallback_model,
                        contents=contents
                    )
                    raw_text = response.text
                    print(f"πŸ“₯ Received response (chars): {len(raw_text)}")
                    client_manager.reset_to_primary()
                    return raw_text
                except Exception as e_fallback:
                    error_fallback_str = str(e_fallback)
                    print(f"❌ Fallback model {fallback_model} also failed: {e_fallback}")
                    # Check if first fallback also exhausted
                    if "429" in error_fallback_str or "RESOURCE_EXHAUSTED" in error_fallback_str:
                        print(f"⚠️ First fallback model also exhausted for API key #{current_key_num}")
                        # Try second fallback model with SAME API key
                        print(f"⚑ Trying second fallback model {fallback_model_2} with same API key #{current_key_num}")
                        try:
                            response = current_client.models.generate_content(
                                model=fallback_model_2,
                                contents=contents
                            )
                            raw_text = response.text
                            print(f"πŸ“₯ Received response (chars): {len(raw_text)}")
                            client_manager.reset_to_primary()
                            return raw_text
                        except Exception as e_fallback_2:
                            error_fallback_2_str = str(e_fallback_2)
                            print(f"❌ Second fallback model {fallback_model_2} also failed: {e_fallback_2}")
                            # Check if second fallback also exhausted
                            if "429" in error_fallback_2_str or "RESOURCE_EXHAUSTED" in error_fallback_2_str:
                                print(f"⚠️ All 3 models exhausted for API key #{current_key_num}")
                                # Now try next API key if available
                                if attempt < max_attempts - 1:
                                    # Check if we have file uploads and can re-upload
                                    # (files are scoped to the key they were uploaded with).
                                    if file_upload_obj and file_path:
                                        print(f"πŸ”„ Rotating to next API key and re-uploading file...")
                                        client_manager.rotate_to_next_key()
                                        # Re-upload file with new API key
                                        try:
                                            print(f"πŸ“€ Re-uploading file with API key #{client_manager.current_key_index + 1}...")
                                            current_file_obj = upload_to_gemini(file_path)
                                            print(f"βœ… File re-uploaded successfully")
                                        except Exception as upload_error:
                                            print(f"❌ Failed to re-upload file: {upload_error}")
                                            raise Exception(f"Failed to re-upload file with new API key: {upload_error}")
                                        attempt += 1
                                        print(f"πŸ”„ Retrying with next API key (attempt {attempt + 1}/{max_attempts})...")
                                        continue
                                    elif file_upload_obj and not file_path:
                                        print("⚠️ WARNING: Cannot rotate API keys - file_path not provided for re-upload!")
                                        print(" To enable API key rotation with file uploads, pass file_path parameter.")
                                        raise Exception(f"All 3 models exhausted for API key #{current_key_num}. Cannot rotate without file_path.")
                                    else:
                                        # No file uploads, safe to rotate
                                        client_manager.rotate_to_next_key()
                                        attempt += 1
                                        print(f"πŸ”„ Trying next API key (attempt {attempt + 1}/{max_attempts})...")
                                        continue
                                else:
                                    raise Exception(f"All {max_attempts} API key(s) exhausted with all 3 models.")
                            else:
                                # Second fallback failed with different error
                                raise Exception(f"Second fallback model failed: {e_fallback_2}")
                    else:
                        # First fallback failed with different error
                        raise Exception(f"First fallback model failed: {e_fallback}")
            elif "403" in error_str or "PERMISSION_DENIED" in error_str:
                # This happens when trying to access a file uploaded with a different API key
                print(f"⚠️ Permission denied - likely due to file uploaded with different API key")
                # Try to re-upload if we have the file path
                if file_path and attempt < max_attempts - 1:
                    print(f"πŸ”„ Attempting to re-upload file with next API key...")
                    client_manager.rotate_to_next_key()
                    try:
                        print(f"πŸ“€ Re-uploading file with API key #{client_manager.current_key_index + 1}...")
                        current_file_obj = upload_to_gemini(file_path)
                        print(f"βœ… File re-uploaded successfully")
                        attempt += 1
                        print(f"πŸ”„ Retrying with next API key (attempt {attempt + 1}/{max_attempts})...")
                        continue
                    except Exception as upload_error:
                        print(f"❌ Failed to re-upload file: {upload_error}")
                        raise Exception(f"Failed to re-upload file with new API key: {upload_error}")
                else:
                    raise Exception(f"File access denied. Cannot re-upload without file_path. Error: {e}")
            else:
                # Other error - try fallback models with same key
                print(f"⚑ Trying fallback model {fallback_model} with same API key #{current_key_num}")
                try:
                    response = current_client.models.generate_content(
                        model=fallback_model,
                        contents=contents
                    )
                    raw_text = response.text
                    print(f"πŸ“₯ Received response (chars): {len(raw_text)}")
                    client_manager.reset_to_primary()
                    return raw_text
                except Exception as e2:
                    print(f"❌ First fallback also failed: {e2}")
                    # Try second fallback
                    print(f"⚑ Trying second fallback model {fallback_model_2} with same API key #{current_key_num}")
                    try:
                        response = current_client.models.generate_content(
                            model=fallback_model_2,
                            contents=contents
                        )
                        raw_text = response.text
                        print(f"πŸ“₯ Received response (chars): {len(raw_text)}")
                        client_manager.reset_to_primary()
                        return raw_text
                    except Exception as e3:
                        print(f"❌ Second fallback also failed: {e3}")
                        # If we have more keys, try them
                        if attempt < max_attempts - 1:
                            if file_upload_obj and file_path:
                                print(f"πŸ”„ Rotating to next API key and re-uploading file...")
                                client_manager.rotate_to_next_key()
                                try:
                                    print(f"πŸ“€ Re-uploading file with API key #{client_manager.current_key_index + 1}...")
                                    current_file_obj = upload_to_gemini(file_path)
                                    print(f"βœ… File re-uploaded successfully")
                                except Exception as upload_error:
                                    print(f"❌ Failed to re-upload file: {upload_error}")
                                    raise Exception(f"Failed to re-upload file with new API key: {upload_error}")
                                attempt += 1
                                print(f"πŸ”„ Retrying with next API key (attempt {attempt + 1}/{max_attempts})...")
                                continue
                            elif file_upload_obj and not file_path:
                                raise Exception(f"All models failed. Cannot rotate keys without file_path. Last error: {e3}")
                            else:
                                client_manager.rotate_to_next_key()
                                attempt += 1
                                print(f"πŸ”„ Trying next API key (attempt {attempt + 1}/{max_attempts})...")
                                continue
                        else:
                            raise Exception(f"All attempts failed. Last error: {e3}")
    # If we exhausted all attempts
    raise Exception(f"❌ All {max_attempts} API key(s) exhausted. Please check your quota or try again later.")
# ---------------- PARSERS ----------------
def extract_question_ids_from_qpms(text: str):
    """Pull question IDs out of a QP+MS transcript.

    Prefers explicit 'Question X' lines; falls back to lines that begin with a
    numbered-list style ID. Returns a (possibly empty) list of ID strings.
    """
    print("πŸ”Ž Extracting question IDs from QP+MS transcript using regex...")
    normalized = text.replace("\u00A0", " ").replace("\t", " ")
    ids = re.findall(r"^\s*Question\s*[:\s]\s*([\dA-Za-z.()]+)", normalized, re.MULTILINE)
    if ids:
        print(f"βœ… Extracted {len(ids)} question IDs from explicit 'Question X' lines.")
        print("IDs:", ids)
        return ids
    ids = re.findall(r"^\s*(\d+(?:[.)]|\([a-zA-Z0-9]+\))?[a-zA-Z0-9]*)", normalized, re.MULTILINE)
    if ids:
        print(f"βœ… Extracted {len(ids)} question IDs (fallback numbered lists).")
        print("IDs:", ids)
    else:
        print("⚠️ No question IDs extracted; will send NA placeholder.")
    return ids
def build_as_cot_prompt_with_expected_ids(expected_ids, qpms_text=None):
    """
    Build the answer-sheet transcription prompt.

    Injects the expected question IDs block (or an NA placeholder), an optional
    QP+MS transcript section for disambiguation, Chain-of-Thought (<think>)
    instructions, LaTeX math delimiters, and graph-detection output markers.
    """
    # IDs block: literal "{NA}" when nothing was extracted upstream.
    ids_block = "{NA}" if not expected_ids else "{\n" + "\n".join(expected_ids) + "\n}"
    # Optional QP+MS context block (only present when a transcript is supplied).
    if qpms_text is None:
        qpms_section = ""
    else:
        qpms_section = (
            "\nYou are also provided with the full transcript of the Question Paper and Markscheme (QP+MS) below."
            "\nUse it primarily to resolve ambiguous handwriting and to confirm expected answers when needed."
            "\n--- BEGIN QP+MS TRANSCRIPT ---\n"
            f"{qpms_text.strip()}\n"
            "--- END QP+MS TRANSCRIPT ---\n"
        )
    return f"""You are a high-quality handwritten transcription assistant, performing transcription with a Chain-of-Thought process.
INPUT: This PDF contains a student's handwritten answer sheet.
{qpms_section}
TASK:
1. **THINKING:** Before transcribing each answer, document your thought process inside a **<think>** tag.
- Identify the question ID. If inferred, note why.
- Detail any ambiguities (unclear numbers, symbols, or structures).
- Explain how ambiguities were resolved, including whether the QP+MS transcript was consulted.
- If QP+MS was consulted but you chose not to change the transcription, state this.
- If the initial question label was incorrect (e.g., 2.a vs 2.b), correct it and briefly explain the reasoning in <think>.
*Example Thinking:*
<think>
- Found Question 3(a).
- The term could be '$2x$' or '21x'.
- Markscheme uses '$21x$', but handwriting matches '$2x$'.
- Decision: transcribe '$2x$'.
</think>
2. **TRANSCRIPTION:** Transcribe the student's answers directly and faithfully.
- Assign each answer to a labelled question ID when present.
- For unlabeled answers, segment logically and mark inferred IDs as "**INFERRED: <id>**".
- **Mathematical expressions and standalone variables must appear inside LaTeX dollar delimiters ($...$).**
- If a diagram/graph is omitted, write **[Graph omitted]**.
- If handwriting is unreadable: **[illegible]**.
**ANSWER-INTERPRETATION RULES:**
- If the student writes β€œNA”, β€œN/A”, β€œNot Applicable”, or clear equivalents β†’ record exactly as **NA**.
- If the student leaves the space blank, crosses it out, makes no meaningful attempt, or provides no answer β†’ record **[No response]**.
Ensure deterministic formatting so subsequent models can grade directly from this aligned format.
Expected questions (if missing, write NA):
{ids_block}
-----------------------
OUTPUT FORMAT:
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
<think>...</think>
Question <id>
AS:<transcribed answer or placeholder>
...
==== GRAPH FOUND ANSWERS ====
Graph found in:
- Answer <number> β†’ Page <number>
(one per line)
==== END GRAPH FOUND ===="""
def extract_graph_questions_from_ms(text: str):
    """Map question IDs to page numbers from the MS transcript's
    'GRAPH EXPECTED QUESTIONS' marker block; {} when the block is absent."""
    normalized = text.replace("\u00A0", " ").replace("\t", " ")
    section = re.search(r"==== GRAPH EXPECTED QUESTIONS ====\s*(.*?)\s*==== END GRAPH EXPECTED ====",
                        normalized, re.S)
    pages_by_question = {}
    if section:
        for raw_line in section.group(1).splitlines():
            raw_line = raw_line.strip()
            if not raw_line.startswith("- Question"):
                continue
            hit = re.match(r"- Question\s+([\dA-Za-z.()]+)\s*β†’\s*Page\s*(\d+)", raw_line)
            if hit:
                pages_by_question[hit.group(1)] = int(hit.group(2))
    return pages_by_question
def extract_graph_answers_from_as(text: str):
    """Map answer IDs to page numbers from the AS transcript's
    'GRAPH FOUND ANSWERS' marker block; {} when the block is absent."""
    normalized = text.replace("\u00A0", " ").replace("\t", " ")
    section = re.search(r"==== GRAPH FOUND ANSWERS ====\s*(.*?)\s*==== END GRAPH FOUND ====",
                        normalized, re.S)
    pages_by_answer = {}
    if section:
        for raw_line in section.group(1).splitlines():
            raw_line = raw_line.strip()
            if not raw_line.startswith("- Answer"):
                continue
            hit = re.match(r"- Answer\s+([\dA-Za-z.()]+)\s*β†’\s*Page\s*(\d+)", raw_line)
            if hit:
                pages_by_answer[hit.group(1)] = int(hit.group(2))
    return pages_by_answer
def extract_marks_from_grading(grading_text):
    """
    Parse the grading markdown and collect mark codes per question, reading
    only the 'Awarded' column (4th table column) of each question block.

    Returns a dict of the form {"grading": [{"question": ..., "marks_awarded": [...]}]}.
    """
    print("πŸ”Ž Extracting awarded marks from grading output...")
    result = {"grading": []}
    # Blocks are delimited by '### Question <id>' headings; index 0 is preamble.
    for block in re.split(r"###\s*Question\s+", grading_text)[1:]:
        block_lines = block.strip().splitlines()
        first_line = block_lines[0].strip() if block_lines else ""
        id_match = re.match(r"([0-9]+(?:[a-zA-Z]|\([^)]+\)|(?:\.[a-zA-Z0-9]+))*)", first_line)
        if id_match:
            q_id = id_match.group(1).strip()
        else:
            q_id = first_line.split()[0] if first_line else ""
        awarded = []
        for line in block.split('\n'):
            if '|' not in line:
                continue
            cells = [cell.strip() for cell in line.split('|')]
            # Data rows only (skip the '---' separator); cells[0] is the empty
            # string produced by the leading pipe, so the Awarded column is index 4.
            if len(cells) >= 5 and not cells[1].startswith('-'):
                awarded.extend(re.findall(r"\b([MABCR]\d+|[MABCR]0)\b", cells[4]))
        result["grading"].append({
            "question": q_id,
            "marks_awarded": awarded
        })
    print("βœ… Extracted grading marks for", len(result["grading"]), "question blocks.")
    print(json.dumps(result, indent=2))
    return result
def check_and_correct_total_marks(grading_text):
    """
    Verifies the total marks in the Examiner's Summary Report against
    the sum of individual question marks. Corrects if discrepancy found.

    Args:
        grading_text (str): The full grading markdown text

    Returns:
        tuple: (corrected_text, calculated_awarded, calculated_possible, was_corrected)
    """
    print("\n" + "="*60)
    print("πŸ” VERIFYING TOTAL MARKS IN SUMMARY REPORT")
    print("="*60)
    question_marks = {}
    calculated_total_awarded = 0
    calculated_total_possible = 0
    # Pattern matches BOTH formats:
    #   ### Question <1.a>  (with angle brackets)
    #   ### Question 1.a    (without angle brackets)
    # <? / >? make the brackets optional; the lazy [\s\S]*? stops at the
    # FIRST per-question "**Total: a/p**" line after the heading.
    question_block_pattern = re.compile(
        r"### Question\s*<?([0-9]+(?:[.()][a-z0-9]+)*)>?\s*[\s\S]*?\*\*Total:\s*(\d+)/(\d+)\*\*",
        re.DOTALL | re.IGNORECASE
    )
    matches = question_block_pattern.finditer(grading_text)
    for match in matches:
        question_id = match.group(1).strip()
        awarded = int(match.group(2))
        possible = int(match.group(3))
        question_marks[question_id] = {'awarded': awarded, 'possible': possible}
        calculated_total_awarded += awarded
        calculated_total_possible += possible
    # FIX: this log line was mojibake ("οΏ½ Exltracted ...") β€” restored to readable text.
    print(f"\nπŸ“Š Extracted marks from {len(question_marks)} questions:")
    for q_id, marks in question_marks.items():
        print(f" Question {q_id}: {marks['awarded']}/{marks['possible']}")
    print(f"\nπŸ“ˆ Calculated totals from individual questions:")
    print(f" Awarded: {calculated_total_awarded}")
    print(f" Possible: {calculated_total_possible}")
    # Find the summary report section; without it there is nothing to verify.
    summary_report_start = grading_text.find("### Examiner's Summary Report")
    if summary_report_start == -1:
        print("⚠️ Warning: Could not find '### Examiner's Summary Report' section.")
        return grading_text, calculated_total_awarded, calculated_total_possible, False
    summary_section = grading_text[summary_report_start:]
    summary_total_pattern = re.compile(r"(\*\*Total:\s*)(\d+)/(\d+)(\*\*)")
    summary_match = summary_total_pattern.search(summary_section)
    original_summary_awarded = 0
    original_summary_possible = 0
    if summary_match:
        original_summary_awarded = int(summary_match.group(2))
        original_summary_possible = int(summary_match.group(3))
        print(f"\nπŸ“‹ Original summary report total: {original_summary_awarded}/{original_summary_possible}")
    else:
        print("⚠️ Warning: Could not find overall total in summary report.")
        return grading_text, calculated_total_awarded, calculated_total_possible, False
    # Compare the report's totals against the recomputed sums.
    corrected_report_text = grading_text
    total_mismatch = False
    if calculated_total_awarded != original_summary_awarded:
        print(f"\n❌ DISCREPANCY FOUND in awarded marks!")
        print(f" Calculated: {calculated_total_awarded}")
        print(f" Reported: {original_summary_awarded}")
        total_mismatch = True
    if calculated_total_possible != original_summary_possible:
        print(f"\n❌ DISCREPANCY FOUND in possible marks!")
        print(f" Calculated: {calculated_total_possible}")
        print(f" Reported: {original_summary_possible}")
        total_mismatch = True
    if total_mismatch:
        print(f"\nπŸ”§ CORRECTING summary total:")
        print(f" FROM: {original_summary_awarded}/{original_summary_possible}")
        print(f" TO: {calculated_total_awarded}/{calculated_total_possible}")
        # Rewrite only the FIRST total inside the summary section so the
        # per-question totals earlier in the document are left untouched.
        corrected_summary_section = re.sub(
            summary_total_pattern,
            rf"\g<1>{calculated_total_awarded}/{calculated_total_possible}\g<4>",
            summary_section,
            count=1
        )
        corrected_report_text = grading_text[:summary_report_start] + corrected_summary_section
        print("βœ… Total marks corrected successfully!")
    else:
        print("\nβœ… Total marks are CORRECT - no correction needed!")
    print("="*60 + "\n")
    return corrected_report_text, calculated_total_awarded, calculated_total_possible, total_mismatch
# ---------------- MAPPING/IMPRINT HELPERS ----------------
def ask_gemini_for_mapping_batch(image_paths, grading_json, expected_ids=None, rows=GRID_ROWS, cols=GRID_COLS):
    """
    Send multiple page images together to Gemini for batch mapping processing.

    Args:
        image_paths: Paths of grid-numbered page images (one per PDF page).
        grading_json: Dict with a "grading" list of {"question", "marks_awarded"}.
        expected_ids: Optional list of question IDs the pages may contain;
            forwarded to the prompt as a hint.
        rows, cols: Grid dimensions drawn on the page images.

    Returns:
        list[dict]: Parsed mapping entries like
        {"page": 1, "question": "1(a)", "cell_number": 15}; [] on failure.
    """
    ids_block = "{NA}"
    if expected_ids:
        ids_block = "{\n" + "\n".join(expected_ids) + "\n}"
    prompt = f"""You are an exam marker. Your role is to identify where each question begins on each page.
The pages are divided into a {rows} x {cols} grid. Each cell has a RUNNING NUMBER label.
For each question in the grading JSON, return the cell NUMBER where the FIRST STEP of that question begins.
⚠ IMPORTANT RULES:
- Do not place marks inside another question's answer area.
- Prefer placing the marks in a BLANK cell immediately to the RIGHT of the answer step. If no blank cell is available to the right, then place in a blank cell to the LEFT.
- Never place marks above or below the answer.
- Each question should have unique cell number
- If a question serial number is visible in the answer image, you must mandatorily identify the corresponding question using the grading JSON.
IMPORTANT: For your help i have provided u questions that u can expect in the images:
{ids_block}
Return JSON only, like:
[{{"page": 1, "question": "1(a)", "cell_number": 15}}, ...]
Grading JSON:
{json.dumps(grading_json, indent=2)}"""
    images = [Image.open(p) for p in image_paths]
    print(f"πŸ“‘ Sending batch mapping request for {len(image_paths)} pages to Gemini...")
    try:
        contents = [prompt] + images
        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=contents
        )
        raw_text = response.text
    except Exception as e:
        # FIX: was a bare `except:` which also swallowed KeyboardInterrupt /
        # SystemExit and hid the actual failure reason. Narrowed and logged.
        print(f"⚠️ Primary mapping model failed: {e}")
        print("⚠️ Trying fallback model for mapping...")
        contents = [prompt] + images
        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-09-2025",
            contents=contents
        )
        raw_text = response.text
    print("πŸ“₯ Batch mapping response (chars):", len(raw_text))
    print("πŸ”Ž Gemini raw batch output:")
    print(raw_text)
    # The model may wrap the JSON in prose; grab the outermost [...] array.
    try:
        match = re.search(r'(\[.*\])', raw_text, re.DOTALL)
        if match:
            mapping = json.loads(match.group(1))
            print(f"βœ… Parsed Gemini batch mapping for {len(image_paths)} pages")
            return mapping
        else:
            print("❌ Failed to find JSON array in response")
            return []
    except Exception as e:
        print(f"❌ Failed to parse Gemini JSON mapping: {e}")
        return []
def normalize_question_id(qid):
    """
    Canonicalize a question ID so different notations compare equal.

    Examples:
        "1(a)"   -> "1.a"
        "2(c).i" -> "2.c.i"
        "3.d.ii" -> "3.d.ii"  (already canonical)
    """
    if not qid:
        return qid
    # "1(a)" -> "1.a": single parenthesized letter after the question number.
    canonical = re.sub(r'(\d+)\(([a-zA-Z])\)', r'\1.\2', qid)
    # "2(cd)." -> "2.cd.": multi-letter part immediately followed by a dot.
    canonical = re.sub(r'(\d+)\(([a-zA-Z]+)\)\.', r'\1.\2.', canonical)
    return canonical
def imprint_marks_using_mapping(pdf_path, grading_json, output_pdf, expected_ids=None, rows=GRID_ROWS, cols=GRID_COLS):
    """
    Convert PDF to images, create grid-numbered images for batch sending to Gemini,
    then annotate and produce imprinted PDF.

    Three passes: (1) overlay a numbered rows x cols grid on each page image,
    (2) ask Gemini which grid cell each question's answer starts in,
    (3) draw the awarded mark codes in red near that cell with OpenCV, then
    merge the annotated pages into a compressed PDF.

    Args:
        pdf_path: Student answer-sheet PDF to annotate.
        grading_json: {"grading": [{"question", "marks_awarded"}]} as produced
            by extract_marks_from_grading().
        output_pdf: Destination path for the annotated PDF.
        expected_ids: Optional question IDs forwarded to the mapping prompt.
        rows, cols: Grid dimensions overlaid on each page.

    Returns:
        Path of the compressed annotated PDF (as returned by compress_pdf).

    NOTE(review): the per-page "page_*_grid.png"/"annotated_page_*.png" files
    are written to the working directory and never cleaned up β€” presumably
    intentional for debugging; confirm.
    """
    print("πŸ“„ Converting answer PDF to images for imprinting...")
    # 100 dpi keeps the grid images small enough to batch-upload.
    pages = convert_from_path(pdf_path, dpi=100)
    annotated_page_paths = []
    temp_grid_images = []
    # Pass 1: draw a running cell number at the center of every grid cell so
    # the model can reference positions by a single integer.
    for p_index, page in enumerate(pages):
        img = page.convert("RGB")
        w, h = img.size
        cell_w, cell_h = w / cols, h / rows
        draw = ImageDraw.Draw(img)
        try:
            num_font = ImageFont.truetype("arial.ttf", 20)
        except Exception:
            # Arial may be unavailable (e.g. Linux); fall back to PIL's default.
            num_font = ImageFont.load_default()
        cell_num = 1
        for r in range(rows):
            for c in range(cols):
                # Center of cell (r, c) in pixels.
                x = int(c * cell_w + cell_w / 2)
                y = int(r * cell_h + cell_h / 2)
                text = str(cell_num)
                bbox = draw.textbbox((0, 0), text, font=num_font)
                tw = bbox[2] - bbox[0]
                th = bbox[3] - bbox[1]
                draw.text((x - tw/2, y - th/2), text, fill="black", font=num_font)
                cell_num += 1
        temp_path = f"page_{p_index+1}_grid.png"
        img.save(temp_path, "PNG")
        temp_grid_images.append(temp_path)
        print("πŸ›° Created grid image:", temp_path)
    print("πŸ“‘ Sending page images to Gemini in batches for mapping...")
    # Pass 2: query Gemini 10 pages at a time; results accumulate as
    # {"page", "question", "cell_number"} dicts across all batches.
    batch_size = 10
    all_mappings = []
    for start in range(0, len(temp_grid_images), batch_size):
        batch_paths = temp_grid_images[start:start+batch_size]
        batch_mapping = ask_gemini_for_mapping_batch(batch_paths, grading_json, expected_ids, rows, cols)
        all_mappings.extend(batch_mapping)
        print(f"βœ… Processed batch {start//batch_size + 1}: pages {start+1}-{start+len(batch_paths)}")
    print("πŸ–Š Annotating pages with marks...")
    # Pass 3: render the awarded marks onto clean (grid-free) page images.
    for p_index, page in enumerate(pages):
        page_num = p_index + 1
        page_img = page.convert("RGB")
        img_cv = np.array(page_img)
        img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
        h, w, _ = img_cv.shape
        cell_w_px, cell_h_px = w / cols, h / rows
        page_mappings = [m for m in all_mappings if m.get("page") == page_num]
        for item in page_mappings:
            qid = item.get("question")
            cell_number = item.get("cell_number")
            if qid is None or cell_number is None:
                continue
            # Normalize the question ID from Gemini mapping
            normalized_qid = normalize_question_id(qid)
            # Try exact match first with normalized ID
            marks_list = next((g["marks_awarded"] for g in grading_json.get("grading", [])
                               if g["question"] == normalized_qid), [])
            # If no match, try case-insensitive match
            if not marks_list:
                marks_list = next((g["marks_awarded"] for g in grading_json.get("grading", [])
                                   if g["question"].lower() == normalized_qid.lower()), [])
            # If still no match, try with original qid
            if not marks_list:
                marks_list = next((g["marks_awarded"] for g in grading_json.get("grading", [])
                                   if g["question"] == qid), [])
            marks_text = ",".join(marks_list) if marks_list else "?"
            if marks_text == "?":
                print(f"⚠️ No marks found for question '{qid}' (normalized: '{normalized_qid}') on page {page_num}")
            # Convert the 1-based cell number back to (row, col), then place
            # the text toward the right edge of that cell.
            row = (cell_number - 1) // cols
            col = (cell_number - 1) % cols
            x_c = int((col + 1) * cell_w_px - cell_w_px / 4)
            y_c = int((row + 0.5) * cell_h_px)
            # Scale font/stroke with cell height so marks stay legible.
            font_scale = max(1.0, min(2.0, cell_h_px / 40.0))
            thickness = max(2, int(font_scale * 2))
            cv2.putText(img_cv, marks_text, (x_c, y_c), cv2.FONT_HERSHEY_SIMPLEX,
                        font_scale, (0, 0, 255), thickness, cv2.LINE_AA)
            print(f"πŸ–Š Marks annotated for page {page_num}, question {qid}: {marks_text}")
        annotated_path = f"annotated_page_{page_num}.png"
        cv2.imwrite(annotated_path, img_cv)
        annotated_page_paths.append(annotated_path)
        print("βœ… Annotated page saved:", annotated_path)
    print("πŸ“‘ Merging annotated pages into final PDF...")
    with open(output_pdf, "wb") as f:
        f.write(img2pdf.convert(annotated_page_paths))
    compressed = compress_pdf(output_pdf)
    print("πŸ“‘ Imprinted PDF saved to:", compressed)
    return compressed
def extract_pdf_pages_as_images(pdf_path, page_numbers, prefix):
    """
    Extracts unique pages (1-based) from a PDF as images, saves as PNG, returns list of file paths.
    Handles cases where requested pages don't exist in the PDF.

    Args:
        pdf_path: Path to the source PDF.
        page_numbers: Iterable of 1-based page numbers (duplicates allowed).
        prefix: Filename prefix for the saved "<prefix>_page_<n>.png" files.

    Returns:
        list[str]: Paths of the PNGs actually written (may be empty).
    """
    if not page_numbers:
        print(f"⚠️ No page numbers provided for extraction")
        return []
    unique_pages = sorted(set(page_numbers))
    # Validate requested pages against the real page count so we never ask
    # pdf2image for pages that are out of range.
    try:
        # PdfReader is imported at module level (from PyPDF2); the previous
        # redundant in-function import was removed.
        reader = PdfReader(pdf_path)
        total_pages = len(reader.pages)
        print(f"πŸ“„ PDF has {total_pages} total pages")
        valid_pages = [p for p in unique_pages if 1 <= p <= total_pages]
        # Set difference instead of the former O(n^2) "p not in valid_pages" scan.
        invalid_pages = sorted(set(unique_pages) - set(valid_pages))
        if invalid_pages:
            print(f"⚠️ Skipping invalid page numbers (out of range): {invalid_pages}")
        if not valid_pages:
            print(f"❌ No valid pages to extract from {pdf_path}")
            return []
        unique_pages = valid_pages
    except Exception as e:
        # Best-effort: if validation fails we still try the extraction below.
        print(f"⚠️ Could not validate page numbers: {e}. Proceeding with extraction...")
    # Convert only the [min, max] page span; indices below are relative to it.
    try:
        images = convert_from_path(pdf_path, dpi=200, first_page=min(unique_pages), last_page=max(unique_pages))
    except Exception as e:
        print(f"❌ Failed to convert PDF pages to images: {e}")
        return []
    out_paths = []
    for idx, page_num in enumerate(unique_pages):
        img_idx = page_num - min(unique_pages)
        # Bounds check: the converter may return fewer images than the span.
        if img_idx >= len(images):
            print(f"⚠️ Page {page_num} not found in extracted images (index {img_idx} >= {len(images)}). Skipping...")
            continue
        try:
            img = images[img_idx]
            out_path = f"{prefix}_page_{page_num}.png"
            img.save(out_path, "PNG")
            print(f"πŸ“€ Extracted graph page {page_num} from {pdf_path} as {out_path}")
            out_paths.append(out_path)
        except Exception as e:
            print(f"❌ Failed to save page {page_num}: {e}")
            continue
    return out_paths
# ---------------- PIPELINE ----------------
def align_and_grade_pipeline(qp_path, ms_path, ans_path, subject="Maths", imprint=False, run_timestamp=None):
    """
    Final pipeline with graph-aware grading logic using NEW SDK.

    Steps: compress & merge QP+MS -> upload to Gemini -> transcribe QP+MS and
    answer sheet (with graph-page detection) -> grade the transcripts (with
    extracted graph pages as visual context) -> verify/correct totals ->
    render PDFs, optionally imprint marks, optionally upload to Supabase.

    Args:
        qp_path: Path to Question Paper PDF
        ms_path: Path to Markscheme PDF
        ans_path: Path to Answer Sheet PDF
        subject: Subject name (Maths or Science)
        imprint: Whether to generate imprinted PDF
        run_timestamp: Unix timestamp for organizing files in Supabase

    Returns:
        tuple: (qpms_text, as_text, grading_text, grading_pdf_path,
        imprinted_pdf_path, output_urls). On any exception the first element
        is an error string and the remaining values are None / {}.
    """
    try:
        print("πŸ” Starting pipeline...")
        # Compress inputs first to keep uploads small.
        qp_path = compress_pdf(qp_path)
        ms_path = compress_pdf(ms_path)
        ans_path = compress_pdf(ans_path)
        # QP and MS are merged so one transcription call sees both.
        merged_qpms_path = os.path.splitext(qp_path)[0] + "_merged_qp_ms.pdf"
        merge_pdfs([qp_path, ms_path], merged_qpms_path)
        print("πŸ“Ž Merged QP + MS ->", merged_qpms_path)
        print("πŸ”Ό Uploading files to Gemini...")
        merged_uploaded = upload_to_gemini(merged_qpms_path)
        ans_uploaded = upload_to_gemini(ans_path)
        print("βœ… Upload complete.")
        # Step 1.i: transcribe QP+MS; the appended instruction asks the model
        # to flag questions whose markscheme expects a graph.
        print("1.i) Transcribing QP+MS (questions first, then full markscheme, with graph detection)...")
        qpms_prompt = QP_MS_TRANSCRIPTION_PROMPT["content"] + "\nAt the end, also list all questions in the markscheme where a graph is expected, in the format:\nGraph expected in:\n- Question <number> β†’ Page <number>\n(One per line, after ==== MARKSCHEME END ====)"
        qpms_text = gemini_generate_content(qpms_prompt, file_upload_obj=merged_uploaded, model_name="gemini-2.5-flash", fallback_model="gemini-2.5-flash-preview-09-2025", fallback_model_2="gemini-2.5-flash-lite", file_path=merged_qpms_path)
        print("πŸ“„ QP+MS transcription received. Saving debug file: debug_qpms_transcript.txt")
        with open("debug_qpms_transcript.txt", "w", encoding="utf-8") as f:
            f.write(qpms_text)
        # Extract graph-bearing MS pages as images for later visual grading.
        ms_graph_mapping = extract_graph_questions_from_ms(qpms_text)
        print("πŸ–ΌοΈ Graph-expected questions in MS:", ms_graph_mapping)
        ms_graph_pages = list(ms_graph_mapping.values())
        ms_graph_images = []
        if ms_graph_pages:
            ms_graph_images = extract_pdf_pages_as_images(merged_qpms_path, ms_graph_pages, prefix="qpms_graph")
        # Expected question IDs guide the answer-sheet transcription.
        extracted_ids = extract_question_ids_from_qpms(qpms_text)
        if not extracted_ids:
            extracted_ids = ["NA"]
        # Step 1.ii: transcribe the student's answer sheet with the same
        # graph-detection instruction appended.
        print("1.ii) Building AS transcription prompt with expected question IDs and graph detection, sending to Gemini...")
        as_prompt = build_as_cot_prompt_with_expected_ids(extracted_ids, qpms_text) + "\nAt the end, also list all answers where a graph is found, in the format:\nGraph found in:\n- Answer <number> β†’ Page <number>\n(One per line, after all answers)"
        as_text = gemini_generate_content(as_prompt, file_upload_obj=ans_uploaded, model_name="gemini-2.5-flash", fallback_model="gemini-2.5-flash-preview-09-2025", fallback_model_2="gemini-2.5-flash-lite", file_path=ans_path)
        print("πŸ“ AS transcription received. Saving debug file: debug_as_transcript.txt")
        with open("debug_as_transcript.txt", "w", encoding="utf-8") as f:
            f.write(as_text)
        # Extract graph-bearing answer-sheet pages as images too.
        as_graph_mapping = extract_graph_answers_from_as(as_text)
        print("πŸ–ΌοΈ Graph-attempted answers in AS:", as_graph_mapping)
        as_graph_pages = list(as_graph_mapping.values())
        as_graph_images = []
        if as_graph_pages:
            as_graph_images = extract_pdf_pages_as_images(ans_path, as_graph_pages, prefix="as_graph")
        # Step 2: grade both transcripts together, attaching any graph pages
        # as visual context.
        print("2) Preparing grading input and sending to Gemini for grading...")
        grading_input = (
            "=== QP+MS TRANSCRIPT BEGIN ===\n"
            + qpms_text
            + "\n=== QP+MS TRANSCRIPT END ===\n\n"
            + "=== ANSWER SHEET TRANSCRIPT BEGIN ===\n"
            + as_text
            + "\n=== ANSWER SHEET TRANSCRIPT END ===\n"
        )
        if ms_graph_images or as_graph_images:
            graph_note = "\n\n---\nSome questions require graphs. I've attached the relevant graph pages from QP+MS and from the Answer Sheet. Use them as visual context when grading.\n---\n"
            grading_input += graph_note
        grading_prompt_obj = get_grading_prompt(subject.lower())
        grading_prompt_system = grading_prompt_obj["content"]
        grading_images = ms_graph_images + as_graph_images
        grading_text = gemini_generate_content(grading_prompt_system + "\n\nPlease grade the following transcripts:\n" + grading_input, image_obj=grading_images if grading_images else None, model_name="gemini-2.5-pro", fallback_model="gemini-2.5-flash")
        print("🧾 Grading output received. Saving debug file: debug_grading.md")
        with open("debug_grading.md", "w", encoding="utf-8") as f:
            f.write(grading_text)
        # Verify and correct total marks if needed
        grading_text, calc_awarded, calc_possible, was_corrected = check_and_correct_total_marks(grading_text)
        if was_corrected:
            print("πŸ“ Saving corrected grading to debug file: debug_grading_corrected.md")
            with open("debug_grading_corrected.md", "w", encoding="utf-8") as f:
                f.write(grading_text)
        # Render the grading markdown to PDF (Pandoc/pdflatex downstream).
        base_name = os.path.splitext(os.path.basename(ans_path))[0]
        grading_pdf_path = save_as_pdf(grading_text, f"{base_name}_graded.pdf")
        print("πŸ“„ Grading PDF saved:", grading_pdf_path)
        grading_json = extract_marks_from_grading(grading_text)
        with open("debug_grading_json.json", "w", encoding="utf-8") as f:
            json.dump(grading_json, f, indent=2, ensure_ascii=False)
        print("πŸ”§ Grading marks extraction complete.")
        # Optional: imprint the awarded marks back onto the answer sheet.
        imprinted_pdf_path = None
        if imprint:
            print("✍ Imprint option enabled. Starting imprinting process...")
            imprinted_pdf_path = f"{base_name}_imprinted.pdf"
            imprinted_pdf_path = imprint_marks_using_mapping(ans_path, grading_json, imprinted_pdf_path, extracted_ids)
            print("βœ… Imprinting finished. Imprinted PDF at:", imprinted_pdf_path)
        # Upload output files to Supabase (using same timestamp as input files)
        output_urls = {
            "graded_pdf_url": None,
            "imprinted_pdf_url": None
        }
        if supabase_client:
            print("\nπŸ“€ Uploading output files to Supabase...")
            if grading_pdf_path:
                output_urls["graded_pdf_url"] = upload_file_to_supabase(grading_pdf_path, "graded", run_timestamp)
            if imprinted_pdf_path:
                output_urls["imprinted_pdf_url"] = upload_file_to_supabase(imprinted_pdf_path, "imprinted", run_timestamp)
        print("🏁 Pipeline finished successfully.")
        return qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path, output_urls
    except Exception as e:
        # Any failure is reported in-band so the UI can display it.
        print("❌ Pipeline error:", e)
        import traceback
        traceback.print_exc()
        return f"❌ Error: {e}", None, None, None, None, {}
# ---------------- GRADIO UI ----------------
# Top-level Gradio app definition; `demo` is launched by the __main__ guard.
with gr.Blocks(title="AI Grading (Pandoc + pdflatex)") as demo:
    gr.Markdown("## πŸ“˜ AI Grading β€” Using Pandoc + pdflatex for PDF Generation")
    gr.Markdown("**βœ… Now using Pandoc with pdflatex for professional-quality PDF outputs!**")
    # Surface whether cloud storage is active so users know where files go.
    if supabase_client:
        gr.Markdown("**☁️ Supabase Storage: Enabled** - All files will be uploaded to cloud storage")
    else:
        gr.Markdown("**⚠️ Supabase Storage: Disabled** - Files will only be processed locally")
    with gr.Row():
        qp_file = gr.File(label="πŸ“„ Upload Question Paper (PDF)")
        ms_file = gr.File(label="πŸ“„ Upload Markscheme (PDF)")
        ans_file = gr.File(label="πŸ“ Upload Student Answer Sheet (PDF)")
    with gr.Row():
        subject_dropdown = gr.Dropdown(
            choices=["Maths", "Science", "Economics"],
            value="Maths",
            label="πŸ“š Subject",
            info="Select the subject to apply appropriate grading guidelines"
        )
        imprint_toggle = gr.Checkbox(label="✍ Imprint Marks on Student Answer Sheet", value=False)
    run_button = gr.Button("πŸš€ Run Pipeline")
    # File URLs section (only shown if Supabase is enabled)
    if supabase_client:
        with gr.Accordion("☁️ Uploaded File URLs", open=False):
            file_urls_box = gr.Textbox(label="Cloud Storage URLs", lines=8, interactive=False)
    with gr.Row():
        qpms_box = gr.Textbox(label="πŸ“‘ QP+MS Transcript", lines=12)
        as_box = gr.Textbox(label="πŸ“ AS Transcript", lines=12)
    grading_output_box = gr.Textbox(label="🧾 Grading (Markdown)", lines=20)
    grading_pdf_file = gr.File(label="πŸ“₯ Download Grading PDF")
    imprint_pdf_file = gr.File(label="πŸ“₯ Download Imprinted PDF (Optional)")

    def run_pipeline(qp_file_obj, ms_file_obj, ans_file_obj, subject_choice, imprint_flag):
        # Click handler: validate uploads, run the grading pipeline, then
        # shape the outputs to match the wired Gradio components.
        # NOTE(review): the return arity differs depending on supabase_client
        # β€” keep every return in sync with the run_button.click wiring below.
        if not qp_file_obj or not ms_file_obj or not ans_file_obj:
            error_msg = "❌ Please upload all three files"
            if supabase_client:
                return error_msg, "", "", None, None, ""
            else:
                return error_msg, "", "", None, None
        # Process and upload input files (generates shared timestamp)
        qp_path, ms_path, ans_path, input_urls, run_timestamp = process_and_upload_input_files(
            qp_file_obj, ms_file_obj, ans_file_obj
        )
        # Run the grading pipeline (pass timestamp to keep all files together)
        qpms_text, as_text, grading_text, grading_pdf_path, imprinted_pdf_path, output_urls = align_and_grade_pipeline(
            qp_path, ms_path, ans_path, subject=subject_choice, imprint=imprint_flag, run_timestamp=run_timestamp
        )
        # Build URLs summary for the cloud-storage textbox (Supabase only).
        urls_summary = ""
        if supabase_client:
            urls_summary = f"πŸ“€ UPLOADED FILES (Timestamp: {run_timestamp}):\n\n"
            urls_summary += "INPUT FILES:\n"
            if input_urls.get("qp_url"):
                urls_summary += f"β€’ Question Paper: {input_urls['qp_url']}\n"
            if input_urls.get("ms_url"):
                urls_summary += f"β€’ Markscheme: {input_urls['ms_url']}\n"
            if input_urls.get("ans_url"):
                urls_summary += f"β€’ Answer Sheet: {input_urls['ans_url']}\n"
            urls_summary += "\nOUTPUT FILES:\n"
            if output_urls.get("graded_pdf_url"):
                urls_summary += f"β€’ Graded PDF: {output_urls['graded_pdf_url']}\n"
            if output_urls.get("imprinted_pdf_url"):
                urls_summary += f"β€’ Imprinted PDF: {output_urls['imprinted_pdf_url']}\n"
            urls_summary += f"\nπŸ“ All files stored in: examfiles/{run_timestamp}/\n"
            if not any(input_urls.values()) and not any(output_urls.values()):
                urls_summary += "\n⚠️ No files were uploaded to Supabase"
        if supabase_client:
            return (
                qpms_text or "",
                as_text or "",
                grading_text or "",
                grading_pdf_path,
                imprinted_pdf_path,
                urls_summary
            )
        else:
            return (
                qpms_text or "",
                as_text or "",
                grading_text or "",
                grading_pdf_path,
                imprinted_pdf_path
            )

    # Set up the click handler based on whether Supabase is enabled
    # (the extra output is the cloud-URLs textbox, which only exists then).
    if supabase_client:
        run_button.click(
            fn=run_pipeline,
            inputs=[qp_file, ms_file, ans_file, subject_dropdown, imprint_toggle],
            outputs=[qpms_box, as_box, grading_output_box, grading_pdf_file, imprint_pdf_file, file_urls_box]
        )
    else:
        run_button.click(
            fn=run_pipeline,
            inputs=[qp_file, ms_file, ans_file, subject_dropdown, imprint_toggle],
            outputs=[qpms_box, as_box, grading_output_box, grading_pdf_file, imprint_pdf_file]
        )
# Launch the Gradio app only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()