Spaces:

BinKhoaLe1812
/

EdSummariser

Running

App Files Files Community

EdSummariser / utils /service /pdf.py

LiamKhoaLe

Upd NVIDIA ana

7a1ebee about 2 months ago

raw

history blame contribute delete

38.2 kB

	"""
	PDF generation utilities for StudyBuddy
	"""
	import os
	import tempfile
	import markdown
	import re
	from datetime import datetime
	from typing import List, Dict
	from fastapi import HTTPException
	from utils.logger import get_logger
	from helpers.setup import gemini_rotator, nvidia_rotator

	logger = get_logger("PDF", __name__)


	def _sanitize_code_for_pdf(code_lines):
	    """Join fenced-code lines into XML-escaped, printable-ASCII text for XPreformatted."""
	    raw_code = '\n'.join(code_lines)
	    # Normalise line endings first so tab expansion sees real line starts.
	    raw_code = raw_code.replace('\r\n', '\n').replace('\r', '\n')
	    raw_code = raw_code.expandtabs(4)
	    # Keep only newline and printable ASCII; other characters render as squares.
	    raw_code = re.sub(r'[^\x0A\x20-\x7E]', '', raw_code)
	    # '&' must be escaped first so the entities added for '<' and '>' survive.
	    return raw_code.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')


	async def _parse_markdown_content(content: str, heading1_style, heading2_style, heading3_style, normal_style, code_style, user_id: str = "system"):
	    """
	    Convert markdown text into a list of ReportLab flowables.

	    Handles headings, fenced code blocks (with Mermaid rendering and a
	    code-block fallback), bullet and numbered lists, blockquotes,
	    horizontal rules and multi-line paragraphs.

	    Args:
	        content: Markdown source text.
	        heading1_style, heading2_style, heading3_style: Paragraph styles for
	            heading levels 1-3; deeper levels use ``normal_style``.
	        normal_style: Paragraph style for body text, lists and quotes.
	        code_style: Paragraph style for code blocks and their language header.
	        user_id: Forwarded to the Mermaid renderer. Optional so existing
	            callers that do not pass it keep working.

	    Returns:
	        List of flowables in document order.
	    """
	    from reportlab.platypus import Paragraph, Spacer, XPreformatted
	    from reportlab.lib.units import inch

	    heading_styles = {1: heading1_style, 2: heading2_style, 3: heading3_style}
	    numbered_item = re.compile(r'^(\d+)\.\s+(.*)$')
	    # A rule is a line made only of dashes or asterisks; "--- text" or
	    # "***bold italic***" must stay ordinary text.
	    horizontal_rule = re.compile(r'^(?:-{3,}|\*{3,})$')

	    def _starts_block(text):
	        """Return True when a stripped line opens a non-paragraph block."""
	        return (
	            text.startswith(('#', '```', '- ', '* ', '> '))
	            or numbered_item.match(text) is not None
	            or horizontal_rule.match(text) is not None
	        )

	    def _list_indent(raw_line):
	        """Build the visual indent for a list item from its unstripped line."""
	        expanded = raw_line.expandtabs(4)
	        depth = (len(expanded) - len(expanded.lstrip())) // 2
	        # Plain spaces are collapsed by Paragraph, so use non-breaking spaces.
	        return "&nbsp;" * (4 * depth)

	    story = []
	    lines = content.split('\n')
	    i = 0

	    while i < len(lines):
	        raw_line = lines[i]
	        line = raw_line.strip()

	        if not line:
	            story.append(Spacer(1, 6))
	            i += 1
	            continue

	        # Headers
	        if line.startswith('#'):
	            level = len(line) - len(line.lstrip('#'))
	            header_text = _format_inline_markdown(line.lstrip('# ').strip())
	            story.append(Paragraph(header_text, heading_styles.get(level, normal_style)))
	            i += 1
	            continue

	        # Fenced code blocks with language detection
	        if line.startswith('```'):
	            language = line[3:].strip() or 'text'
	            if language == 'text':
	                language = _detect_language_from_content(lines, i)

	            i += 1
	            code_lines = []
	            while i < len(lines) and not lines[i].strip().startswith('```'):
	                code_lines.append(lines[i])
	                i += 1
	            i += 1  # step over the closing fence (or past the end if unclosed)

	            if not code_lines:
	                continue

	            if language.lower() == 'mermaid':
	                try:
	                    import io
	                    from reportlab.platypus import Image
	                    from helpers.diagram import _render_mermaid_with_retry

	                    img_bytes = await _render_mermaid_with_retry('\n'.join(code_lines), user_id=user_id)
	                    if img_bytes:
	                        img = Image(io.BytesIO(img_bytes))
	                        # Fit inside the text frame (~6in wide); also cap the
	                        # height so a tall diagram cannot overflow the page.
	                        max_width = 6.0 * inch
	                        max_height = 9.0 * inch
	                        if img.drawWidth > 0 and img.drawHeight > 0:
	                            scale = min(
	                                max_width / float(img.drawWidth),
	                                max_height / float(img.drawHeight),
	                                1.0,
	                            )
	                            if scale < 1.0:
	                                img.drawWidth = img.drawWidth * scale
	                                img.drawHeight = img.drawHeight * scale
	                        story.append(img)
	                        story.append(Spacer(1, 12))
	                        continue
	                    logger.warning("[PDF] Mermaid render returned empty image after retries, falling back to code block")
	                except Exception as me:
	                    logger.warning(f"[PDF] Mermaid render failed after retries, falling back to code block: {me}")

	                # Fallback: show the mermaid source as a plain code block
	                story.append(Paragraph("<font color='#9aa5b1' size='8'>[MERMAID DIAGRAM]</font>", code_style))
	                story.append(XPreformatted(_sanitize_code_for_pdf(code_lines), code_style))
	                continue

	            highlighted = _apply_syntax_highlight(_sanitize_code_for_pdf(code_lines), language)
	            lang_header = f"<font color='#9aa5b1' size='8'>[{language.upper()}]</font>"
	            story.append(Paragraph(lang_header, code_style))
	            # XPreformatted preserves the code's spacing and line breaks
	            story.append(XPreformatted(highlighted, code_style))
	            continue

	        # Bullet lists (including nested)
	        if line.startswith(('- ', '* ')):
	            list_text = _format_inline_markdown(line[2:].strip())
	            story.append(Paragraph(f"{_list_indent(raw_line)}• {list_text}", normal_style))
	            i += 1
	            continue

	        # Numbered lists (including nested); keep the author's numbering
	        numbered = numbered_item.match(line)
	        if numbered:
	            list_text = _format_inline_markdown(numbered.group(2))
	            story.append(Paragraph(f"{_list_indent(raw_line)}{numbered.group(1)}. {list_text}", normal_style))
	            i += 1
	            continue

	        # Blockquotes
	        if line.startswith('> '):
	            quote_text = _format_inline_markdown(line[2:].strip())
	            story.append(Paragraph(f"<i>{quote_text}</i>", normal_style))
	            i += 1
	            continue

	        # Horizontal rules
	        if horizontal_rule.match(line):
	            story.append(Spacer(1, 12))
	            story.append(Paragraph("_" * 50, normal_style))
	            story.append(Spacer(1, 12))
	            i += 1
	            continue

	        # Regular paragraph: gather continuation lines until a blank line
	        # or the start of another block type.
	        paragraph_lines = [line]
	        i += 1
	        while i < len(lines):
	            next_line = lines[i].strip()
	            if not next_line or _starts_block(next_line):
	                break
	            paragraph_lines.append(next_line)
	            i += 1

	        formatted_text = _format_inline_markdown(' '.join(paragraph_lines))
	        story.append(Paragraph(formatted_text, normal_style))

	    return story


	def _detect_language_from_content(lines: list, start_index: int) -> str:
	"""
	Auto-detect programming language from code content
	"""
	# Look at the next few lines to detect language
	sample_lines = []
	for i in range(start_index + 1, min(start_index + 10, len(lines))):
	if lines[i].strip().startswith('```'):
	break
	sample_lines.append(lines[i])

	sample_text = '\n'.join(sample_lines)

	# Python detection
	if (re.search(r'\bdef\s+\w+', sample_text) or
	re.search(r'\bclass\s+\w+', sample_text) or
	re.search(r'\bimport\s+\w+', sample_text) or
	re.search(r'\bfrom\s+\w+', sample_text)):
	return 'python'

	# JavaScript detection
	if (re.search(r'\bfunction\s+\w+', sample_text) or
	re.search(r'\bvar\s+\w+', sample_text) or
	re.search(r'\blet\s+\w+', sample_text) or
	re.search(r'\bconst\s+\w+', sample_text) or
	re.search(r'=>', sample_text)):
	return 'javascript'

	# Java detection
	if (re.search(r'\bpublic\s+class', sample_text) or
	re.search(r'\bprivate\s+\w+', sample_text) or
	re.search(r'\bSystem\.out\.print', sample_text) or
	re.search(r'\bimport\s+java\.', sample_text)):
	return 'java'

	# JSON detection
	if (re.search(r'^\s*[{}]', sample_text) or
	re.search(r'"[^"]"\s:', sample_text) or
	re.search(r'\btrue\b\|\bfalse\b\|\bnull\b', sample_text)):
	return 'json'

	# XML/HTML detection
	if (re.search(r'<[^>]+>', sample_text) or
	re.search(r'<[^>]+>', sample_text)):
	return 'xml'

	# SQL detection
	if (re.search(r'\bSELECT\b', sample_text, re.IGNORECASE) or
	re.search(r'\bFROM\b', sample_text, re.IGNORECASE) or
	re.search(r'\bWHERE\b', sample_text, re.IGNORECASE) or
	re.search(r'\bINSERT\b', sample_text, re.IGNORECASE)):
	return 'sql'

	# YAML detection
	if (re.search(r'^\s*\w+:', sample_text) or
	re.search(r'^\s*-\s+', sample_text)):
	return 'yaml'

	# Bash detection
	if (re.search(r'^\s*#!', sample_text) or
	re.search(r'\$\w+', sample_text) or
	re.search(r'^\s\w+.\\|', sample_text)):
	return 'bash'

	return 'text'


	def _format_code_block(code_text: str, language: str) -> str:
	"""
	Deprecated: We now render code blocks with Preformatted to avoid paragraph parser errors.
	Kept for compatibility if referenced elsewhere; returns escaped plain text.
	"""
	code_text = code_text.replace('&', '&').replace('<', '<').replace('>', '>')
	return f"<font name='Courier' size='9'>{code_text}</font>"


	def _highlight_python(code: str) -> str:
	"""Python syntax highlighting"""
	# Keywords
	keywords = ['def', 'class', 'if', 'else', 'elif', 'for', 'while', 'try', 'except', 'finally',
	'import', 'from', 'as', 'with', 'return', 'yield', 'lambda', 'and', 'or', 'not',
	'in', 'is', 'True', 'False', 'None', 'pass', 'break', 'continue', 'raise', 'assert']

	# Built-in functions
	builtins = ['print', 'len', 'str', 'int', 'float', 'list', 'dict', 'tuple', 'set', 'range',
	'enumerate', 'zip', 'map', 'filter', 'sorted', 'reversed', 'open', 'input']

	# String literals
	code = re.sub(r'("""[\s\S]*?""")', r'<font color="#008000">\1</font>', code) # Triple quotes
	code = re.sub(r'(".*?")', r'<font color="#008000">\1</font>', code) # Double quotes
	code = re.sub(r"('''[\s\S]*?''')", r'<font color="#008000">\1</font>', code) # Triple single quotes
	code = re.sub(r"('.*?')", r'<font color="#008000">\1</font>', code) # Single quotes

	# Comments
	code = re.sub(r'(#.*?)$', r'<font color="#808080">\1</font>', code, flags=re.MULTILINE)

	# Keywords
	for keyword in keywords:
	code = re.sub(r'\b(' + keyword + r')\b', r'<font color="#0000FF"><b>\1</b></font>', code)

	# Built-in functions
	for builtin in builtins:
	code = re.sub(r'\b(' + builtin + r')\b', r'<font color="#800080">\1</font>', code)

	# Numbers
	code = re.sub(r'\b(\d+\.?\d*)\b', r'<font color="#FF0000">\1</font>', code)

	return f"<font name='Courier' size='9'>{code}</font>"


	def _highlight_json(code: str) -> str:
	"""JSON syntax highlighting"""
	# Strings
	code = re.sub(r'(".*?")', r'<font color="#008000">\1</font>', code)

	# Numbers
	code = re.sub(r'\b(\d+\.?\d*)\b', r'<font color="#FF0000">\1</font>', code)

	# Keywords
	code = re.sub(r'\b(true\|false\|null)\b', r'<font color="#0000FF"><b>\1</b></font>', code)

	# Punctuation
	code = re.sub(r'([{}[\]])', r'<font color="#800080"><b>\1</b></font>', code)
	code = re.sub(r'([,])', r'<font color="#800080">\1</font>', code)

	return f"<font name='Courier' size='9'>{code}</font>"


	def _highlight_xml(code: str) -> str:
	"""XML/HTML syntax highlighting"""
	# Tags
	code = re.sub(r'(<[^>]*>)', r'<font color="#0000FF"><b>\1</b></font>', code)

	# Attributes
	code = re.sub(r'(\w+)=', r'<font color="#800080">\1</font>=', code)

	# Attribute values
	code = re.sub(r'="([^"]*)"', r'="<font color="#008000">\1</font>"', code)

	# Comments
	code = re.sub(r'(<!--[\s\S]*?-->)', r'<font color="#808080">\1</font>', code)

	return f"<font name='Courier' size='9'>{code}</font>"


	def _highlight_java(code: str) -> str:
	"""Java syntax highlighting"""
	# Keywords
	keywords = ['public', 'private', 'protected', 'static', 'final', 'class', 'interface', 'extends',
	'implements', 'if', 'else', 'for', 'while', 'do', 'switch', 'case', 'break', 'continue',
	'return', 'try', 'catch', 'finally', 'throw', 'throws', 'new', 'this', 'super', 'import',
	'package', 'void', 'int', 'long', 'float', 'double', 'boolean', 'char', 'byte', 'short',
	'true', 'false', 'null']

	# String literals
	code = re.sub(r'(".*?")', r'<font color="#008000">\1</font>', code)
	code = re.sub(r"('.*?')", r'<font color="#008000">\1</font>', code)

	# Comments
	code = re.sub(r'(//.*?)$', r'<font color="#808080">\1</font>', code, flags=re.MULTILINE)
	code = re.sub(r'(/\[\s\S]?\*/)', r'<font color="#808080">\1</font>', code)

	# Keywords
	for keyword in keywords:
	code = re.sub(r'\b(' + keyword + r')\b', r'<font color="#0000FF"><b>\1</b></font>', code)

	# Numbers
	code = re.sub(r'\b(\d+\.?\d*[fFdDlL]?)\b', r'<font color="#FF0000">\1</font>', code)

	return f"<font name='Courier' size='9'>{code}</font>"


	def _highlight_javascript(code: str) -> str:
	"""JavaScript syntax highlighting"""
	# Keywords
	keywords = ['function', 'var', 'let', 'const', 'if', 'else', 'for', 'while', 'do', 'switch',
	'case', 'break', 'continue', 'return', 'try', 'catch', 'finally', 'throw', 'new',
	'this', 'typeof', 'instanceof', 'true', 'false', 'null', 'undefined', 'async', 'await']

	# String literals
	code = re.sub(r'(".*?")', r'<font color="#008000">\1</font>', code)
	code = re.sub(r"('.*?')", r'<font color="#008000">\1</font>', code)
	code = re.sub(r'(`.*?`)', r'<font color="#008000">\1</font>', code) # Template literals

	# Comments
	code = re.sub(r'(//.*?)$', r'<font color="#808080">\1</font>', code, flags=re.MULTILINE)
	code = re.sub(r'(/\[\s\S]?\*/)', r'<font color="#808080">\1</font>', code)

	# Keywords
	for keyword in keywords:
	code = re.sub(r'\b(' + keyword + r')\b', r'<font color="#0000FF"><b>\1</b></font>', code)

	# Numbers
	code = re.sub(r'\b(\d+\.?\d*)\b', r'<font color="#FF0000">\1</font>', code)

	return f"<font name='Courier' size='9'>{code}</font>"


	def _highlight_sql(code: str) -> str:
	"""SQL syntax highlighting"""
	# Keywords
	keywords = ['SELECT', 'FROM', 'WHERE', 'INSERT', 'UPDATE', 'DELETE', 'CREATE', 'DROP', 'ALTER',
	'TABLE', 'INDEX', 'VIEW', 'DATABASE', 'SCHEMA', 'JOIN', 'LEFT', 'RIGHT', 'INNER', 'OUTER',
	'ON', 'GROUP', 'BY', 'ORDER', 'HAVING', 'UNION', 'DISTINCT', 'COUNT', 'SUM', 'AVG', 'MAX', 'MIN',
	'AND', 'OR', 'NOT', 'IN', 'BETWEEN', 'LIKE', 'IS', 'NULL', 'ASC', 'DESC', 'LIMIT', 'OFFSET']

	# String literals
	code = re.sub(r"('.*?')", r'<font color="#008000">\1</font>', code)

	# Comments
	code = re.sub(r'(--.*?)$', r'<font color="#808080">\1</font>', code, flags=re.MULTILINE)
	code = re.sub(r'(/\[\s\S]?\*/)', r'<font color="#808080">\1</font>', code)

	# Keywords (case insensitive)
	for keyword in keywords:
	code = re.sub(r'\b(' + keyword + r')\b', r'<font color="#0000FF"><b>\1</b></font>', code, flags=re.IGNORECASE)

	# Numbers
	code = re.sub(r'\b(\d+\.?\d*)\b', r'<font color="#FF0000">\1</font>', code)

	return f"<font name='Courier' size='9'>{code}</font>"


	def _highlight_yaml(code: str) -> str:
	"""YAML syntax highlighting"""
	# Keys
	code = re.sub(r'^(\s*)([^:]+):', r'\1<font color="#0000FF"><b>\2</b></font>:', code, flags=re.MULTILINE)

	# String values
	code = re.sub(r'(".*?")', r'<font color="#008000">\1</font>', code)
	code = re.sub(r"('.*?')", r'<font color="#008000">\1</font>', code)

	# Numbers
	code = re.sub(r'\b(\d+\.?\d*)\b', r'<font color="#FF0000">\1</font>', code)

	# Booleans
	code = re.sub(r'\b(true\|false\|yes\|no\|on\|off)\b', r'<font color="#800080"><b>\1</b></font>', code)

	# Comments
	code = re.sub(r'(#.*?)$', r'<font color="#808080">\1</font>', code, flags=re.MULTILINE)

	return f"<font name='Courier' size='9'>{code}</font>"


	def _highlight_bash(code: str) -> str:
	"""Bash/Shell syntax highlighting"""
	# Comments
	code = re.sub(r'(#.*?)$', r'<font color="#808080">\1</font>', code, flags=re.MULTILINE)

	# Commands (first word on line)
	code = re.sub(r'^(\s)([a-zA-Z_][a-zA-Z0-9_])', r'\1<font color="#0000FF"><b>\2</b></font>', code, flags=re.MULTILINE)

	# Variables
	code = re.sub(r'(\$[a-zA-Z_][a-zA-Z0-9_]*)', r'<font color="#800080">\1</font>', code)
	code = re.sub(r'(\$\{[^}]+\})', r'<font color="#800080">\1</font>', code)

	# Strings
	code = re.sub(r'(".*?")', r'<font color="#008000">\1</font>', code)
	code = re.sub(r"('.*?')", r'<font color="#008000">\1</font>', code)

	# Redirections and pipes
	code = re.sub(r'([<>\|&])', r'<font color="#FF0000"><b>\1</b></font>', code)

	return f"<font name='Courier' size='9'>{code}</font>"


	def _format_inline_markdown(text: str) -> str:
	"""
	Format inline markdown elements (bold, italic, code, links)
	"""
	# Escape HTML characters first
	text = text.replace('&', '&')
	text = text.replace('<', '<')
	text = text.replace('>', '>')

	# Process in order of precedence to avoid nested tag conflicts
	# 1. Inline code (`code`) - highest precedence, no nested formatting
	text = re.sub(r'`([^`]+)`', r'<font name="Courier" size="9">\1</font>', text)

	# 2. Bold text (text or __text__) - but not inside code blocks
	text = re.sub(r'(?<!`)\\([^]+)\\*(?!`)', r'<b>\1</b>', text)
	text = re.sub(r'(?<!`)__(?!_)([^_]+)__(?!`)', r'<b>\1</b>', text)

	# 3. Italic text (text or _text_) - but not inside code blocks or bold
	text = re.sub(r'(?<!`)(?<!\)\([^]+)\(?!\*)(?!`)', r'<i>\1</i>', text)
	text = re.sub(r'(?<!`)(?<!_)_([^_]+)_(?!_)(?!`)', r'<i>\1</i>', text)

	# 4. Strikethrough (~~text~~) - but not inside other formatting
	text = re.sub(r'~~([^~]+)~~', r'<strike>\1</strike>', text)

	# 5. Links [text](url) - convert to clickable text
	text = re.sub(r'\[([^\]]+)\]$([^)]+)$', r'<link href="\2">\1</link>', text)

	# 6. Line breaks
	text = text.replace('\n', '<br/>')

	return text


	def _apply_syntax_highlight(escaped_code: str, language: str) -> str:
	"""
	Apply professional IDE-like syntax highlighting on XML-escaped code text.
	Works with escaped entities (< > &), so regexes should not rely on raw quotes.
	"""
	def sub_outside_tags(pattern, repl, text, flags=0):
	parts = re.split(r'(</?[^>]+>)', text)
	for idx in range(0, len(parts)):
	if idx % 2 == 0: # outside tags
	parts[idx] = re.sub(pattern, repl, parts[idx], flags=flags)
	return ''.join(parts)

	out = escaped_code
	lang = (language or 'text').lower()

	if lang in ('python', 'py'):
	# Comments first (gray)
	out = sub_outside_tags(r"(#[^\n]*)", r"<font color='#6a737d'>\1</font>", out)
	# Docstrings (green)
	out = sub_outside_tags(r'("""[\s\S]*?""")', r"<font color='#28a745'>\1</font>", out)
	out = sub_outside_tags(r"('''[\s\S]*?''')", r"<font color='#28a745'>\1</font>", out)
	# Keywords (purple)
	keywords = (
	'def\|class\|if\|else\|elif\|for\|while\|try\|except\|finally\|import\|from\|as\|with\|return\|yield\|lambda\|and\|or\|not\|in\|is\|True\|False\|None\|pass\|break\|continue\|raise\|assert\|global\|nonlocal'
	)
	out = sub_outside_tags(rf"\b({keywords})\b", r"<font color='#6f42c1'><b>\1</b></font>", out)
	# Built-in functions (blue)
	builtins = (
	'print\|len\|str\|int\|float\|list\|dict\|tuple\|set\|range\|enumerate\|zip\|map\|filter\|sorted\|reversed\|open\|input\|type\|isinstance\|hasattr\|getattr\|setattr\|delattr'
	)
	out = sub_outside_tags(rf"\b({builtins})\b", r"<font color='#005cc5'>\1</font>", out)

	elif lang in ('javascript', 'js', 'typescript', 'ts'):
	# Comments (gray)
	out = sub_outside_tags(r"(//[^\n]*)", r"<font color='#6a737d'>\1</font>", out)
	out = sub_outside_tags(r"/\[\s\S]?\*/", lambda m: f"<font color='#6a737d'>{m.group(0)}</font>", out)
	# Keywords (purple)
	keywords = (
	'function\|var\|let\|const\|if\|else\|for\|while\|do\|switch\|case\|break\|continue\|return\|try\|catch\|finally\|throw\|new\|this\|typeof\|instanceof\|true\|false\|null\|undefined\|async\|await\|class\|extends\|implements\|interface\|type\|namespace\|module\|export\|import\|default\|public\|private\|protected\|static\|abstract\|readonly'
	)
	out = sub_outside_tags(rf"\b({keywords})\b", r"<font color='#6f42c1'><b>\1</b></font>", out)
	# Built-in objects (blue)
	builtins = (
	'console\|window\|document\|Array\|Object\|String\|Number\|Boolean\|Date\|Math\|JSON\|Promise\|Set\|Map\|WeakSet\|WeakMap\|Symbol\|Proxy\|Reflect'
	)
	out = sub_outside_tags(rf"\b({builtins})\b", r"<font color='#005cc5'>\1</font>", out)

	elif lang in ('json',):
	# Boolean and null values (blue)
	out = sub_outside_tags(r"\b(true\|false\|null)\b", r"<font color='#005cc5'><b>\1</b></font>", out)
	# Keys (purple)
	out = sub_outside_tags(r"("[^&]?")(\s:)", r"<font color='#6f42c1'>\1</font>\2", out)

	elif lang in ('bash', 'sh', 'shell'):
	# Comments (gray)
	out = sub_outside_tags(r"(#[^\n]*)", r"<font color='#6a737d'>\1</font>", out)
	# Commands (purple)
	out = sub_outside_tags(r"(^\|\n)(\s)([a-zA-Z_][a-zA-Z0-9_-])", r"\1\2<font color='#6f42c1'><b>\3</b></font>", out)
	# Variables (blue)
	out = sub_outside_tags(r"(\$[a-zA-Z_][a-zA-Z0-9_]*)", r"<font color='#005cc5'>\1</font>", out)
	out = sub_outside_tags(r"(\$\{[^}]+\})", r"<font color='#005cc5'>\1</font>", out)

	elif lang in ('yaml', 'yml'):
	# Keys (purple)
	out = sub_outside_tags(r"(^\|\n)(\s*)([^:\n]+)(:)", r"\1\2<font color='#6f42c1'>\3</font>\4", out)
	# Boolean values (blue)
	out = sub_outside_tags(r"\b(true\|false\|yes\|no\|on\|off)\b", r"<font color='#005cc5'><b>\1</b></font>", out, flags=re.IGNORECASE)

	elif lang in ('sql',):
	# Keywords (purple)
	keywords = (
	'SELECT\|FROM\|WHERE\|INSERT\|UPDATE\|DELETE\|CREATE\|DROP\|ALTER\|TABLE\|INDEX\|VIEW\|DATABASE\|SCHEMA\|JOIN\|LEFT\|RIGHT\|INNER\|OUTER\|ON\|GROUP\|BY\|ORDER\|HAVING\|UNION\|DISTINCT\|COUNT\|SUM\|AVG\|MAX\|MIN\|AND\|OR\|NOT\|IN\|BETWEEN\|LIKE\|IS\|NULL\|ASC\|DESC\|LIMIT\|OFFSET\|CASE\|WHEN\|THEN\|ELSE\|END\|EXISTS\|ALL\|ANY\|SOME'
	)
	out = sub_outside_tags(rf"\b({keywords})\b", r"<font color='#6f42c1'><b>\1</b></font>", out, flags=re.IGNORECASE)

	elif lang in ('java',):
	# Comments (gray)
	out = sub_outside_tags(r"(//[^\n]*)", r"<font color='#6a737d'>\1</font>", out)
	out = sub_outside_tags(r"/\[\s\S]?\*/", lambda m: f"<font color='#6a737d'>{m.group(0)}</font>", out)
	# Keywords (purple)
	keywords = (
	'public\|private\|protected\|static\|final\|class\|interface\|extends\|implements\|if\|else\|for\|while\|do\|switch\|case\|break\|continue\|return\|try\|catch\|finally\|throw\|throws\|new\|this\|super\|import\|package\|void\|int\|long\|float\|double\|boolean\|char\|byte\|short\|true\|false\|null\|abstract\|native\|synchronized\|volatile\|transient\|strictfp'
	)
	out = sub_outside_tags(rf"\b({keywords})\b", r"<font color='#6f42c1'><b>\1</b></font>", out)
	# Built-in classes (blue)
	builtins = (
	'String\|Object\|Integer\|Long\|Float\|Double\|Boolean\|Character\|Byte\|Short\|System\|Math\|ArrayList\|HashMap\|HashSet\|LinkedList\|Vector\|Collections\|Arrays'
	)
	out = sub_outside_tags(rf"\b({builtins})\b", r"<font color='#005cc5'>\1</font>", out)

	elif lang in ('css',):
	# Selectors (purple)
	out = sub_outside_tags(r"([.#]?[a-zA-Z][a-zA-Z0-9_-])(\s\{)", r"<font color='#6f42c1'>\1</font>\2", out)
	# Properties (blue)
	out = sub_outside_tags(r"([a-zA-Z-]+)(\s*:)", r"<font color='#005cc5'>\1</font>\2", out)
	# Values (green)
	out = sub_outside_tags(r"(\s:\s)([^;]+)(;)", r"\1<font color='#28a745'>\2</font>\3", out)

	elif lang in ('html', 'xml'):
	# Tags (purple)
	out = sub_outside_tags(r"(<[^>]*>)", r"<font color='#6f42c1'><b>\1</b></font>", out)
	# Attributes (blue)
	out = sub_outside_tags(r"(\w+)=("[^&]*?")", r"<font color='#005cc5'>\1</font>=\2", out)

	# Strings (green) - apply to all languages
	out = sub_outside_tags(r"(".*?")", r"<font color='#28a745'>\1</font>", out)
	out = sub_outside_tags(r"('.*?')", r"<font color='#28a745'>\1</font>", out)
	out = sub_outside_tags(r"(`.*?`)", r"<font color='#28a745'>\1</font>", out)

	# Numbers (orange) - apply to all languages
	out = sub_outside_tags(r"\b(\d+\.?\d*)\b", r"<font color='#e36209'>\1</font>", out)

	return out


	def _render_mermaid_png(mermaid_text: str) -> bytes:
	    """
	    Render mermaid code to PNG via the Kroki service (no local mermaid-cli dependency).

	    The diagram is only sent when its first meaningful line (ignoring blank
	    lines and '%%' comment/directive lines) starts with a known diagram
	    type. The comparison is case-insensitive so 'gitGraph' is accepted.

	    Args:
	        mermaid_text: Mermaid diagram source.

	    Returns:
	        PNG bytes, or empty bytes on empty/unrecognised input or any failure.
	    """
	    # Imported outside the try block so the except clauses below can always
	    # resolve urllib.error.
	    import json
	    import urllib.error
	    import urllib.request

	    if not mermaid_text or not mermaid_text.strip():
	        logger.warning("[PDF] Empty mermaid content")
	        return b""

	    cleaned_text = mermaid_text.strip()

	    known_types = (
	        'graph', 'flowchart', 'sequencediagram', 'classdiagram', 'statediagram',
	        'erdiagram', 'journey', 'gantt', 'pie', 'gitgraph', 'mindmap', 'timeline',
	        'quadrantchart', 'requirementdiagram', 'c4',
	    )
	    first_statement = next(
	        (
	            candidate.strip()
	            for candidate in cleaned_text.splitlines()
	            if candidate.strip() and not candidate.strip().startswith('%%')
	        ),
	        '',
	    )
	    if not first_statement.lower().startswith(known_types):
	        logger.warning(f"[PDF] Invalid mermaid diagram type: {cleaned_text[:50]}...")
	        return b""

	    try:
	        # Kroki POST API for mermaid -> png
	        data = json.dumps({"diagram_source": cleaned_text}).encode("utf-8")
	        req = urllib.request.Request(
	            url="https://kroki.io/mermaid/png",
	            data=data,
	            headers={"Content-Type": "application/json"},
	            method="POST"
	        )

	        with urllib.request.urlopen(req, timeout=15) as resp:
	            if resp.status == 200:
	                return resp.read()
	            logger.warning(f"[PDF] Kroki returned status {resp.status}")
	            return b""

	    except urllib.error.HTTPError as e:
	        # HTTPError is a URLError subclass, so it has to be handled first.
	        if e.code == 400:
	            logger.warning(f"[PDF] Kroki mermaid syntax error (400): {e.reason}")
	        else:
	            logger.warning(f"[PDF] Kroki HTTP error {e.code}: {e.reason}")
	    except urllib.error.URLError as e:
	        logger.warning(f"[PDF] Kroki connection error: {e.reason}")
	    except Exception as e:
	        logger.warning(f"[PDF] Kroki mermaid render error: {e}")

	    return b""


	def _basic_ieee_references(sources: List[Dict]) -> List[str]:
	    """Build simple IEEE-like reference strings locally, without calling any model."""
	    references = []
	    for number, source in enumerate(sources, 1):
	        if source.get("kind") == "web":
	            ref = f"[{number}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Web Source')}\", {source.get('url', '')}, accessed: {datetime.now().strftime('%B %d, %Y')}."
	        else:
	            ref = f"[{number}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Document')}\", Document, {datetime.now().year}."
	        references.append(ref)
	    return references


	async def _format_references_ieee(sources: List[Dict]) -> List[str]:
	    """
	    Format sources in IEEE citation style using the NVIDIA API.

	    Falls back to locally built basic references when the NVIDIA rotator
	    is not configured, when the model returns no line starting with '[',
	    or when anything raises.

	    Args:
	        sources: Source dicts; the keys read are 'filename', 'url',
	            'topic_name' and 'kind'.

	    Returns:
	        One reference string per line of model output that starts with
	        '[', or the basic references; an empty list when ``sources`` is empty.
	    """
	    if not sources:
	        return []

	    try:
	        from utils.api.router import generate_answer_with_model
	        from helpers.setup import nvidia_rotator

	        # Without a rotator the model cannot be called; still list the
	        # sources instead of returning an empty section.
	        if not nvidia_rotator:
	            return _basic_ieee_references(sources)

	        # Prepare source data for formatting
	        source_data = [
	            {
	                "number": number,
	                "filename": source.get("filename", "Unknown"),
	                "url": source.get("url", ""),
	                "topic_name": source.get("topic_name", ""),
	                "kind": source.get("kind", "document"),
	            }
	            for number, source in enumerate(sources, 1)
	        ]

	        # Built from explicit pieces so the prompt text does not depend on
	        # how this function is indented.
	        sys_prompt = (
	            "You are an expert at formatting academic references in IEEE style.\n"
	            "Format the provided sources as IEEE-style references. Each reference should be numbered and formatted according to IEEE standards.\n"
	            "\n"
	            "For web sources: [1] Author/Organization, \"Title,\" Website Name, URL, accessed: Date.\n"
	            "For documents: [1] Author, \"Title,\" Document Type, Filename, Year.\n"
	            "\n"
	            "Return only the formatted references, one per line, numbered sequentially."
	        )

	        user_prompt = f"Format these sources in IEEE style:\n\n{source_data}"

	        selection = {"provider": "nvidia", "model": os.getenv("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct")}
	        response = await generate_answer_with_model(selection, sys_prompt, user_prompt, gemini_rotator, nvidia_rotator, user_id="system", context="pdf_citation")

	        # Keep only the lines that look like numbered references
	        stripped_lines = (candidate.strip() for candidate in (response or "").split('\n'))
	        references = [candidate for candidate in stripped_lines if candidate.startswith('[')]

	        return references or _basic_ieee_references(sources)

	    except Exception as e:
	        logger.warning(f"[PDF] IEEE reference formatting failed: {e}")
	        return _basic_ieee_references(sources)


	async def generate_report_pdf(report_content: str, user_id: str, project_id: str, sources: List[Dict] = None) -> bytes:
	    """
	    Generate a PDF from report content using reportlab

	    Args:
	        report_content: Markdown content of the report
	        user_id: User ID for logging
	        project_id: Project ID for logging
	        sources: Optional source dicts; when given, a "References" page is
	            appended after the report body

	    Returns:
	        PDF content as bytes

	    Raises:
	        HTTPException: 500 if reportlab is missing or PDF generation fails
	    """
	    try:
	        from io import BytesIO
	        from xml.sax.saxutils import escape
	        from reportlab.lib import colors
	        from reportlab.lib.pagesizes import A4
	        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
	        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
	    except ImportError as import_error:
	        logger.error("[PDF] reportlab not installed. Install with: pip install reportlab")
	        raise HTTPException(500, detail="PDF generation not available. Please install reportlab.") from import_error

	    def _basic_reference(number, source):
	        """Build one simple IEEE-like reference string without calling any model."""
	        if source.get("kind") == "web":
	            return f"[{number}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Web Source')}\", {source.get('url', '')}, accessed: {datetime.now().strftime('%B %d, %Y')}."
	        return f"[{number}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Document')}\", Document, {datetime.now().year}."

	    try:
	        logger.info(f"[PDF] Generating PDF for user {user_id}, project {project_id}")

	        # The PDF is assembled in memory
	        buffer = BytesIO()
	        doc = SimpleDocTemplate(
	            buffer,
	            pagesize=A4,
	            rightMargin=72,
	            leftMargin=72,
	            topMargin=72,
	            bottomMargin=18
	        )

	        styles = getSampleStyleSheet()
	        heading_color = colors.HexColor('#2c3e50')

	        title_style = ParagraphStyle(
	            'CustomTitle',
	            parent=styles['Heading1'],
	            fontSize=24,
	            spaceAfter=30,
	            textColor=heading_color,
	            borderWidth=1,
	            borderColor=colors.HexColor('#3498db'),
	            borderPadding=10
	        )
	        heading1_style = ParagraphStyle(
	            'CustomHeading1',
	            parent=styles['Heading1'],
	            fontSize=18,
	            spaceAfter=12,
	            spaceBefore=20,
	            textColor=heading_color
	        )
	        heading2_style = ParagraphStyle(
	            'CustomHeading2',
	            parent=styles['Heading2'],
	            fontSize=16,
	            spaceAfter=10,
	            spaceBefore=16,
	            textColor=heading_color
	        )
	        heading3_style = ParagraphStyle(
	            'CustomHeading3',
	            parent=styles['Heading3'],
	            fontSize=14,
	            spaceAfter=8,
	            spaceBefore=12,
	            textColor=heading_color
	        )
	        normal_style = ParagraphStyle(
	            'CustomNormal',
	            parent=styles['Normal'],
	            fontSize=11,
	            spaceAfter=6,
	            leading=14
	        )

	        # IDE-like code styling: dark text, thin border, no background
	        base_code_parent = styles['Code'] if 'Code' in styles.byName else styles['Normal']
	        code_style = ParagraphStyle(
	            'Code',
	            parent=base_code_parent,
	            fontSize=9,
	            fontName='Courier',
	            textColor=heading_color,
	            backColor=None,
	            borderColor=colors.HexColor('#e1e8ed'),
	            borderWidth=1,
	            borderPadding=8,
	            leftIndent=12,
	            rightIndent=12,
	            spaceBefore=6,
	            spaceAfter=6,
	            leading=11
	        )

	        # Title block
	        story = [
	            Paragraph("StudyBuddy Report", title_style),
	            Paragraph(f"<i>Generated on {datetime.now().strftime('%B %d, %Y at %I:%M %p')}</i>", normal_style),
	            Spacer(1, 20),
	        ]

	        # Report body
	        story.extend(await _parse_markdown_content(report_content, heading1_style, heading2_style, heading3_style, normal_style, code_style))

	        # References section, only when sources are provided
	        if sources:
	            story.append(PageBreak())
	            story.append(Paragraph("References", heading1_style))
	            story.append(Spacer(1, 12))

	            try:
	                ieee_references = await _format_references_ieee(sources)
	            except Exception as _ie:
	                logger.warning(f"[PDF] Reference formatting failed, falling back: {_ie}")
	                ieee_references = []

	            # Never leave a bare heading: list the sources in basic form
	            # when the formatter produced nothing.
	            if not ieee_references:
	                ieee_references = [_basic_reference(number, source) for number, source in enumerate(sources, 1)]

	            for ref in ieee_references:
	                # References contain titles and URLs; an unescaped '&' or
	                # '<' would make Paragraph reject the text and fail the PDF.
	                story.append(Paragraph(escape(str(ref)), normal_style))
	                story.append(Spacer(1, 6))

	        doc.build(story)

	        pdf_content = buffer.getvalue()
	        buffer.close()

	        logger.info(f"[PDF] Successfully generated PDF ({len(pdf_content)} bytes) for user {user_id}, project {project_id}")
	        return pdf_content

	    except Exception as e:
	        logger.error(f"[PDF] Failed to generate PDF: {e}")
	        # Keep error generic for client; avoid leaking internals
	        raise HTTPException(500, detail="Failed to generate PDF") from e