Spaces:

gaurv007
/

ClauseGuard

Sleeping

App Files Files Community

ClauseGuard / ocr_engine.py

gaurv007

v4.0: Add ocr_engine.py — OCR + RAG Chatbot + Clause Redlining

b5350d6 verified 15 days ago

raw

history blame

8.26 kB

	"""
	ClauseGuard — OCR Engine v1.0
	═════════════════════════════
	Smart PDF Router: detects native vs scanned PDFs.
	• Native PDF → pdfplumber (fast, existing)
	• Scanned PDF → docTR OCR (CPU-friendly, ~150MB models)

	Architecture:
	    PDF uploaded
	         ↓
	    [_is_scanned_pdf] — pdfplumber gets <50 chars/page?
	         ↓ no                 ↓ yes
	    Native PDF            Scanned PDF
	         ↓                    ↓
	    pdfplumber            docTR OCR (CPU)
	         ↓                    ↓
	    Contract text → existing analysis pipeline
	"""

	import os
	import re

	# ── docTR (soft-fail) ───────────────────────────────────────────────
	# Cached predictor instance; stays None until _load_ocr_model() builds one.
	_ocr_predictor = None

	try:
	    from doctr.io import DocumentFile
	    from doctr.models import ocr_predictor as _make_predictor
	except ImportError:
	    # python-doctr is optional; record its absence instead of failing import.
	    _HAS_DOCTR = False
	else:
	    _HAS_DOCTR = True

	# ── pdfplumber (soft-fail) ──────────────────────────────────────────
	try:
	    import pdfplumber
	except ImportError:
	    # pdfplumber is optional too; callers check _HAS_PDF before using it.
	    _HAS_PDF = False
	else:
	    _HAS_PDF = True

	# ═══════════════════════════════════════════════════════════════════════
	# OCR MODEL LOADING
	# ═══════════════════════════════════════════════════════════════════════

	# Text record of the most recent load attempt ("not_loaded" until one runs).
	_ocr_status = "not_loaded"


	def _load_ocr_model():
	    """Return the shared docTR predictor, building it on first use.

	    The predictor is cached in the module-level ``_ocr_predictor`` so that
	    later calls reuse it. ``_ocr_status`` is updated to describe the outcome
	    of the attempt.

	    Returns:
	        The predictor object, or ``None`` when python-doctr is not installed
	        or building the predictor raised an exception.
	    """
	    global _ocr_predictor, _ocr_status

	    if _ocr_predictor is None:
	        if not _HAS_DOCTR:
	            _ocr_status = "unavailable (python-doctr not installed)"
	            return None

	        model_options = {
	            "det_arch": "fast_base",
	            "reco_arch": "crnn_vgg16_bn",
	            "pretrained": True,
	            "assume_straight_pages": True,
	        }
	        try:
	            print("[ClauseGuard OCR] Loading docTR models (fast_base + crnn_vgg16_bn)...")
	            _ocr_predictor = _make_predictor(**model_options)
	            _ocr_status = "loaded"
	            print("[ClauseGuard OCR] docTR models loaded successfully")
	        except Exception as exc:
	            # Best-effort: a failed load is recorded and reported, never raised.
	            _ocr_status = f"failed: {exc}"
	            print(f"[ClauseGuard OCR] docTR load failed: {exc}")
	            return None

	    return _ocr_predictor


	def get_ocr_status():
	    """Return a one-line, human-readable OCR engine status.

	    Returns:
	        One of:
	        * a "loaded" message when a predictor is cached,
	        * an "unavailable" message when python-doctr could not be imported,
	        * a "load failed" message carrying the recorded error when a
	          previous load attempt raised,
	        * an "available (not yet loaded)" message otherwise.
	    """
	    if _ocr_predictor is not None:
	        return "✅ OCR: docTR loaded"
	    if not _HAS_DOCTR:
	        return "❌ OCR: unavailable (python-doctr not installed)"
	    # _load_ocr_model() stores "failed: <error>" in _ocr_status when the
	    # build raises; report that instead of claiming the engine is merely idle.
	    if _ocr_status.startswith("failed"):
	        return f"❌ OCR: docTR load {_ocr_status}"
	    return "⏳ OCR: docTR available (not yet loaded)"


	# ═══════════════════════════════════════════════════════════════════════
	# SMART PDF ROUTER
	# ═══════════════════════════════════════════════════════════════════════

	def _is_scanned_pdf(file_path, min_chars_per_page=50):
	    """Guess whether a PDF is image-based (scanned) rather than digital text.

	    Samples up to the first five pages with pdfplumber and compares the
	    average number of stripped characters per sampled page against
	    ``min_chars_per_page``.

	    Args:
	        file_path: Path handed to ``pdfplumber.open``.
	        min_chars_per_page: Average below which the PDF counts as scanned.

	    Returns:
	        ``True`` when the PDF looks scanned, has no pages, pdfplumber is not
	        installed, or pdfplumber raised; ``False`` otherwise.
	    """
	    if not _HAS_PDF:
	        # Nothing to measure with, so let the OCR path handle the file.
	        return True
	    try:
	        with pdfplumber.open(file_path) as pdf:
	            pages = pdf.pages
	            if len(pages) == 0:
	                return True
	            sample = pages[:5]  # only the first 5 pages are inspected
	            char_total = sum(
	                len((page.extract_text() or "").strip()) for page in sample
	            )
	            return char_total / len(sample) < min_chars_per_page
	    except Exception:
	        # Any pdfplumber failure routes the file to OCR.
	        return True


	def _extract_native_pdf(file_path):
	    """Pull the text layer out of a digital PDF with pdfplumber.

	    Pages that yield no text are skipped; the remaining page texts are
	    separated by a blank line.

	    Returns:
	        ``(text, None)`` on success, or ``(None, message)`` when pdfplumber
	        is not installed, nothing was extracted, or parsing raised.
	    """
	    if not _HAS_PDF:
	        return None, "pdfplumber not installed"
	    try:
	        with pdfplumber.open(file_path) as pdf:
	            extracted = [page.extract_text() for page in pdf.pages]
	        combined = "\n\n".join(chunk for chunk in extracted if chunk).strip()
	        if combined:
	            return combined, None
	        return None, "No text extracted from PDF"
	    except Exception as exc:
	        return None, f"PDF parse error: {exc}"


	def _extract_scanned_pdf(file_path):
	    """Extract text from a scanned PDF using docTR OCR.

	    Words of each recognised line are joined with single spaces; lines end
	    with a newline, blocks are followed by a blank line and pages by two
	    more newlines. The combined text is passed through ``_clean_ocr_text``
	    and stripped.

	    Args:
	        file_path: Path handed to ``DocumentFile.from_pdf``.

	    Returns:
	        ``(text, None)`` on success, or ``(None, message)`` when the OCR
	        model is unavailable, no text was recognised, or OCR raised.
	    """
	    predictor = _load_ocr_model()
	    if predictor is None:
	        if _HAS_DOCTR:
	            # The package imported fine, so an install hint would mislead;
	            # surface the load failure recorded in _ocr_status instead.
	            return None, f"OCR is not available: docTR model load {_ocr_status}"
	        return None, (
	            "OCR is not available. Install python-doctr: "
	            "`pip install python-doctr[torch]`"
	        )
	    try:
	        doc = DocumentFile.from_pdf(file_path)
	        result = predictor(doc)

	        # Collect fragments and join once instead of repeated string +=.
	        pieces = []
	        for page in result.pages:
	            for blk in page.blocks:
	                pieces.extend(
	                    " ".join(word.value for word in line.words) + "\n"
	                    for line in blk.lines
	                )
	                pieces.append("\n")  # blank line after each block
	            pieces.append("\n\n")  # extra spacing after each page
	        full_text = "".join(pieces)

	        if not full_text.strip():
	            return None, "OCR could not extract text from scanned PDF"

	        # Clean up OCR artifacts
	        full_text = _clean_ocr_text(full_text)
	        return full_text.strip(), None
	    except Exception as e:
	        return None, f"OCR error: {e}"


	def _clean_ocr_text(text):
	    """Clean common OCR artifacts from recognised text.

	    Steps, in order:
	    1. Collapse runs of three or more spaces/tabs into one space.
	    2. Replace a lowercase ``l`` at the start of a word with ``I`` when it
	       is immediately followed by a capital letter (e.g. ``lNC`` → ``INC``).
	    3. Cap runs of four or more newlines at three.
	    4. Drop lines consisting of a single character, except ``.``, ``,`` and
	       ``;``; empty lines are kept.

	    Args:
	        text: Raw OCR output.

	    Returns:
	        The cleaned text.
	    """
	    # Remove excessive whitespace
	    text = re.sub(r'[ \t]{3,}', ' ', text)
	    # Fix common OCR substitution: l before capital → I.
	    # No trailing \b here: "l" and the following capital are both word
	    # characters, so a boundary between them never exists and the pattern
	    # would never match.
	    text = re.sub(r'\bl(?=[A-Z])', 'I', text)
	    # Normalize line breaks
	    text = re.sub(r'\n{4,}', '\n\n\n', text)
	    # Remove single-char lines (OCR noise), keeping blanks and punctuation
	    allowed_short = ('', '.', ',', ';')
	    kept = []
	    for line in text.split('\n'):
	        stripped = line.strip()
	        if len(stripped) > 1 or stripped in allowed_short:
	            kept.append(line)
	    return '\n'.join(kept)


	# ═══════════════════════════════════════════════════════════════════════
	# PUBLIC API
	# ═══════════════════════════════════════════════════════════════════════

	def parse_pdf_smart(file_path):
	    """
	    Smart PDF parser with OCR fallback.

	    Native (digital) PDFs are read with pdfplumber. Scanned PDFs, and
	    native PDFs from which pdfplumber extracted nothing, go through
	    docTR OCR.

	    Args:
	        file_path: Path to the PDF. ``None``, an empty path, a missing path
	            or a directory all yield the "File not found" error.

	    Returns: (text, error, method)
	        text: extracted text (or None)
	        error: error message (or None)
	        method: "native" | "ocr" | None
	    """
	    # The falsy check covers None/"" (os.path functions raise TypeError on
	    # None); isfile rather than exists rejects directories up front.
	    if not file_path or not os.path.isfile(file_path):
	        return None, "File not found", None

	    # Step 1: Check if PDF is scanned
	    is_scanned = _is_scanned_pdf(file_path)

	    if not is_scanned:
	        # Step 2a: Native PDF — use pdfplumber
	        text, error = _extract_native_pdf(file_path)
	        if text:
	            return text, None, "native"
	        # If pdfplumber returns empty, fall through to OCR
	        print("[ClauseGuard OCR] pdfplumber returned empty — falling back to OCR")

	    # Step 2b: Scanned PDF or pdfplumber failed — use OCR
	    print(f"[ClauseGuard OCR] {'Scanned' if is_scanned else 'Empty native'} PDF detected — running docTR OCR...")
	    text, error = _extract_scanned_pdf(file_path)
	    if text:
	        return text, None, "ocr"
	    return None, error, None


	def ocr_extract(file_path):
	    """Run docTR OCR on a PDF unconditionally, skipping native-text detection.

	    Intended for callers that explicitly request OCR.

	    Returns:
	        The ``(text, error)`` pair produced by ``_extract_scanned_pdf``.
	    """
	    outcome = _extract_scanned_pdf(file_path)
	    return outcome