Spaces:
Sleeping
Sleeping
| """ | |
ClauseGuard — OCR Engine v1.0
═════════════════════════════
Smart PDF Router: detects native vs scanned PDFs.
  • Native PDF  → pdfplumber (fast, existing)
  • Scanned PDF → docTR OCR (CPU-friendly, ~150MB models)

Architecture:
  PDF uploaded
       ↓
  [detect_if_scanned] → pdfplumber gets <50 chars/page?
       ↓                      ↓
  Native PDF             Scanned PDF
       ↓                      ↓
  pdfplumber             docTR OCR (CPU)
       ↓                      ↓
  Contract text → existing analysis pipeline
| """ | |
| import os | |
| import re | |
# ── docTR (soft-fail: OCR is simply reported as unavailable) ──────────
# Cached predictor instance; stays None until the model is first loaded.
_ocr_predictor = None
try:
    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor as _make_predictor
except ImportError:
    # python-doctr is optional; the OCR path checks this flag before use.
    _HAS_DOCTR = False
else:
    _HAS_DOCTR = True

# ── pdfplumber (soft-fail: native text extraction is skipped) ─────────
try:
    import pdfplumber
except ImportError:
    _HAS_PDF = False
else:
    _HAS_PDF = True
# ══════════════════════════════════════════════════════════════════════
# OCR MODEL LOADING
# ══════════════════════════════════════════════════════════════════════
_ocr_status = "not_loaded"


def _load_ocr_model():
    """Return the shared docTR predictor, building it on first use.

    The predictor is cached in the module-level ``_ocr_predictor`` so the
    models are only constructed once per process. ``_ocr_status`` is
    updated to describe the outcome ("loaded", "unavailable ..." or
    "failed: ...").

    Returns:
        The predictor object, or ``None`` when python-doctr is not
        importable or when building the predictor raises.
    """
    global _ocr_predictor, _ocr_status

    # Already built by an earlier call: reuse it.
    if _ocr_predictor is not None:
        return _ocr_predictor

    if not _HAS_DOCTR:
        _ocr_status = "unavailable (python-doctr not installed)"
        return None

    predictor_options = {
        "det_arch": "fast_base",
        "reco_arch": "crnn_vgg16_bn",
        "pretrained": True,
        "assume_straight_pages": True,
    }
    try:
        print("[ClauseGuard OCR] Loading docTR models (fast_base + crnn_vgg16_bn)...")
        _ocr_predictor = _make_predictor(**predictor_options)
        _ocr_status = "loaded"
        print("[ClauseGuard OCR] docTR models loaded successfully")
    except Exception as exc:
        # A failure is recorded but not cached: the predictor stays None,
        # so the next call attempts the load again.
        _ocr_status = f"failed: {exc}"
        print(f"[ClauseGuard OCR] docTR load failed: {exc}")
        return None
    return _ocr_predictor
def get_ocr_status():
    """Describe the OCR engine state as a short display string.

    Returns:
        One of three fixed strings: predictor loaded, docTR importable but
        not loaded yet, or docTR not installed.
    """
    # NOTE(review): only `_ocr_predictor` and `_HAS_DOCTR` are consulted;
    # `_ocr_status` is not, so a failed model load is still reported as
    # "available (not yet loaded)".
    if _ocr_predictor is not None:
        return "β OCR: docTR loaded"
    if not _HAS_DOCTR:
        return "β OCR: unavailable (python-doctr not installed)"
    return "β³ OCR: docTR available (not yet loaded)"
# ══════════════════════════════════════════════════════════════════════
# SMART PDF ROUTER
# ══════════════════════════════════════════════════════════════════════
def _is_scanned_pdf(file_path, min_chars_per_page=50, max_pages=5):
    """Guess whether a PDF is scanned (image-based) rather than native.

    The first ``max_pages`` pages are run through pdfplumber and the
    average length of their stripped text is compared with
    ``min_chars_per_page``.

    Args:
        file_path: Path of the PDF to inspect.
        min_chars_per_page: Average characters per sampled page below
            which the PDF is treated as scanned.
        max_pages: Number of leading pages to sample. Values below 1 are
            clamped to 1 so the average is always well defined.

    Returns:
        ``True`` when the PDF looks scanned, has no pages, pdfplumber is
        not installed, or pdfplumber fails on the file; ``False`` when the
        sampled pages carry enough extractable text.
    """
    if not _HAS_PDF:
        return True  # Can't check with pdfplumber, assume scanned
    try:
        with pdfplumber.open(file_path) as pdf:
            sample = pdf.pages[:max(1, max_pages)]
            if not sample:
                return True
            # extract_text() may return None for a page without a text layer.
            total_chars = sum(
                len((page.extract_text() or "").strip()) for page in sample
            )
            return total_chars / len(sample) < min_chars_per_page
    except Exception:
        # Deliberate best-effort: if pdfplumber cannot read the file,
        # report "scanned" so the caller routes it to OCR.
        return True
def _extract_native_pdf(file_path):
    """Pull the text layer out of a native (digital) PDF with pdfplumber.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        A ``(text, error)`` pair. On success ``text`` is the stripped text
        of all pages that yielded any, separated by blank lines, and
        ``error`` is ``None``. On failure ``text`` is ``None`` and ``error``
        is a message (pdfplumber missing, nothing extracted, or the
        exception raised while parsing).
    """
    if not _HAS_PDF:
        return None, "pdfplumber not installed"
    try:
        with pdfplumber.open(file_path) as pdf:
            extracted = (page.extract_text() for page in pdf.pages)
            # Pages with no text (None or "") are skipped entirely.
            combined = "".join(chunk + "\n\n" for chunk in extracted if chunk)
        combined = combined.strip()
        if combined:
            return combined, None
        return None, "No text extracted from PDF"
    except Exception as exc:
        return None, f"PDF parse error: {exc}"
def _extract_scanned_pdf(file_path):
    """Run docTR OCR over a PDF and return the recognised text.

    Args:
        file_path: Path of the PDF to OCR.

    Returns:
        A ``(text, error)`` pair. On success ``text`` is the cleaned,
        stripped OCR output and ``error`` is ``None``. On failure ``text``
        is ``None`` and ``error`` explains why (OCR unavailable, nothing
        recognised, or the exception raised during OCR).
    """
    predictor = _load_ocr_model()
    if predictor is None:
        return None, (
            "OCR is not available. Install python-doctr: "
            "`pip install python-doctr[torch]`"
        )
    try:
        ocr_result = predictor(DocumentFile.from_pdf(file_path))

        # Layout of the assembled text: one line per OCR line, a blank line
        # after each block, and two further newlines after each page.
        page_chunks = []
        for page in ocr_result.pages:
            block_chunks = []
            for block in page.blocks:
                rendered_lines = [
                    " ".join(word.value for word in line.words) + "\n"
                    for line in block.lines
                ]
                block_chunks.append("".join(rendered_lines) + "\n")
            page_chunks.append("".join(block_chunks) + "\n\n")
        raw_text = "".join(page_chunks)

        if not raw_text.strip():
            return None, "OCR could not extract text from scanned PDF"
        # Strip OCR artifacts before handing the text back.
        return _clean_ocr_text(raw_text).strip(), None
    except Exception as exc:
        return None, f"OCR error: {exc}"
| def _clean_ocr_text(text): | |
| """Clean common OCR artifacts.""" | |
| # Remove excessive whitespace | |
| text = re.sub(r'[ \t]{3,}', ' ', text) | |
| # Fix common OCR substitutions | |
| text = re.sub(r'\bl\b(?=[A-Z])', 'I', text) # l before capital β I | |
| # Normalize line breaks | |
| text = re.sub(r'\n{4,}', '\n\n\n', text) | |
| # Remove single-char lines (OCR noise) | |
| lines = text.split('\n') | |
| cleaned_lines = [] | |
| for line in lines: | |
| stripped = line.strip() | |
| if len(stripped) <= 1 and stripped not in ('', '.', ',', ';'): | |
| continue | |
| cleaned_lines.append(line) | |
| return '\n'.join(cleaned_lines) | |
# ══════════════════════════════════════════════════════════════════════
# PUBLIC API
# ══════════════════════════════════════════════════════════════════════
def parse_pdf_smart(file_path):
    """Smart PDF parser with OCR fallback.

    The PDF is first classified as native or scanned. Native PDFs are read
    with pdfplumber; scanned PDFs, and native ones that yield no text, go
    through docTR OCR.

    Args:
        file_path: Path of the PDF to parse.

    Returns:
        A ``(text, error, method)`` tuple:
          text:   extracted text (or None)
          error:  error message (or None)
          method: "native" | "ocr" | None
    """
    # os.path.exists(None) raises TypeError and a directory passes
    # exists(), so guard None explicitly and require a regular file.
    if file_path is None or not os.path.isfile(file_path):
        return None, "File not found", None

    # Step 1: Check if PDF is scanned
    is_scanned = _is_scanned_pdf(file_path)
    native_error = None
    if not is_scanned:
        # Step 2a: Native PDF - use pdfplumber
        text, native_error = _extract_native_pdf(file_path)
        if text:
            return text, None, "native"
        # If pdfplumber returns nothing, fall through to OCR
        print("[ClauseGuard OCR] pdfplumber returned empty β falling back to OCR")

    # Step 2b: Scanned PDF or pdfplumber failed - use OCR
    print(f"[ClauseGuard OCR] {'Scanned' if is_scanned else 'Empty native'} PDF detected β running docTR OCR...")
    text, ocr_error = _extract_scanned_pdf(file_path)
    if text:
        return text, None, "ocr"

    # Both routes failed: keep the native error too, so a parse failure is
    # not hidden behind an unrelated OCR message.
    if native_error:
        return None, f"{ocr_error} (native extraction also failed: {native_error})", None
    return None, ocr_error, None
def ocr_extract(file_path):
    """Force OCR on a PDF, skipping the native-text check.

    Intended for callers that explicitly want OCR.

    Args:
        file_path: Path of the PDF to OCR.

    Returns:
        The ``(text, error)`` pair produced by ``_extract_scanned_pdf``.
    """
    text, error = _extract_scanned_pdf(file_path)
    return text, error