Spaces:
Running
Running
| """ | |
| PDF Reader | |
| ---------- | |
| Low-level utility: opens a PDF and returns its text page by page. | |
| Uses pdfplumber for clean text extraction; falls back to PyMuPDF when pdfplumber is not installed. | |
| """ | |
| import re | |
| from pathlib import Path | |
| from typing import List, Dict | |
def read_pdf_pages(pdf_path: str) -> List[str]:
    """Return the text of a PDF as a list of strings, one per page.

    pdfplumber is used when it can be imported; otherwise PyMuPDF
    (``fitz``) is used as a fallback.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        One string per page, in document order. With pdfplumber, a page
        whose extraction yields nothing is represented by ``""``.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        ImportError: If neither pdfplumber nor PyMuPDF is installed.
    """
    if not Path(pdf_path).exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    # Guard only the import itself, so that an ImportError raised while
    # actually processing a document is not mistaken for "not installed".
    try:
        import pdfplumber
    except ImportError:
        pdfplumber = None

    if pdfplumber is not None:
        with pdfplumber.open(pdf_path) as pdf:
            return [page.extract_text() or "" for page in pdf.pages]

    # Fallback: PyMuPDF (fitz)
    try:
        import fitz
    except ImportError as err:
        raise ImportError(
            "Neither pdfplumber nor PyMuPDF is installed.\n"
            "Run: pip install pdfplumber OR pip install pymupdf"
        ) from err

    doc = fitz.open(pdf_path)
    try:
        return [page.get_text() for page in doc]
    finally:
        # Always release the document handle, even if extraction fails.
        doc.close()
def read_pdf_text(pdf_path: str) -> str:
    """Return the whole PDF as one string, with pages separated by newlines.

    The per-page text comes from ``read_pdf_pages``; any exception it
    raises propagates unchanged.
    """
    page_texts = read_pdf_pages(pdf_path)
    return "\n".join(page_texts)
def extract_tables_from_pdf(pdf_path: str) -> List[List[List[List[str]]]]:
    """Extract tables page by page using pdfplumber.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        A list with one entry per page. Each entry is the list of tables
        found on that page (empty when there are none); each table is a
        list of rows, and each row is a list of cell values.
        NOTE(review): pdfplumber appears to report empty cells as ``None``
        rather than ``""`` - confirm before relying on ``str`` cells.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        ImportError: If pdfplumber is not installed.
    """
    # Report a missing file the same way read_pdf_pages does, regardless
    # of whether pdfplumber is available.
    if not Path(pdf_path).exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    # Guard only the import, so errors raised during extraction surface as-is.
    try:
        import pdfplumber
    except ImportError as err:
        raise ImportError(
            "pdfplumber is required for table extraction. Run: pip install pdfplumber"
        ) from err

    with pdfplumber.open(pdf_path) as pdf:
        return [page.extract_tables() or [] for page in pdf.pages]
def clean_text(text: str) -> str:
    """Collapse every run of whitespace to a single space and trim the ends.

    Args:
        text: Raw text, e.g. as extracted from a PDF page.

    Returns:
        The text with each whitespace run replaced by one space and with
        no leading or trailing whitespace.
    """
    # Splitting on whitespace runs and re-joining with a single space is
    # the same substitution expressed as split + join.
    pieces = re.split(r"\s+", text)
    collapsed = " ".join(pieces)
    return collapsed.strip()