Spaces:
Runtime error
Runtime error
| """PDF text extraction and chunking for RAG.""" | |
| import re | |
| from pathlib import Path | |
| from typing import List | |
| import pdfplumber | |
| from pypdf import PdfReader | |
| from config import CHUNK_OVERLAP, CHUNK_SIZE | |
def extract_text_from_pdf(pdf_path: str | Path) -> str:
    """Extract text from a PDF using pdfplumber (better for tables) with pypdf fallback.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        Whitespace-normalized text. Runs of spaces/tabs/newlines inside each
        page are collapsed to single spaces, but pages remain separated by
        "\\n\\n" so that chunk_text() can break at paragraph boundaries.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    path = Path(pdf_path)
    if not path.exists():
        raise FileNotFoundError(f"PDF not found: {path}")

    text_parts: List[str] = []
    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                t = page.extract_text()
                if t:
                    text_parts.append(t)
    except Exception:
        # Fallback to pypdf. Discard any pages pdfplumber managed to extract
        # before failing, otherwise those pages would appear twice in the output.
        text_parts = []
        reader = PdfReader(path)
        for page in reader.pages:
            t = page.extract_text()
            if t:
                text_parts.append(t)

    # Normalize whitespace within each page, but keep the "\n\n" page
    # separator: collapsing *all* whitespace would erase every newline and
    # defeat chunk_text()'s newline-boundary splitting.
    normalized = (re.sub(r"\s+", " ", part).strip() for part in text_parts)
    return "\n\n".join(p for p in normalized if p).strip()
def chunk_text(
    text: str,
    chunk_size: int = CHUNK_SIZE,
    overlap: int = CHUNK_OVERLAP,
) -> List[dict]:
    """
    Split text into overlapping chunks for embedding.

    Args:
        text: The text to split. Empty/whitespace-only input yields [].
        chunk_size: Target maximum chunk length in characters.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        List of dicts with 'text' and 'metadata' (chunk_index). Chunks are
        cut preferentially at a sentence (". ") or newline boundary when one
        falls in the second half of the window.
    """
    if not text or not text.strip():
        return []

    chunks: List[dict] = []
    start = 0
    index = 0
    text = text.strip()
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        # Try to break at sentence or word boundary
        if end < len(text):
            last_period = chunk.rfind(". ")
            last_newline = chunk.rfind("\n")
            break_at = max(last_period, last_newline)
            # Only honor the boundary if it keeps the chunk reasonably large.
            if break_at > chunk_size // 2:
                chunk = chunk[: break_at + 1]
                end = start + break_at + 1
        chunk = chunk.strip()
        if chunk:
            chunks.append({
                "text": chunk,
                "metadata": {"chunk_index": index},
            })
            index += 1
        if end >= len(text):
            break
        # Guard against a non-advancing window: if overlap >= the distance
        # actually covered (e.g. overlap >= chunk_size), `end - overlap`
        # would not move past `start` and the loop would never terminate.
        next_start = end - overlap
        start = next_start if next_start > start else end
    return chunks
def process_pdf(pdf_path: str | Path, source_name: str | None = None) -> List[dict]:
    """
    Extract text from PDF and return chunks with source metadata.

    source_name: optional label (e.g. filename) for metadata.
    """
    pdf_file = Path(pdf_path)
    # Falsy source_name (None or "") falls back to the file's own name.
    label = source_name or pdf_file.name
    result = chunk_text(extract_text_from_pdf(pdf_file))
    for entry in result:
        entry["metadata"]["source"] = label
    return result