from __future__ import annotations

import os
import re
from typing import List

import pdfplumber

# A sentence-ending period is only used as a cut point when it leaves the
# chunk longer than this many characters; earlier periods are ignored so
# chunks do not become tiny.
_MIN_SENTENCE_CUT = 200


def read_pdf_text(pathlike) -> str:
    """Return concatenated text from every page of the PDF.

    Args:
        pathlike: Either a filesystem path (``str`` or ``os.PathLike``) or an
            object that exposes the path through a ``name`` attribute, such
            as an open file or temp-file wrapper.

    Returns:
        The text of all pages joined with newlines. A page whose
        ``extract_text()`` result is empty or ``None`` contributes an empty
        string.
    """
    # Real paths are passed through untouched: ``str`` has no ``name`` and
    # ``Path.name`` is only the basename, which would open the wrong file.
    if isinstance(pathlike, (str, os.PathLike)):
        source = os.fspath(pathlike)
    else:
        source = pathlike.name

    with pdfplumber.open(source) as pdf:
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


def chunk_text(text: str, max_chars: int = 900, overlap: int = 120) -> List[str]:
    """Split text into overlapping chunks with light sentence-aware boundaries.

    Whitespace runs are collapsed to single spaces first. Each chunk spans at
    most ``max_chars`` characters; when a chunk does not reach the end of the
    text it is shortened to end just after the last ``.`` in its window,
    provided that period lies more than 200 characters into the chunk.

    Args:
        text: The text to split.
        max_chars: Maximum length of a chunk before stripping. Must be
            positive.
        overlap: Number of trailing characters of one chunk that are repeated
            at the start of the next. If it is as large as the chunk itself,
            the window still advances by one character so the loop ends.

    Returns:
        The non-empty, stripped chunks in document order.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be positive")

    text = re.sub(r"\s+", " ", text).strip()
    total = len(text)
    chunks: List[str] = []
    start = 0

    while start < total:
        end = min(start + max_chars, total)

        if end < total:
            # rfind returns -1 when there is no period, which can never exceed
            # the threshold because start is non-negative.
            period = text.rfind(".", start, end)
            if period > start + _MIN_SENTENCE_CUT:
                end = period + 1

        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)

        if end >= total:
            # The tail has been emitted; stepping back by the overlap here
            # would only re-emit suffixes of the final chunk.
            break

        # Step back so consecutive chunks share ``overlap`` characters, but
        # always move forward so an oversized overlap cannot stall the loop.
        start = max(end - overlap, start + 1)

    return chunks