Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import re | |
| from typing import List | |
| import pdfplumber | |
def read_pdf_text(pathlike) -> str:
    """Return concatenated text from every page of the PDF.

    Args:
        pathlike: Either a filesystem path (``str`` or any ``os.PathLike``
            such as ``pathlib.Path``) or an object exposing the path through
            a ``name`` attribute (for example an upload / temp-file wrapper).

    Returns:
        The text of all pages joined with newlines. A page for which
        ``extract_text()`` returns nothing contributes an empty string, so
        the number of lines separators still reflects the page count.
    """
    import os

    # A real path must be used as-is: ``Path.name`` is only the basename and
    # ``str`` has no ``name`` at all, so reading ``.name`` would be wrong here.
    if isinstance(pathlike, (str, os.PathLike)):
        source = os.fspath(pathlike)
    else:
        source = pathlike.name

    with pdfplumber.open(source) as pdf:
        return "\n".join(page.extract_text() or "" for page in pdf.pages)
def chunk_text(text: str, max_chars: int = 900, overlap: int = 120) -> List[str]:
    """Split text into overlapping chunks with light sentence-aware boundaries.

    Whitespace runs are first collapsed to single spaces. Each chunk spans at
    most ``max_chars`` characters; when a chunk does not reach the end of the
    text it is shortened to end just after the last ``"."`` in its window,
    provided that full stop lies more than 200 characters past the chunk
    start. Every chunk after the first begins ``overlap`` characters before
    the end of the previous one.

    Args:
        text: The text to split.
        max_chars: Maximum chunk length in characters; must be positive.
        overlap: Number of trailing characters of a chunk repeated at the
            start of the next one; must be non-negative. If it is not smaller
            than the length of the chunk just produced, the next chunk starts
            where the previous one ended (no overlap) so the scan always
            advances.

    Returns:
        The non-empty chunks, each stripped of surrounding whitespace.

    Raises:
        ValueError: If ``max_chars`` is not positive or ``overlap`` is
            negative.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    if overlap < 0:
        raise ValueError("overlap must be non-negative")

    text = re.sub(r"\s+", " ", text).strip()
    total = len(text)
    chunks: List[str] = []
    start = 0
    while start < total:
        end = min(start + max_chars, total)
        if end < total:
            # Prefer ending on a sentence boundary, but only if that still
            # leaves a reasonably sized chunk (more than 200 characters).
            period = text.rfind(".", start, end)
            if period > start + 200:
                end = period + 1

        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)

        if end >= total:
            # The tail has been emitted; stepping back by ``overlap`` here
            # would only re-emit text already covered by this chunk.
            break

        # Step back by ``overlap`` so consecutive chunks share context, but
        # never at the cost of forward progress.
        next_start = end - overlap
        start = next_start if next_start > start else end
    return chunks