# CreditCopilot / text_utils.py
# Author: okara chidera
# Last commit: "chore: refactored code" (4e36c6c, unverified)
# Repository viewer links: raw / history / blame / contribute / delete
# File size: 972 Bytes
from __future__ import annotations
import re
from typing import List
import pdfplumber
def read_pdf_text(pathlike) -> str:
    """Return concatenated text from every page of the PDF.

    Args:
        pathlike: Where to read the PDF from. A ``str`` or ``os.PathLike``
            (e.g. ``pathlib.Path``) is used as the path directly. Any other
            object is treated as a file wrapper and its ``name`` attribute
            is used as the path; if it has no ``name`` the object itself is
            handed to ``pdfplumber.open``.

    Returns:
        The text of all pages joined with newlines. A page for which
        ``extract_text()`` yields nothing contributes an empty string, so
        the number of newline-separated segments matches the page count.
    """
    # Local import keeps this block self-contained without touching the
    # module-level import list.
    import os

    if isinstance(pathlike, (str, os.PathLike)):
        # Real paths must be passed through unchanged: reading ``.name`` on a
        # ``pathlib.Path`` would drop the directory part, and ``str`` has no
        # ``.name`` at all.
        source = pathlike
    else:
        # File wrappers expose the on-disk location through ``.name``.
        source = getattr(pathlike, "name", pathlike)

    with pdfplumber.open(source) as pdf:
        return "\n".join(page.extract_text() or "" for page in pdf.pages)
def chunk_text(text: str, max_chars: int = 900, overlap: int = 120) -> List[str]:
    """Split text into overlapping chunks with light sentence-aware boundaries.

    Whitespace runs are first collapsed to single spaces. Each chunk spans at
    most ``max_chars`` characters; when a chunk would end mid-text and a
    period lies more than 200 characters into it, the chunk is cut right
    after the last such period instead. Every chunk after the first starts
    ``overlap`` characters before the end of the previous one.

    Args:
        text: The text to split.
        max_chars: Maximum length of a chunk, in characters. Must be >= 1.
        overlap: Number of trailing characters of a chunk that are repeated
            at the start of the next one. Must be >= 0. If it is so large
            that stepping back would not advance, the window moves forward
            by a single character instead.

    Returns:
        The non-empty, stripped chunks in document order. Empty or
        whitespace-only input gives an empty list.

    Raises:
        ValueError: If ``max_chars`` is less than 1 or ``overlap`` is
            negative.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")
    if overlap < 0:
        raise ValueError("overlap must be non-negative")

    text = re.sub(r"\s+", " ", text).strip()
    total = len(text)
    # A period only counts as a boundary when it leaves a reasonably sized
    # chunk; this avoids emitting tiny fragments.
    min_sentence_chars = 200

    chunks: List[str] = []
    start = 0
    while start < total:
        end = min(start + max_chars, total)
        if end < total:
            period = text.rfind(".", start, end)
            if period > start + min_sentence_chars:
                end = period + 1

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end >= total:
            # The tail has been emitted; stepping back by ``overlap`` here
            # would only re-emit suffixes of the final chunk.
            break
        # Step back to create the overlap, but always advance by at least one
        # character so the loop terminates for any ``overlap`` value.
        start = max(end - overlap, start + 1)
    return chunks