from __future__ import annotations

import os
import re
from typing import List

import pdfplumber


def read_pdf_text(pathlike) -> str:
    """Return concatenated text from every page of the PDF.

    Args:
        pathlike: Either a filesystem path (``str`` or ``os.PathLike``), or an
            object exposing the path in a ``name`` attribute (for example an
            open file or an upload wrapper). Objects with neither are handed
            to ``pdfplumber.open`` unchanged.

    Returns:
        The text of all pages joined with newlines. A page for which
        ``extract_text()`` returns a falsy value contributes an empty string.
    """
    if isinstance(pathlike, (str, os.PathLike)):
        # Use real paths as-is: ``Path.name`` is only the final path
        # component, and ``str`` has no ``name`` attribute at all.
        source = pathlike
    else:
        source = getattr(pathlike, "name", pathlike)

    with pdfplumber.open(source) as pdf:
        pages: List[str] = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages)


def chunk_text(text: str, max_chars: int = 900, overlap: int = 120) -> List[str]:
    """Split text into overlapping chunks with light sentence-aware boundaries.

    Runs of whitespace are collapsed to single spaces first. Each chunk is at
    most ``max_chars`` characters long. When a chunk does not reach the end of
    the text, it is cut just after the last ``"."`` in the window, provided
    that period lies more than 200 characters past the chunk start.

    Args:
        text: The text to split.
        max_chars: Maximum chunk length in characters; must be positive.
        overlap: Number of trailing characters of a chunk that are repeated at
            the start of the next one. Negative values are treated as ``0``.
            If the overlap is as long as the chunk just emitted, no overlap is
            applied for that step so the scan always moves forward.

    Returns:
        The non-empty, whitespace-stripped chunks in document order.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    overlap = max(overlap, 0)

    text = re.sub(r"\s+", " ", text).strip()
    total = len(text)
    chunks: List[str] = []
    start = 0
    while start < total:
        end = min(start + max_chars, total)
        if end < total:
            # Prefer to end on a sentence boundary, but only when the period
            # is far enough into the window.
            period = text.rfind(".", start, end)
            if period != -1 and period > start + 200:
                end = period + 1
        chunks.append(text[start:end].strip())
        if end >= total:
            # The tail has been emitted; stepping back would only duplicate it.
            break
        next_start = end - overlap
        # Step back by ``overlap`` unless that would stall or move backwards.
        start = next_start if next_start > start else end
    return [chunk for chunk in chunks if chunk]