Spaces:

okarachidera
/

CreditCopilot

Sleeping

CreditCopilot / text_utils.py

okara chidera

chore: refactored code

4e36c6c unverified about 1 month ago

972 Bytes

	from __future__ import annotations

	import re
	from typing import List

	import pdfplumber


	def read_pdf_text(pathlike) -> str:
	    """Return concatenated text from every page of the PDF.

	    Args:
	        pathlike: Either a filesystem path (``str`` or ``os.PathLike``) or an
	            object that exposes the path through a ``name`` attribute, such
	            as an open file or an upload wrapper.

	    Returns:
	        The text of all pages joined with newlines. A page whose extraction
	        yields nothing contributes an empty string.
	    """
	    import os  # local import keeps this block self-contained

	    # Real paths must be recognised before falling back to ``.name``:
	    # ``pathlib.Path.name`` is only the final path component, so using it
	    # would silently drop the directory and open the wrong file.
	    if isinstance(pathlike, (str, os.PathLike)):
	        pdf_path = os.fspath(pathlike)
	    else:
	        pdf_path = pathlike.name

	    with pdfplumber.open(pdf_path) as pdf:
	        # ``extract_text()`` may return a falsy value for a page; treat
	        # that as an empty page.
	        return "\n".join(page.extract_text() or "" for page in pdf.pages)


	def chunk_text(text: str, max_chars: int = 900, overlap: int = 120) -> List[str]:
	    """Split text into overlapping chunks with light sentence-aware boundaries.

	    Runs of whitespace are collapsed to single spaces first. Each chunk spans
	    at most ``max_chars`` characters; when a chunk would be cut mid-text, it
	    is shortened to end just after the last ``"."`` in the window, provided
	    that period lies more than 200 characters past the chunk start. The next
	    chunk begins ``overlap`` characters before the previous one ended.

	    Args:
	        text: The text to split.
	        max_chars: Maximum number of characters per chunk; must be >= 1.
	        overlap: Number of trailing characters of one chunk repeated at the
	            start of the next; must be >= 0.

	    Returns:
	        The non-empty, whitespace-stripped chunks in document order.

	    Raises:
	        ValueError: If ``max_chars`` is less than 1 or ``overlap`` is negative.
	    """
	    if max_chars < 1:
	        raise ValueError("max_chars must be a positive integer")
	    if overlap < 0:
	        raise ValueError("overlap must be non-negative")

	    # A sentence break is only accepted if the chunk keeps more than this
	    # many characters, so chunks are not cut uselessly short.
	    min_sentence_offset = 200

	    text = re.sub(r"\s+", " ", text).strip()
	    total = len(text)
	    chunks: List[str] = []
	    start = 0
	    while start < total:
	        end = min(start + max_chars, total)
	        if end < total:
	            period = text.rfind(".", start, end)
	            if period > start + min_sentence_offset:
	                end = period + 1
	        piece = text[start:end].strip()
	        if piece:
	            chunks.append(piece)
	        if end >= total:
	            # The tail has been emitted; stepping back by ``overlap`` here
	            # would only re-emit suffixes of the final chunk.
	            break
	        # Step back by ``overlap`` but always advance at least one character
	        # so the loop terminates even when overlap >= chunk length.
	        start = max(end - overlap, start + 1)
	    return chunks