# Commit 78a356b (awellis): Refactor document ingestion and processing; update
# configurations for chunking and retrieval, enhance error logging, and
# implement markdown-aware chunking.
"""Document chunking with markdown-aware semantic splitting."""
from typing import List
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.converters import MarkdownToDocument
import logging
import re
logger = logging.getLogger(__name__)
class SemanticChunker:
    """Chunks documents using markdown-aware semantic splitting.

    Documents are split on H2 (``##``) markdown headers into one chunk per
    section, and each chunk's markdown formatting is stripped to plain text.
    Documents without H2 headers become a single plain-text chunk.
    """

    # Patterns are compiled once at class-definition time instead of on
    # every call to _markdown_to_plain / _split_by_markdown_headers.
    _HEADER_RE = re.compile(r'^#{1,6}\s+(.+)$', re.MULTILINE)  # any ATX header level
    _BOLD_RE = re.compile(r'\*\*(.+?)\*\*')
    _ITALIC_RE = re.compile(r'\*(.+?)\*')
    _LINK_RE = re.compile(r'\[(.+?)\]\(.+?\)')  # [text](url) -> text
    _BULLET_RE = re.compile(r'^[*+-]\s+', re.MULTILINE)  # '*', '-' or '+' bullets
    _H2_RE = re.compile(r'^##\s+(.+)$', re.MULTILINE)  # section boundaries

    def __init__(
        self,
        chunk_size: int = 300,
        chunk_overlap: int = 50,
        min_chunk_size: int = 100,
    ):
        """
        Initialize the chunker.

        Args:
            chunk_size: Target number of words per chunk (not used for markdown splitting)
            chunk_overlap: Number of words to overlap between chunks
            min_chunk_size: Minimum number of words per chunk
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
        # Fallback splitter for non-markdown documents.
        # NOTE(review): not referenced by chunk_documents in this file —
        # kept so the attribute stays available to external callers.
        self.splitter = DocumentSplitter(
            split_by="sentence",
            split_length=chunk_size,
            split_overlap=chunk_overlap,
            split_threshold=min_chunk_size,
        )
        # Markdown to plain text converter.
        # NOTE(review): also unused here; retained for backward compatibility.
        self.md_converter = MarkdownToDocument()

    def _markdown_to_plain(self, markdown_text: str) -> str:
        """Convert markdown to plain text, removing formatting.

        Strips ATX headers (keeping the header text), bold/italic markers,
        link syntax (keeping the anchor text) and leading bullet markers.
        """
        text = markdown_text
        # Remove header hashes but keep the text. Generalized from H2-only
        # to all header levels (#..######) so H1/H3+ don't leak '#' chars.
        text = self._HEADER_RE.sub(r'\1', text)
        # Remove bold before italic so '**x**' is not half-matched by '*x*'.
        text = self._BOLD_RE.sub(r'\1', text)
        text = self._ITALIC_RE.sub(r'\1', text)
        # Remove links but keep the anchor text.
        text = self._LINK_RE.sub(r'\1', text)
        # Remove bullet-point markers at line starts.
        text = self._BULLET_RE.sub('', text)
        return text.strip()

    def _split_by_markdown_headers(self, doc: Document) -> List[Document]:
        """Split a document by markdown H2 headers (##), then convert to plain text.

        Returns one Document per section with a 'section' meta key; a
        pre-header preamble of at least 10 words becomes an 'Introduction'
        chunk. Without any H2 header the whole document is one chunk.
        """
        content = doc.content
        meta = doc.meta or {}  # guard: meta may be None (chunk_documents already guards)
        matches = list(self._H2_RE.finditer(content))
        if not matches:
            # No headers found: convert the whole document to plain text.
            plain_text = self._markdown_to_plain(content)
            return [Document(content=plain_text, meta=meta)]
        chunks = []
        doc_title = meta.get("file_name", "Unknown")
        # Extract the preamble (text before the first header), if any.
        if matches[0].start() > 0:
            preamble_md = content[:matches[0].start()].strip()
            if preamble_md:
                preamble_plain = self._markdown_to_plain(preamble_md)
                # Skip trivially short preambles (fewer than 10 words).
                if len(preamble_plain.split()) >= 10:
                    chunk_meta = {**meta, "section": "Introduction"}
                    chunks.append(Document(content=preamble_plain, meta=chunk_meta))
        # Extract each section: header line through the start of the next header.
        for i, match in enumerate(matches):
            header = match.group(1).strip()
            start = match.start()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(content)
            section_md = content[start:end].strip()
            if section_md:
                section_plain = self._markdown_to_plain(section_md)
                logger.debug("Section '%s': %d words", header, len(section_plain.split()))
                chunk_meta = {**meta, "section": header}
                chunks.append(Document(content=section_plain, meta=chunk_meta))
        logger.info("Split '%s' into %d sections by markdown headers", doc_title, len(chunks))
        return chunks

    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """
        Chunk documents into smaller pieces using markdown-aware splitting.

        Args:
            documents: List of documents to chunk

        Returns:
            List of chunked documents with 'chunk_id' and 'chunk_size'
            (word count) added to each chunk's metadata.
        """
        if not documents:
            logger.warning("No documents to chunk")
            return []
        logger.info("Chunking %d documents with markdown-aware splitting", len(documents))
        # First, split every document by its markdown headers.
        all_chunks = []
        for doc in documents:
            all_chunks.extend(self._split_by_markdown_headers(doc))
        # Tag each chunk with a sequential id and its word count.
        for idx, doc in enumerate(all_chunks):
            if doc.meta is None:
                doc.meta = {}
            doc.meta["chunk_id"] = idx
            doc.meta["chunk_size"] = len(doc.content.split())
        logger.info("Created %d chunks from %d documents", len(all_chunks), len(documents))
        # Log size statistics for observability.
        chunk_sizes = [doc.meta.get("chunk_size", 0) for doc in all_chunks]
        if chunk_sizes:
            avg_size = sum(chunk_sizes) / len(chunk_sizes)
            logger.info(
                "Chunk statistics - Avg: %.1f words, Min: %d, Max: %d",
                avg_size,
                min(chunk_sizes),
                max(chunk_sizes),
            )
        return all_chunks