# Commit 78a356b (awellis): Refactor document ingestion and processing; update
# configurations for chunking and retrieval, enhance error logging, and
# implement markdown-aware chunking.
"""Document chunking with markdown-aware semantic splitting."""
from typing import List
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.converters import MarkdownToDocument
import logging
import re
logger = logging.getLogger(__name__)
class SemanticChunker:
    """Chunks documents using markdown-aware semantic splitting.

    Documents are split on H2 (``##``) markdown headers into one chunk per
    section, and each chunk's markdown formatting is stripped to plain text.
    Documents without H2 headers become a single plain-text chunk.
    """

    # Patterns are compiled once at class-definition time instead of on
    # every call to _markdown_to_plain / _split_by_markdown_headers.
    _HEADER_RE = re.compile(r'^#{1,6}\s+(.+)$', re.MULTILINE)  # any ATX header level
    _BOLD_RE = re.compile(r'\*\*(.+?)\*\*')
    _ITALIC_RE = re.compile(r'\*(.+?)\*')
    _LINK_RE = re.compile(r'\[(.+?)\]\(.+?\)')  # [text](url) -> text
    _BULLET_RE = re.compile(r'^[*+-]\s+', re.MULTILINE)  # '*', '-' or '+' bullets
    _H2_RE = re.compile(r'^##\s+(.+)$', re.MULTILINE)  # section boundaries

    def __init__(
        self,
        chunk_size: int = 300,
        chunk_overlap: int = 50,
        min_chunk_size: int = 100,
    ):
        """
        Initialize the chunker.

        Args:
            chunk_size: Target number of words per chunk (not used for markdown splitting)
            chunk_overlap: Number of words to overlap between chunks
            min_chunk_size: Minimum number of words per chunk
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
        # Fallback splitter for non-markdown documents.
        # NOTE(review): not referenced by chunk_documents in this file —
        # kept so the attribute stays available to external callers.
        self.splitter = DocumentSplitter(
            split_by="sentence",
            split_length=chunk_size,
            split_overlap=chunk_overlap,
            split_threshold=min_chunk_size,
        )
        # Markdown to plain text converter.
        # NOTE(review): also unused here; retained for backward compatibility.
        self.md_converter = MarkdownToDocument()

    def _markdown_to_plain(self, markdown_text: str) -> str:
        """Convert markdown to plain text, removing formatting.

        Strips ATX headers (keeping the header text), bold/italic markers,
        link syntax (keeping the anchor text) and leading bullet markers.
        """
        text = markdown_text
        # Remove header hashes but keep the text. Generalized from H2-only
        # to all header levels (#..######) so H1/H3+ don't leak '#' chars.
        text = self._HEADER_RE.sub(r'\1', text)
        # Remove bold before italic so '**x**' is not half-matched by '*x*'.
        text = self._BOLD_RE.sub(r'\1', text)
        text = self._ITALIC_RE.sub(r'\1', text)
        # Remove links but keep the anchor text.
        text = self._LINK_RE.sub(r'\1', text)
        # Remove bullet-point markers at line starts.
        text = self._BULLET_RE.sub('', text)
        return text.strip()

    def _split_by_markdown_headers(self, doc: Document) -> List[Document]:
        """Split a document by markdown H2 headers (##), then convert to plain text.

        Returns one Document per section with a 'section' meta key; a
        pre-header preamble of at least 10 words becomes an 'Introduction'
        chunk. Without any H2 header the whole document is one chunk.
        """
        content = doc.content
        meta = doc.meta or {}  # guard: meta may be None (chunk_documents already guards)
        matches = list(self._H2_RE.finditer(content))
        if not matches:
            # No headers found: convert the whole document to plain text.
            plain_text = self._markdown_to_plain(content)
            return [Document(content=plain_text, meta=meta)]
        chunks = []
        doc_title = meta.get("file_name", "Unknown")
        # Extract the preamble (text before the first header), if any.
        if matches[0].start() > 0:
            preamble_md = content[:matches[0].start()].strip()
            if preamble_md:
                preamble_plain = self._markdown_to_plain(preamble_md)
                # Skip trivially short preambles (fewer than 10 words).
                if len(preamble_plain.split()) >= 10:
                    chunk_meta = {**meta, "section": "Introduction"}
                    chunks.append(Document(content=preamble_plain, meta=chunk_meta))
        # Extract each section: header line through the start of the next header.
        for i, match in enumerate(matches):
            header = match.group(1).strip()
            start = match.start()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(content)
            section_md = content[start:end].strip()
            if section_md:
                section_plain = self._markdown_to_plain(section_md)
                logger.debug("Section '%s': %d words", header, len(section_plain.split()))
                chunk_meta = {**meta, "section": header}
                chunks.append(Document(content=section_plain, meta=chunk_meta))
        logger.info("Split '%s' into %d sections by markdown headers", doc_title, len(chunks))
        return chunks

    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """
        Chunk documents into smaller pieces using markdown-aware splitting.

        Args:
            documents: List of documents to chunk

        Returns:
            List of chunked documents with 'chunk_id' and 'chunk_size'
            (word count) added to each chunk's metadata.
        """
        if not documents:
            logger.warning("No documents to chunk")
            return []
        logger.info("Chunking %d documents with markdown-aware splitting", len(documents))
        # First, split every document by its markdown headers.
        all_chunks = []
        for doc in documents:
            all_chunks.extend(self._split_by_markdown_headers(doc))
        # Tag each chunk with a sequential id and its word count.
        for idx, doc in enumerate(all_chunks):
            if doc.meta is None:
                doc.meta = {}
            doc.meta["chunk_id"] = idx
            doc.meta["chunk_size"] = len(doc.content.split())
        logger.info("Created %d chunks from %d documents", len(all_chunks), len(documents))
        # Log size statistics for observability.
        chunk_sizes = [doc.meta.get("chunk_size", 0) for doc in all_chunks]
        if chunk_sizes:
            avg_size = sum(chunk_sizes) / len(chunk_sizes)
            logger.info(
                "Chunk statistics - Avg: %.1f words, Min: %d, Max: %d",
                avg_size,
                min(chunk_sizes),
                max(chunk_sizes),
            )
        return all_chunks