# NOTE(review): removed leading "Spaces: / Sleeping / Sleeping" lines — hosting-UI /
# extraction residue, not part of the module (they would be a SyntaxError at import time).
# file: chunking.py | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_core.documents import Document | |
from typing import List | |
from unstructured.partition.md import partition_md | |
from unstructured.documents.elements import Header, Footer, PageBreak, Table, NarrativeText | |
# --- Configuration ---
# Target maximum size, in characters, of each narrative-text chunk
# produced by the RecursiveCharacterTextSplitter below.
CHUNK_SIZE = 1000
# Characters shared between consecutive chunks so context is not lost
# at chunk boundaries during retrieval.
CHUNK_OVERLAP = 200
def process_and_chunk(raw_text: str) -> List[Document]:
    """
    Partition raw markdown text with 'unstructured' and convert the result
    into LangChain Documents.

    Tables are kept intact as single HTML documents (splitting them would
    break row/column structure); narrative text is split into overlapping
    chunks; all other content elements pass through as-is. Layout-only
    elements (headers, footers, page breaks) are dropped.

    Args:
        raw_text: The raw string content of the document (expected to be markdown).

    Returns:
        A list of Document objects, including structured tables and chunked text.
        Empty list if raw_text is empty.
    """
    if not raw_text:
        print("Warning: Input text for chunking is empty.")
        return []

    print(f"Processing raw text of length {len(raw_text)} with 'unstructured' markdown parser.")
    elements = partition_md(text=raw_text)

    documents: List[Document] = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        is_separator_regex=False,
    )

    for element in elements:
        # Skip purely structural/layout elements that carry no content.
        if isinstance(element, (Header, Footer, PageBreak)):
            continue

        # Use isinstance against the imported element classes instead of the
        # original fragile `"...elements.Table" in str(type(element))` check;
        # isinstance also covers subclasses, matching the substring behavior.
        if isinstance(element, Table):
            # text_as_html may be None/absent (e.g. for markdown-parsed tables);
            # fall back to the plain-text rendering rather than storing None.
            table_html = getattr(element.metadata, "text_as_html", None) or element.text
            table_metadata = element.metadata.to_dict()
            table_metadata['content_type'] = 'table'
            documents.append(Document(page_content=table_html, metadata=table_metadata))
        elif isinstance(element, NarrativeText):
            # Narrative prose: split into overlapping chunks for retrieval.
            # Each chunk gets its own copy of the element metadata.
            for chunk in text_splitter.split_text(element.text):
                chunk_metadata = element.metadata.to_dict()
                chunk_metadata['content_type'] = 'text'
                documents.append(Document(page_content=chunk, metadata=chunk_metadata))
        else:
            # Titles, list items, etc.: pass through unmodified.
            general_metadata = element.metadata.to_dict()
            general_metadata['content_type'] = 'other'
            documents.append(Document(page_content=element.text, metadata=general_metadata))

    print(f"Created {len(documents)} documents (chunks and tables).")
    return documents