# NOTE(review): removed leading "Spaces: / Sleeping / Sleeping" lines — hosting-UI /
# extraction residue, not part of the module (they would be a SyntaxError at import time).
# file: chunking.py | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_core.documents import Document | |
from typing import List | |
from unstructured.partition.md import partition_md | |
from unstructured.documents.elements import Header, Footer, PageBreak, Table, NarrativeText | |
# --- Configuration ---
# Target maximum size, in characters, of each narrative-text chunk
# produced by the RecursiveCharacterTextSplitter below.
CHUNK_SIZE = 1000
# Characters shared between consecutive chunks so context is not lost
# at chunk boundaries during retrieval.
CHUNK_OVERLAP = 200
def process_and_chunk(raw_text: str) -> List[Document]:
    """
    Partition raw markdown text with 'unstructured' and convert the result
    into LangChain Documents.

    Tables are kept intact as single HTML documents (splitting them would
    break row/column structure); narrative text is split into overlapping
    chunks; all other content elements pass through as-is. Layout-only
    elements (headers, footers, page breaks) are dropped.

    Args:
        raw_text: The raw string content of the document (expected to be markdown).

    Returns:
        A list of Document objects, including structured tables and chunked text.
        Empty list if raw_text is empty.
    """
    if not raw_text:
        print("Warning: Input text for chunking is empty.")
        return []

    print(f"Processing raw text of length {len(raw_text)} with 'unstructured' markdown parser.")
    elements = partition_md(text=raw_text)

    documents: List[Document] = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        is_separator_regex=False,
    )

    for element in elements:
        # Skip purely structural/layout elements that carry no content.
        if isinstance(element, (Header, Footer, PageBreak)):
            continue

        # Use isinstance against the imported element classes instead of the
        # original fragile `"...elements.Table" in str(type(element))` check;
        # isinstance also covers subclasses, matching the substring behavior.
        if isinstance(element, Table):
            # text_as_html may be None/absent (e.g. for markdown-parsed tables);
            # fall back to the plain-text rendering rather than storing None.
            table_html = getattr(element.metadata, "text_as_html", None) or element.text
            table_metadata = element.metadata.to_dict()
            table_metadata['content_type'] = 'table'
            documents.append(Document(page_content=table_html, metadata=table_metadata))
        elif isinstance(element, NarrativeText):
            # Narrative prose: split into overlapping chunks for retrieval.
            # Each chunk gets its own copy of the element metadata.
            for chunk in text_splitter.split_text(element.text):
                chunk_metadata = element.metadata.to_dict()
                chunk_metadata['content_type'] = 'text'
                documents.append(Document(page_content=chunk, metadata=chunk_metadata))
        else:
            # Titles, list items, etc.: pass through unmodified.
            general_metadata = element.metadata.to_dict()
            general_metadata['content_type'] = 'other'
            documents.append(Document(page_content=element.text, metadata=general_metadata))

    print(f"Created {len(documents)} documents (chunks and tables).")
    return documents