"""Contains a function to split text into smaller chunks."""
from typing import List
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
import sys
from pprint import pprint
# pprint(sys.path)
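# Chunking defaults (chunk size and overlap) come from the project config imported below.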
from llm_system.config import DOC_CHAR_LIMIT, DOC_OVERLAP_NO
from logger import get_logger

log = get_logger(name="utils_splitter")


def split_text(
    documents: List[Document],
    chunk_size: int = DOC_CHAR_LIMIT,
    chunk_overlap: int = DOC_OVERLAP_NO
) -> tuple[bool, List[Document], str]:
"""Splits a list of Document objects into smaller chunks.
Args:
documents (List[Document]): List of Document objects to be split.
chunk_size (int): The maximum size of each chunk.
chunk_overlap (int): The number of characters that overlap between chunks.
Returns:
tuple[bool, List[Document], str]: A tuple containing:
- bool: True if the documents were split successfully, False otherwise.
- List[Document]: A list of Document objects containing the split text.
- str: Message indicating the result of the splitting operation.
"""
    try:
        log.info(f"πŸ”¨ split_text() starting - input: {len(documents)} documents, chunk_size={chunk_size}, overlap={chunk_overlap}")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
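        # The splitter tries its separators in order ("\n\n", "\n", " ", "" by default),
        # so chunks break on paragraph, line, and word boundaries where possible;
        # chunk lengths are measured in characters by default.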
log.info(f"⏳ Executing text_splitter.split_documents()...")
split_docs = text_splitter.split_documents(documents)
log.info(f"βœ… split_documents() completed, got {len(split_docs)} chunks from {len(documents)} documents")
if not split_docs:
log.warning("⚠️ No documents were split. Please check the input documents.")
return True, [], "No documents were split. Please check the input documents."
log.info(f"βœ… Successfully split {len(documents)} documents into {len(split_docs)} chunks. Total content: {sum(len(d.page_content) for d in split_docs)} chars")
return True, split_docs, "Documents split successfully."
    except Exception as e:
        log.error(f"❌ Error splitting documents: {e}")
        log.error(f"Traceback: {traceback.format_exc()}")
        return False, [], f"Error splitting documents: {e}"
if __name__ == "__main__":
# Example usage
example_docs = [
Document(page_content="This is a sample document. " * 10),
Document(page_content="Another document with some text. " * 5),
Document(page_content="Yet another document with different content. " * 3)
]
status, split_documents, message = split_text(example_docs, chunk_size=100, chunk_overlap=10)
for i, doc in enumerate(split_documents):
print(f"Chunk {i+1}: {doc.page_content}")
# Print first 50 characters of each chunk
# print(f"Chunk {i+1}: {doc.page_content[:50]}...")