"""Contains a function to split text into smaller chunks."""
from typing import List
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
import sys
from pprint import pprint
# pprint(sys.path)
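# Chunking defaults (chunk size and overlap) come from the project config imported below.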
from llm_system.config import DOC_CHAR_LIMIT, DOC_OVERLAP_NO
from logger import get_logger

log = get_logger(name="utils_splitter")


def split_text(
    documents: List[Document],
    chunk_size: int = DOC_CHAR_LIMIT,
    chunk_overlap: int = DOC_OVERLAP_NO
) -> tuple[bool, List[Document], str]:
"""Splits a list of Document objects into smaller chunks.
Args:
documents (List[Document]): List of Document objects to be split.
chunk_size (int): The maximum size of each chunk.
chunk_overlap (int): The number of characters that overlap between chunks.
Returns:
tuple[bool, List[Document], str]: A tuple containing:
- bool: True if the documents were split successfully, False otherwise.
- List[Document]: A list of Document objects containing the split text.
- str: Message indicating the result of the splitting operation.
"""
    try:
        log.info(f"πŸ”¨ split_text() starting - input: {len(documents)} documents, chunk_size={chunk_size}, overlap={chunk_overlap}")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
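        # The splitter tries its separators in order ("\n\n", "\n", " ", "" by default),
        # so chunks break on paragraph, line, and word boundaries where possible;
        # chunk lengths are measured in characters by default.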
log.info(f"⏳ Executing text_splitter.split_documents()...")
split_docs = text_splitter.split_documents(documents)
log.info(f"βœ… split_documents() completed, got {len(split_docs)} chunks from {len(documents)} documents")
if not split_docs:
log.warning("⚠️ No documents were split. Please check the input documents.")
return True, [], "No documents were split. Please check the input documents."
log.info(f"βœ… Successfully split {len(documents)} documents into {len(split_docs)} chunks. Total content: {sum(len(d.page_content) for d in split_docs)} chars")
return True, split_docs, "Documents split successfully."
    except Exception as e:
        log.error(f"❌ Error splitting documents: {e}")
        log.error(f"Traceback: {traceback.format_exc()}")
        return False, [], f"Error splitting documents: {e}"
if __name__ == "__main__":
# Example usage
example_docs = [
Document(page_content="This is a sample document. " * 10),
Document(page_content="Another document with some text. " * 5),
Document(page_content="Yet another document with different content. " * 3)
]
status, split_documents, message = split_text(example_docs, chunk_size=100, chunk_overlap=10)
for i, doc in enumerate(split_documents):
print(f"Chunk {i+1}: {doc.page_content}")
# Print first 50 characters of each chunk
# print(f"Chunk {i+1}: {doc.page_content[:50]}...")