import json
import os

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunks whose length is not strictly greater than this many characters are
# discarded by chunk_text.
_MIN_CHUNK_LENGTH = 80


def chunk_text(
    text: str, chunk_size: int, chunk_overlap: int, respect_delimiters: bool = True
) -> list[str]:
    """
    Splits text into manageable chunks using RecursiveCharacterTextSplitter.

    After splitting, exact duplicate chunks are removed (the first occurrence
    is kept) and chunks of 80 characters or fewer are dropped. The remaining
    chunks are returned in the order the splitter produced them.

    Args:
        text (str): The input text as a string.
        chunk_size (int): Maximum size of each chunk in characters.
        chunk_overlap (int): Number of overlapping characters between
            consecutive chunks.
        respect_delimiters (bool): If True, split on the separators
            "\\n\\n", "\\n" and ". ". If False, no separator list is passed
            to the splitter, which then applies its own defaults.

    Returns:
        list[str]: The de-duplicated, length-filtered text chunks in
        document order.
    """
    # Hierarchical delimiters, passed to the splitter in this order.
    if respect_delimiters:
        delimiters = ["\n\n", "\n", ". "]
    else:
        # None hands the choice of separators back to the splitter.
        delimiters = None

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=delimiters,
    )

    # dict.fromkeys de-duplicates while preserving first-seen order. A set
    # would scramble the chunks, and differently on every run because string
    # hashing is randomized per process.
    unique_chunks = dict.fromkeys(splitter.split_text(text))
    return [chunk for chunk in unique_chunks if len(chunk) > _MIN_CHUNK_LENGTH]


def chunk_file(
    file_path: str, chunk_size: int, chunk_overlap: int, respect_delimiters: bool = True
) -> list[str]:
    """
    Splits the contents of a file into manageable chunks using
    RecursiveCharacterTextSplitter.

    The file is read in full as UTF-8 text and handed to chunk_text, so the
    same de-duplication and minimum-length filtering apply.

    Args:
        file_path (str): Path to the input file.
        chunk_size (int): Maximum size of each chunk in characters.
        chunk_overlap (int): Number of overlapping characters between
            consecutive chunks.
        respect_delimiters (bool): Whether to respect logical delimiters to
            avoid splitting in the middle of words or sentences.

    Returns:
        list[str]: A list of text chunks.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    return chunk_text(text, chunk_size, chunk_overlap, respect_delimiters)


def chunk_files_in_directory(
    input_dir: str = "data/converted",
    output_dir: str = "data/chunked",
    chunk_size: int = 1000,
    chunk_overlap: int = 100,
    respect_delimiters: bool = True,
):
    """
    Splits the contents of all ".txt" files in a directory into manageable
    chunks using RecursiveCharacterTextSplitter.

    For every "<name>.txt" directly inside input_dir, the chunks are written
    as a JSON array of strings to "<name>.json" in output_dir. Entries that
    do not end in ".txt" are ignored, and output_dir is created if it does
    not exist.

    Args:
        input_dir (str): Directory containing input files.
        output_dir (str): Directory to save the chunked files.
        chunk_size (int): Maximum size of each chunk in characters.
        chunk_overlap (int): Number of overlapping characters between
            consecutive chunks.
        respect_delimiters (bool): Whether to respect logical delimiters to
            avoid splitting in the middle of words or sentences.
    """
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.endswith(".txt"):
            continue

        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(
            output_dir, f"{os.path.splitext(filename)[0]}.json"
        )

        chunks = chunk_file(input_path, chunk_size, chunk_overlap, respect_delimiters)

        with open(output_path, "w", encoding="utf-8") as file:
            json.dump(chunks, file, indent=4)