Spaces:
Runtime error
Runtime error
| from typing import Optional, List | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from pydantic import BaseModel | |
class RecursiveCharacterTextChunkerConfig(BaseModel):
    """Settings consumed by ``RecursiveCharacterTextChunker``.

    Both fields are forwarded unchanged to the text splitter that the
    chunker builds for each call.
    """

    # Upper bound on the length of a single chunk.
    chunk_size: int = 500

    # Amount of text repeated between one chunk and the next.
    chunk_overlap: int = 100
class RecursiveCharacterTextChunker:
    """Split text into chunks with langchain's RecursiveCharacterTextSplitter.

    Chunk size and overlap are read from the config object supplied at
    construction time; chunk length is measured with ``len``.
    """

    # Split points used when the caller supplies none, in the order they are
    # handed to the splitter: blank line, newline, space, empty string.
    _DEFAULT_SEPARATORS = ("\n\n", "\n", " ", "")

    def __init__(self, config: "RecursiveCharacterTextChunkerConfig"):
        """Keep a reference to the configuration used by ``chunk_text``.

        Args:
            config: Object providing ``chunk_size`` and ``chunk_overlap``.
        """
        self.config = config

    def chunk_text(self, text: Optional[str], separators: Optional[List[str]] = None) -> List[str]:
        """Chunk a single text string.

        The method takes one value and returns one list, so it can be passed
        directly to ``Series.apply()`` on a pandas text column.

        Args:
            text: The input text to be chunked. ``None``, the empty string
                and float NaN (how pandas represents a missing value in an
                object column) are all treated as "no text".
            separators: Strings to use as split points, matched literally
                (not as regular expressions). When omitted or empty, the
                class defaults (blank line, newline, space, empty string)
                are used.

        Returns:
            The list of chunk strings, or an empty list when there is no
            text to split.
        """
        # NaN is truthy, so ``not text`` alone would let a missing pandas
        # value through to the splitter, which only accepts strings.
        # ``text != text`` holds only for NaN.
        if not text or (isinstance(text, float) and text != text):
            return []

        # The splitter is built per call so that the current values of
        # self.config and the per-call separators are always honoured.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.config.chunk_size,
            chunk_overlap=self.config.chunk_overlap,
            separators=separators or list(self._DEFAULT_SEPARATORS),
            length_function=len,  # measure chunk length in characters
            is_separator_regex=False,
        )
        return splitter.split_text(text)