Demos / backend /classes /chunker /text_chunker.py
nikhile-galileo's picture
Adding finance protect demo
e68d535
raw
history blame
2.08 kB
from typing import Optional, List
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pydantic import BaseModel
class RecursiveCharacterTextChunkerConfig(BaseModel):
    """Sizing options read by ``RecursiveCharacterTextChunker`` when it builds its splitter."""

    # Forwarded to the splitter as ``chunk_size``; the splitter measures
    # length with ``len``, i.e. in characters.
    chunk_size: int = 500

    # Forwarded to the splitter as ``chunk_overlap``.
    chunk_overlap: int = 100
class RecursiveCharacterTextChunker:
    """Split text into chunks using Langchain's RecursiveCharacterTextSplitter.

    Chunk size and overlap are read from ``self.config`` on every call to
    ``chunk_text``, so changes made to the config after construction are
    picked up by later calls.
    """

    # Split points used when the caller supplies no separators:
    # blank line, newline, space, then the empty string.
    DEFAULT_SEPARATORS = ("\n\n", "\n", " ", "")

    def __init__(self, config: "RecursiveCharacterTextChunkerConfig"):
        """Store the config that supplies ``chunk_size`` and ``chunk_overlap``."""
        self.config = config

    def chunk_text(self, text: str, separators: Optional[List[str]] = None) -> List[str]:
        """
        Chunk a single text string.

        The method takes one value and returns one list, so it can be passed
        directly to pandas ``DataFrame.apply()`` / ``Series.apply()``.

        Args:
            text (str): The input text string to be chunked.
            separators (Optional[List[str]]): Strings to use as split points.
                When omitted (or empty), ``DEFAULT_SEPARATORS`` is used.

        Returns:
            List[str]: The chunked text strings. An empty list is returned
            when ``text`` is empty, ``None`` or a float NaN.
        """
        # NaN is truthy, so the emptiness check below would let it through to
        # the splitter. Missing cells in a pandas column commonly arrive as
        # float NaN; treat them like empty input. ``x != x`` holds only for NaN.
        if isinstance(text, float) and text != text:
            return []
        if not text:
            return []

        # A fresh splitter per call keeps the method stateless with respect to
        # both the config values and the caller-supplied separators.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.config.chunk_size,
            chunk_overlap=self.config.chunk_overlap,
            separators=separators or list(self.DEFAULT_SEPARATORS),
            length_function=len,  # measure chunks in characters
            is_separator_regex=False,
        )
        return splitter.split_text(text)