# RAG/backend/chunk_splitting.py
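"""Split documentation .txt files into chunks for RAG retrieval.

For each supported embedding model, every document is split twice -- once with a
fixed-size character splitter and once with a recursive, token-aware splitter --
and the resulting chunks are written to per-model output directories.
"""
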
import os
from typing import List

from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter


def fixed_size_split(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
    """Split text into chunks of roughly chunk_size characters, merging paragraph-separated pieces."""
    text_splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)


def recursive_character_split(text: str, chunk_size: int, chunk_overlap: int, model_name: str = "gpt-4o") -> List[str]:
    """Split text recursively on paragraph/newline boundaries, measuring chunk_size in tokens for model_name."""
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=model_name,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n"],
        allowed_special={"<|endoftext|>"},
        disallowed_special=(),
    )
    return text_splitter.split_text(text)
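
# Note: fixed_size_split measures chunk_size in characters (the splitter's default
# length function), while recursive_character_split measures it in tokens via the
# tiktoken encoder, so the same chunk_size covers different amounts of text.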


def save_chunks(chunks: List[str], output_dir: str, base_filename: str) -> None:
    """Write each chunk to its own numbered .txt file under output_dir."""
    for i, chunk in enumerate(chunks):
        with open(os.path.join(output_dir, f"{base_filename}_chunk_{i}.txt"), "w", encoding="utf-8") as f:
            f.write(chunk)


def chunk_documents(embedding_model: str) -> None:
    """Chunk every .txt file in the documentation directory for the given embedding model."""
    # Set chunk size and overlap based on embedding model
    if embedding_model == "sentence-transformers/all-MiniLM-L6-v2":
        chunk_size, chunk_overlap = 500, 50
    elif embedding_model == "BAAI/bge-large-en-v1.5":
        chunk_size, chunk_overlap = 1000, 100
    elif embedding_model == "openai/text-embedding-ada-002":
        chunk_size, chunk_overlap = 4096, 200
    else:
        raise ValueError(f"Unsupported embedding model: {embedding_model}")

    # Directory paths: chunk output lives alongside the documentation folder, one directory per model
    input_dir = "/Users/anvereshko/Desktop/rag-gradio-sample-project/gradio_app/documentation"
    model_dir = os.path.join(os.path.dirname(input_dir), embedding_model.replace("/", "_"))
    fixed_output_dir = os.path.join(model_dir, "fixed_chunks")
    recursive_output_dir = os.path.join(model_dir, "recursive_chunks")

    # Create output directories if they don't exist
    os.makedirs(fixed_output_dir, exist_ok=True)
    os.makedirs(recursive_output_dir, exist_ok=True)

    # Process each document
    for filename in os.listdir(input_dir):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(input_dir, filename), "r", encoding="utf-8") as file:
            text = file.read()
        base_filename = os.path.splitext(filename)[0]  # drop .txt so output names don't double the extension

        # Chunk using fixed size
        fixed_chunks = fixed_size_split(text, chunk_size, chunk_overlap)
        save_chunks(fixed_chunks, fixed_output_dir, base_filename)

        # Chunk using recursive character splitting
        recursive_chunks = recursive_character_split(text, chunk_size, chunk_overlap)
        save_chunks(recursive_chunks, recursive_output_dir, base_filename)


if __name__ == "__main__":
    embedding_models = [
        "sentence-transformers/all-MiniLM-L6-v2",
        "BAAI/bge-large-en-v1.5",
        "openai/text-embedding-ada-002",
    ]
    for model in embedding_models:
        chunk_documents(model)