# myrag/backend/chunk.py
# Adir Gozlan
# second commit
# a200fe6
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, NLTKTextSplitter, SpacyTextSplitter
# Directory that holds the raw documents to be chunked.
docs_path = "/Users/egozlan/Projects/genai_course/week5/rag-gradio-sample-project/docs"

# Size settings shared by the two character-based splitters below.
_size_settings = {"chunk_size": 256, "chunk_overlap": 20}

# Splits on newline boundaries only.
char_text_splitter = CharacterTextSplitter(separator="\n", **_size_settings)

# Same size settings, but with the library's default separators.
recursive_text_splitter = RecursiveCharacterTextSplitter(**_size_settings)

# Both of these are built with no arguments, i.e. with the library defaults.
nltk_text_splitter = NLTKTextSplitter()
spacy_text_splitter = SpacyTextSplitter()
def split_chunks(splitter, input_dir, output_dir):
    """Split every file in ``input_dir`` into chunks, one output file per chunk.

    Each regular file directly inside ``input_dir`` is read as UTF-8 text and
    handed to ``splitter.create_documents``. Chunk number ``i`` of a file named
    ``<stem>.<ext>`` is written to ``<output_dir>/<stem>_<i>.txt``.
    ``output_dir`` is created when it does not exist yet.

    Args:
        splitter: Object exposing ``create_documents(texts)`` that returns a
            sequence of items carrying a ``page_content`` string.
        input_dir: Directory containing the source documents.
        output_dir: Directory that receives the chunk files.

    Raises:
        FileNotFoundError: If ``input_dir`` does not exist.
        UnicodeDecodeError: If a source file is not valid UTF-8.
    """
    files_split = {}
    # Sorted so that processing order (and therefore which file wins when two
    # names share a stem) does not depend on the filesystem.
    for filename in sorted(os.listdir(input_dir)):
        file_path = os.path.join(input_dir, filename)
        if not os.path.isfile(file_path):
            # listdir also yields sub-directories, which cannot be opened as
            # text; skip them instead of crashing.
            continue
        # Explicit encoding keeps the result independent of the platform
        # default.
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        # splitext removes the real extension, whatever its length; slicing
        # off four characters only worked for names ending in ".txt".
        stem = os.path.splitext(filename)[0]
        files_split[stem] = splitter.create_documents([text])

    os.makedirs(output_dir, exist_ok=True)
    for key, texts in files_split.items():
        for index, text in enumerate(texts):
            file_path = os.path.join(output_dir, f"{key}_{index}.txt")
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(text.page_content)
    print('Chunks written')
# Chunk the same corpus once per splitter, each into its own output directory.
split_chunks(char_text_splitter, docs_path, '/Users/egozlan/Projects/genai_course/week5/rag-gradio-sample-project/char_split_docs')
split_chunks(recursive_text_splitter, docs_path, '/Users/egozlan/Projects/genai_course/week5/rag-gradio-sample-project/recursive_split_docs')
split_chunks(nltk_text_splitter, docs_path, '/Users/egozlan/Projects/genai_course/week5/rag-gradio-sample-project/nltk_split_docs')
# The spaCy directory must be produced by the spaCy splitter; it was previously
# filled with a second run of nltk_text_splitter.
split_chunks(spacy_text_splitter, docs_path, '/Users/egozlan/Projects/genai_course/week5/rag-gradio-sample-project/spacy_split_docs')