from pathlib import Path
from typing import List

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings

import configs

# Embedding client shared by every store operation in this module; it is
# constructed once, when the module is imported.
embeddings_model = OpenAIEmbeddings()


def process_documents(doc_storage_path: str) -> Chroma:
    """Split the ``*.txt`` files of a directory into chunks and index them.

    Each file matched by the non-recursive ``*.txt`` glob is loaded, split
    with the chunk size and overlap taken from ``configs``, and written to
    the Chroma store located at ``configs.STORE_FILE``.

    Args:
        doc_storage_path: Directory that holds the text files to index.

    Returns:
        The Chroma store handle after the last indexed file, or the handle
        opened on the existing store when no file produced any chunk.
    """
    print("doc preprocessing...")
    doc_directory = Path(doc_storage_path)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=configs.CHUNK_SIZE, chunk_overlap=configs.CHUNK_OVERLAP
    )
    # Opened up front so there is always a store to persist and return,
    # even when the directory contains nothing to index.
    doc_search = Chroma(
        persist_directory=configs.STORE_FILE, embedding_function=embeddings_model
    )

    for file_path in doc_directory.glob("*.txt"):
        chunks: List[Document] = text_splitter.split_documents(
            TextLoader(str(file_path)).load()
        )
        if not chunks:
            # Nothing to index for this file; skipping also avoids handing
            # the store an empty batch, which it may reject.
            continue
        # from_documents is a classmethod, so it is called on the class;
        # it returns a handle on the same persist directory.
        doc_search = Chroma.from_documents(
            chunks, embeddings_model, persist_directory=configs.STORE_FILE
        )

    doc_search.persist()
    print("doc preprocessing end.")
    return doc_search


def format_docs(docs: List[Document]) -> str:
    """Join the ``page_content`` of each document, separated by a blank line."""
    return "\n\n".join(d.page_content for d in docs)