from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Directory that is scanned for source PDF files.
DATA_PATH = "data/"
# Directory the FAISS index is written to.
DB_FAISS_PATH = "vectorstores/db_faiss"

# Model path (not loaded by this script; kept for reference):
# https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/blob/main/llama-2-7b-chat.ggmlv3.q8_0.bin


# Create vector database
def create_vector_db(data_path: str = DATA_PATH, db_path: str = DB_FAISS_PATH) -> None:
    """Build a FAISS index from the PDFs in *data_path* and save it to *db_path*.

    The PDFs are loaded with ``PyPDFLoader``, split into 500-character chunks
    with a 50-character overlap, embedded on the CPU with
    ``sentence-transformers/all-MiniLM-L6-v2`` and stored via
    ``FAISS.save_local``.

    Args:
        data_path: Directory searched for files matching ``*.pdf``.
            Defaults to ``DATA_PATH``.
        db_path: Destination passed to ``FAISS.save_local``.
            Defaults to ``DB_FAISS_PATH``.

    Raises:
        ValueError: If no documents were loaded or splitting produced no
            chunks, so there is nothing to index.
    """
    loader = DirectoryLoader(
        data_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    )
    documents = loader.load()
    if not documents:
        # Fail early with an actionable message instead of letting the
        # vector store choke on an empty input further down.
        raise ValueError(f"No PDF documents found in {data_path!r}")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
    )
    texts = text_splitter.split_documents(documents)
    if not texts:
        # E.g. PDFs with no extractable text yield no chunks to embed.
        raise ValueError(f"No text could be extracted from the PDFs in {data_path!r}")

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cpu"},
    )

    db = FAISS.from_documents(texts, embeddings)
    db.save_local(db_path)


if __name__ == "__main__":
    create_vector_db()