import os.path

from tqdm import tqdm
from logger import logger
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

if not os.path.exists("faiss_index"):
    # Build the index once: accumulate chunks from every PDF in "dataset"
    # into a single FAISS store, then persist it to disk.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    vector_db = None
    for file in tqdm(os.listdir("dataset")):
        try:
            path = os.path.join("dataset", file)
            documents = PyPDFLoader(path).load()
            splitted_docs = splitter.split_documents(documents)
            if vector_db is None:
                # First file seeds the store; later files are appended so the
                # saved index covers the whole dataset, not just the last file.
                vector_db = FAISS.from_documents(splitted_docs, embedding=embeddings)
            else:
                vector_db.add_documents(splitted_docs)
            logger.info(f"Success for file: {file}")
        except Exception as e:
            logger.error(f"Error {e} for file: {file}")
    if vector_db is not None:
        vector_db.save_local("faiss_index")
else:
    # Index already exists: load it and run a sample retrieval query.
    vector_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    retriever = vector_db.as_retriever()
    logger.info(retriever.invoke("what is machine learning"))
    print("#" * 90)
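
# A minimal retrieval sketch, kept commented out so it does not change the
# script's behavior. It assumes the "faiss_index" directory already exists on
# disk; the query string and k value are illustrative, not part of the script.
# similarity_search_with_score returns (Document, distance) pairs, which is
# handy when tuning chunk_size/chunk_overlap above.
#
# db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
# for doc, score in db.similarity_search_with_score("what is machine learning", k=3):
#     logger.info(f"{score:.3f} :: {doc.page_content[:100]}")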