File size: 1,270 Bytes
b676995 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
import os.path
from tqdm import tqdm
from logger import logger
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
# Build (or load) a FAISS vector index over the PDF files in "dataset", then
# run a sample retrieval query against it and log the result.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

if not os.path.exists("faiss_index"):
    # The splitter configuration is identical for every file, so build it once.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    # Collect the chunks of *all* files first. Creating and saving a fresh
    # index inside the loop would overwrite "faiss_index" on every iteration,
    # leaving only the last file's chunks in the index.
    all_chunks = []
    for file in tqdm(os.listdir("dataset")):
        try:
            path = os.path.join("dataset", file)
            documents = PyPDFLoader(path).load()
            all_chunks.extend(splitter.split_documents(documents))
            logger.info(f"Success for file :{file}")
        except Exception as e:
            # Best-effort ingestion: a file that cannot be loaded or split is
            # logged and skipped so the remaining files are still indexed.
            logger.error(f"Error {e} for file :{file}")

    if not all_chunks:
        # Without any chunks there is nothing to index; fail with a clear
        # message instead of a NameError on `vector_db` further down.
        raise RuntimeError(
            "No documents could be loaded from 'dataset'; cannot build the FAISS index"
        )

    vector_db = FAISS.from_documents(all_chunks, embedding=embeddings)
    vector_db.save_local("faiss_index")
else:
    # NOTE(security): allow_dangerous_deserialization=True makes load_local
    # unpickle data from disk. Only acceptable while "faiss_index" is produced
    # by this script itself; never point it at an index from an untrusted source.
    vector_db = FAISS.load_local(
        "faiss_index", embeddings, allow_dangerous_deserialization=True
    )

retriever = vector_db.as_retriever()
logger.info(retriever.invoke("what is machine learning"))
print("#" * 90)