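"""Build or load a FAISS vector index over the PDFs in ./dataset.

On the first run, every PDF is split into overlapping chunks and embedded
into a FAISS index saved under ./faiss_index; later runs reload the saved
index and answer a sample retrieval query.
"""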
import os

from tqdm import tqdm

from logger import logger

from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader


# Embedding model used both to build the index and to embed queries.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

if not os.path.exists("faiss_index"):
    # First run: split every PDF in ./dataset into overlapping chunks,
    # collecting the chunks from all files before indexing.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    all_chunks = []

    files = os.listdir("dataset")
    for file in tqdm(files):
        try:
            path = os.path.join("dataset", file)
            documents = PyPDFLoader(path).load()
            all_chunks.extend(splitter.split_documents(documents))
            logger.info(f"Success for file: {file}")
        except Exception as e:
            logger.error(f"Error {e} for file: {file}")

    # Build and persist the index once, after all files are processed;
    # creating it inside the loop would overwrite it with each file's chunks.
    vector_db = FAISS.from_documents(all_chunks, embedding=embeddings)
    vector_db.save_local("faiss_index")
else:
    # Subsequent runs: reload the saved index instead of re-embedding.
    vector_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
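
# The retriever wraps similarity search over the index; LangChain's
# vector-store retriever returns the top 4 most similar chunks by default,
# and search_kwargs (e.g. {"k": 8}) can change that.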
retriever = vector_db.as_retriever()

# Smoke-test the index with a sample query.
logger.info(retriever.invoke("what is machine learning"))
print("#" * 90)