policy-analysis / utils /loading_embeddings.py
kaburia's picture
redesigned modules
ef26a79
# Loading embeddings from storage
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Project data directory: a "data" folder located under the parent of the
# current working directory (resolved once, at import time).
data_path = str(Path.cwd().parent / "data")

# Folder inside the data directory intended to hold the local FAISS index.
# NOTE(review): the functions in this module take their own ``local_folder``
# parameter (default "faiss_index"), which shadows this module-level value --
# confirm that callers pass this path explicitly when they want it used.
local_folder = os.path.join(data_path, "faiss_index")
def download_faiss_index(repo_id="kaburia/epic-a-embeddings", local_folder="faiss_index"):
    """Make sure the FAISS index files exist in ``local_folder``.

    Creates ``local_folder`` if needed, then for each of ``index.faiss`` and
    ``index.pkl`` (in that order) downloads the file from the Hugging Face
    dataset repository ``repo_id`` when it is not already present locally.
    Files that already exist are left untouched and are not re-downloaded.

    Args:
        repo_id: Hugging Face dataset repository to download from.
        local_folder: Directory in which the index files are stored.

    Returns:
        None.
    """
    os.makedirs(local_folder, exist_ok=True)

    # Each required file paired with the progress message printed before
    # it is fetched.
    required_files = (
        ("index.faiss", "Downloading index.faiss from Hugging Face Dataset..."),
        ("index.pkl", "Downloading index.pkl from Hugging Face Dataset..."),
    )

    for file_name, notice in required_files:
        if os.path.exists(os.path.join(local_folder, file_name)):
            # Already available locally; nothing to do for this file.
            continue
        print(notice)
        # NOTE(review): ``local_dir_use_symlinks`` is reported as deprecated
        # in recent huggingface_hub releases -- confirm against the pinned
        # version before removing it.
        hf_hub_download(
            repo_id=repo_id,
            filename=file_name,
            repo_type="dataset",
            local_dir=local_folder,
            local_dir_use_symlinks=False,
        )
def load_vectorstore(index_path="faiss_index"):
    """Load a FAISS vector store from a local index folder.

    Builds a ``HuggingFaceEmbeddings`` instance for the
    ``sentence-transformers/all-MiniLM-L6-v2`` model and passes it to
    ``FAISS.load_local`` together with ``index_path``.

    Args:
        index_path: Folder containing the saved FAISS index.

    Returns:
        The object returned by ``FAISS.load_local``.
    """
    encoder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    # SECURITY: allow_dangerous_deserialization=True opts in to loading
    # pickled data from the index folder. Unpickling can execute arbitrary
    # code, so only point this at index files from a source you trust.
    return FAISS.load_local(
        index_path,
        embeddings=encoder,
        allow_dangerous_deserialization=True,
    )
def get_vectorstore(repo_id="kaburia/epic-a-embeddings", local_folder="faiss_index"):
    """Download the FAISS index if needed, then load it as a vector store.

    Calls ``download_faiss_index`` with the given repository and folder, and
    afterwards loads the vector store from that same folder via
    ``load_vectorstore``.

    Args:
        repo_id: Hugging Face dataset repository holding the index files.
        local_folder: Directory used both as download target and load path.

    Returns:
        Whatever ``load_vectorstore`` returns for ``local_folder``.
    """
    # Step 1: ensure the index files are present on disk.
    download_faiss_index(local_folder=local_folder, repo_id=repo_id)
    # Step 2: load the vector store from the folder just populated.
    vectorstore = load_vectorstore(index_path=local_folder)
    return vectorstore