# RegBotBeta/models/vector_database.py
# Zwea Htet
# added huggingface api
# 66bfc6b
from typing import List
from pinecone import Pinecone, ServerlessSpec
from llama_index.vector_stores.pinecone import PineconeVectorStore
from dotenv import load_dotenv
from llama_index.core import (
SimpleDirectoryReader,
Document,
VectorStoreIndex,
StorageContext,
)
from huggingface_hub import HfFileSystem, HfApi
import os
# Load variables from a .env file (if present) into the environment before
# any credentials are read below.
load_dotenv()

# Pinecone client, authenticated with the PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Name of the Pinecone index this module works with.
# Alternative index name kept for reference: "openai-embeddings"
pc_index_name = "llama-integration-pinecone"

# Listing of the indexes visible to this client, fetched once at this point.
pc_indexes = pc.list_indexes()
# Check if the index already exists
def index_exists(index_name):
    """Return True if ``pc_indexes`` has an entry whose "name" is *index_name*.

    The check runs against the module-level ``pc_indexes`` listing rather
    than querying Pinecone again, so it reflects the indexes that existed
    when that listing was fetched.
    """
    return any(entry["name"] == index_name for entry in pc_indexes)
# Directory (relative path) from which get_pinecone_index reads documents
# and from which the Hub upload at the end of this module takes its file.
SAVE_DIR = "uploaded_files"

# Provision the index when it is not present in the listing yet.
if not index_exists(pc_index_name):
    pc.create_index(
        name=pc_index_name,
        # NOTE(review): 1536 presumably matches the embedding model's
        # vector size -- confirm against the embedding configuration.
        dimension=1536,
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the index, used for all stats lookups and vector storage below.
pinecone_index = pc.Index(pc_index_name)

# Maintenance snippets, disabled on purpose:
# print("Deleting all vectors in the pinecone index: ", pinecone_index.delete(delete_all=True))
# print("Deleting all vectors with the namespace 'calregs_pdf': ", pinecone_index.delete(namespace="calregs_pdf"))
def _namespace_exists(namespace: str):
    """Report whether *namespace* appears in the index statistics.

    Fetches the current stats from ``pinecone_index`` and tests membership
    in their "namespaces" entry.
    """
    stats = pinecone_index.describe_index_stats()
    return namespace in stats["namespaces"]
def get_pinecone_index(filename: str) -> VectorStoreIndex:
    """Return a ``VectorStoreIndex`` for *filename*, backed by Pinecone.

    The Pinecone namespace is derived from *filename* by replacing every
    "." and " " with "_". When that namespace already exists, the index is
    attached to the vector store as-is; otherwise the file is read from
    ``SAVE_DIR`` and a new index is built from its documents through a
    storage context bound to the same vector store.
    """
    # Single pass mapping both "." and " " to "_".
    namespace = filename.translate(str.maketrans(". ", "__"))
    vector_store = PineconeVectorStore(
        pinecone_index=pinecone_index,
        namespace=namespace,
    )

    # Known namespace: reuse what is stored instead of re-reading the file.
    if _namespace_exists(namespace=namespace):
        print(f"Namespace {namespace} exists.")
        return VectorStoreIndex.from_vector_store(vector_store=vector_store)

    # Unknown namespace: load the file and index its documents.
    documents = SimpleDirectoryReader(
        input_files=[f"{SAVE_DIR}/{filename}"]
    ).load_data(show_progress=True)
    context = StorageContext.from_defaults(vector_store=vector_store)
    return VectorStoreIndex.from_documents(
        documents=documents,
        show_progress=True,
        storage_context=context,
    )
# Hugging Face Hub client, authenticated with the HF_TOKEN environment variable.
api = HfApi(token=os.getenv("HF_TOKEN"))

# Upload the calregs PDF to the Hub repository, keeping the same relative
# path inside the repo as on disk.
# NOTE(review): as a module-level statement this appears to run whenever the
# module is imported, and no repo_type is passed -- confirm both are intended.
api.upload_file(
    path_or_fileobj=f"{SAVE_DIR}/calregs.pdf",
    path_in_repo=f"{SAVE_DIR}/calregs.pdf",
    repo_id="hbui/RegBot4.0",
)