smart-research-assistant / backend /pinecone_utilis.py
umar-100's picture
minor bug fixes
b2926b5
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from typing import List
from langchain_core.documents import Document
import os
from dotenv import load_dotenv
# Load variables from a .env file (if any) before the keys below are read.
load_dotenv()

# Service credentials; each one is None when its variable is not set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Splitter: chunks of up to 1000 units (measured with len) overlapping by 200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

# Embedding model, requested with 1024 output dimensions.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    dimensions=1024,
    api_key=OPENAI_API_KEY,
)

# Pinecone client shared by the index helpers in this module.
pc = Pinecone(api_key=PINECONE_API_KEY)
def load_and_split_document(file_path: str) -> List[Document]:
    """Load a PDF or plain-text file and split it into chunks.

    The loader is chosen from the file extension. The comparison is
    case-insensitive, so names such as ``REPORT.PDF`` or ``notes.TXT`` are
    accepted as well as their lowercase forms.

    Args:
        file_path: Path to a ``.pdf`` or ``.txt`` file.

    Returns:
        The loaded documents after splitting with the module-level
        ``text_splitter``.

    Raises:
        ValueError: If the path ends in neither ``.pdf`` nor ``.txt``.
    """
    # Compare against a lowercased copy only; the loader still receives the
    # path exactly as the caller supplied it.
    lowered = file_path.lower()
    if lowered.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif lowered.endswith(".txt"):
        loader = TextLoader(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_path}")

    documents = loader.load()
    return text_splitter.split_documents(documents)
# Name of the Pinecone index that this module creates, opens and deletes from.
INDEX_NAME = "smart-research-assistant"
def create_pinecone_vectorstore() -> PineconeVectorStore:
    """Return a vector store bound to the ``INDEX_NAME`` Pinecone index.

    When ``pc.has_index`` reports that the index is missing, it is created
    first as a serverless index (AWS, us-east-1) with 1024 dimensions and the
    cosine metric.

    Returns:
        A ``PineconeVectorStore`` wrapping the index and the module-level
        ``embeddings`` object.

    Raises:
        Exception: Any error raised while checking, creating or opening the
            index is printed and then re-raised unchanged.
    """
    try:
        index_missing = not pc.has_index(INDEX_NAME)
        if index_missing:
            serverless = ServerlessSpec(cloud="aws", region="us-east-1")
            pc.create_index(
                name=INDEX_NAME,
                dimension=1024,
                metric="cosine",
                spec=serverless,
            )
        store = PineconeVectorStore(
            index=pc.Index(INDEX_NAME),
            embedding=embeddings,
        )
    except Exception as err:
        print(f"Index initialization failed: {err}")
        raise
    else:
        return store
# Built once when the module is imported; index_document_to_pinecone adds
# documents through this object.
vectorstore=create_pinecone_vectorstore()
def index_document_to_pinecone(file_path: str, file_id: int) -> bool:
    """Split a file into chunks, tag them with ``file_id`` and store them.

    Args:
        file_path: Path handed to ``load_and_split_document``.
        file_id: Value written to each chunk's ``file_id`` metadata field.

    Returns:
        True when the chunks were added to the vector store, False when any
        step raised an exception (the error is printed, not propagated).
    """
    try:
        chunks = load_and_split_document(file_path)
        # Tag every chunk with the id of the file it came from.
        for chunk in chunks:
            chunk.metadata["file_id"] = file_id
        vectorstore.add_documents(chunks)
    except Exception as err:
        print(f"Error indexing document: {err}")
        return False
    else:
        return True
def delete_doc_from_pinecone(file_id: int):
    """Delete the vectors stored for ``file_id`` from the Pinecone index.

    The matching vector IDs are found with a metadata-filtered query and are
    then deleted by ID in batches.

    Args:
        file_id: Identifier written to the ``file_id`` metadata field when the
            document was indexed.

    Returns:
        True if the lookup and deletion completed (including when nothing
        matched), False if any step raised an exception; the error is printed
        rather than propagated.
    """
    # Pinecone metadata filters are type-sensitive: a number does not match
    # its string form. The indexing code stores file_id as passed (an int per
    # the signature), so match the raw value as well as the string form that
    # this function filtered on previously.
    candidates = [file_id]
    if str(file_id) != file_id:
        candidates.append(str(file_id))
    clauses = [{"file_id": {"$eq": value}} for value in candidates]
    metadata_filter = clauses[0] if len(clauses) == 1 else {"$or": clauses}

    # Pinecone documents a cap of 1000 IDs per delete-by-ID request.
    delete_batch_size = 1000

    try:
        index = pc.Index(INDEX_NAME)
        # The zero vector is only a placeholder: the metadata filter, not
        # similarity, selects the matches. Metadata is not requested because
        # only the IDs are used.
        # NOTE: one query returns at most top_k matches, so a file with more
        # than 10000 chunks is only partially removed by a single call.
        query_result = index.query(
            vector=[0.0] * 1024,
            filter=metadata_filter,
            top_k=10000,
        )
        ids = [match["id"] for match in query_result["matches"]]
        for start in range(0, len(ids), delete_batch_size):
            index.delete(ids=ids[start:start + delete_batch_size])
        return True
    except Exception as e:
        print(f"Error deleting from Pinecone: {str(e)}")
        return False