from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from typing import List
from langchain_core.documents import Document
import os
from dotenv import load_dotenv

load_dotenv()
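# The .env file loaded above is expected to define the two keys read below
# (the values shown here are placeholders, not real credentials):
#   PINECONE_API_KEY=...
#   OPENAI_API_KEY=...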
# API keys
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Text splitter and embedding function
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", dimensions=1024, api_key=OPENAI_API_KEY)
# Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)
def load_and_split_document(file_path: str) -> List[Document]:
    """Load a PDF or plain-text file and split it into overlapping chunks."""
    if file_path.endswith('.pdf'):
        loader = PyPDFLoader(file_path)
    elif file_path.endswith('.txt'):
        loader = TextLoader(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_path}")
    documents = loader.load()
    return text_splitter.split_documents(documents)
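# Illustrative usage (the path below is hypothetical, not part of this module):
#   chunks = load_and_split_document("docs/example.pdf")
#   print(len(chunks), chunks[0].metadata)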
INDEX_NAME = "smart-research-assistant"

def create_pinecone_vectorstore() -> PineconeVectorStore:
    """Create the serverless index if it does not already exist and wrap it in a LangChain vector store."""
    try:
        if not pc.has_index(INDEX_NAME):
            pc.create_index(
                name=INDEX_NAME,
                dimension=1024,  # must match the embedding dimensions configured above
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1")
            )
        index = pc.Index(INDEX_NAME)
        return PineconeVectorStore(index=index, embedding=embeddings)
    except Exception as e:
        print(f"Index initialization failed: {e}")
        raise

vectorstore = create_pinecone_vectorstore()
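# Illustrative usage (assumed, not part of the original module): the vector store
# can be exposed as a retriever for a RAG chain, e.g.
#   retriever = vectorstore.as_retriever(search_kwargs={"k": 4})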
def index_document_to_pinecone(file_path: str, file_id: int) -> bool:
    """Split a document, tag each chunk with its file_id, and upsert it into Pinecone."""
    try:
        splits = load_and_split_document(file_path)
        # Store file_id as a string so it matches the string filter used in delete_doc_from_pinecone
        for split in splits:
            split.metadata['file_id'] = str(file_id)
        vectorstore.add_documents(splits)
        return True
    except Exception as e:
        print(f"Error indexing document: {e}")
        return False
def delete_doc_from_pinecone(file_id: int) -> bool:
    """Delete every vector whose file_id metadata matches the given id."""
    try:
        index = pc.Index(INDEX_NAME)
        # Serverless indexes do not support delete-by-metadata-filter, so query with a
        # dummy vector to collect the IDs of all vectors tagged with this file_id
        query_result = index.query(
            vector=[0.0] * 1024,
            filter={"file_id": {"$eq": str(file_id)}},
            top_k=10000,
            include_metadata=True
        )
        ids = [match["id"] for match in query_result["matches"]]
        if ids:
            index.delete(ids=ids)
        return True
    except Exception as e:
        print(f"Error deleting from Pinecone: {str(e)}")
        return False
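# Minimal end-to-end sketch (assumed usage, not part of the original module).
# The file path and file_id below are hypothetical placeholders.
if __name__ == "__main__":
    sample_path = "docs/example.pdf"  # hypothetical path
    sample_file_id = 1                # hypothetical id
    if index_document_to_pinecone(sample_path, sample_file_id):
        print("Document indexed")
        # Remove the vectors that were just added
        if delete_doc_from_pinecone(sample_file_id):
            print("Document removed")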