MoSPI / helper /create_embeddings.py
akshansh36's picture
Update helper/create_embeddings.py
70288fd verified
import pinecone
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
import uuid
from pymongo import MongoClient
from dotenv import load_dotenv
# Pull settings from a local .env file into the process environment.
load_dotenv()

# Credentials and resource names, all read from environment variables.
FLASH_API = os.environ.get("FLASH_API")
PINECONE_API = os.environ.get("PINECONE_API")
PINECONE_INDEX = os.environ.get("PINECONE_INDEX")
MONGO_URI = os.environ.get("MONGO_URI")
DB_NAME = os.environ.get("DB_NAME")
COLLECTION_NAME = os.environ.get("COLLECTION_NAME")

# Google Generative AI embedding client, authenticated with FLASH_API.
google_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=FLASH_API,
)

# Pinecone client instance.
pc = pinecone.Pinecone(api_key=PINECONE_API)

# MongoDB client, database and collection handles.
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

# Handle to the named Pinecone index (it must already exist in Pinecone).
index = pc.Index(PINECONE_INDEX)
def create_embedding(object_url, tags, categories):
    """Embed a stored document's description and upsert it into Pinecone.

    Looks up the MongoDB document whose ``object_url`` field matches, embeds
    its ``description``, upserts the resulting vector (plus metadata) into
    the Pinecone index, and records the outcome on the MongoDB document.

    Args:
        object_url: Value of the ``object_url`` field identifying the document.
        tags: Iterable of strings, stored comma-joined under ``tags``.
        categories: Iterable of strings, stored comma-joined under
            ``categories``.

    Returns:
        True when the vector was upserted and the document was updated with
        its Pinecone ID; False when no document matched or any step failed.
    """
    # Bound before the try so the except handler can tell whether a document
    # was actually loaded (find_one may raise, or may return None).
    document = None
    try:
        document = collection.find_one({'object_url': object_url})
        if document is None:
            # Nothing to embed and nothing to flag as failed.
            print(f"Error occurred: no document found for {object_url}")
            return False

        content = document.get("description")
        file_type = document.get("type")
        # Convert ObjectId to string for storage in metadata
        mongo_id = str(document.get('_id'))

        # Generate the embedding
        embedding = google_embeddings.embed_query(content)

        # Generate a unique ID for Pinecone
        pinecone_id = str(uuid.uuid4())

        # Prepare the vector with metadata
        vectors = [{
            'id': pinecone_id,
            'values': embedding,
            'metadata': {
                'description': content,
                "url": object_url,
                "tag": file_type,
                "mongo_id": mongo_id,  # Store MongoDB ID in metadata
                "tags": ','.join(tags),
                "categories": ','.join(categories),
            },
        }]

        # Upsert the vector to Pinecone
        index.upsert(vectors)
        print(f"Inserted: {object_url} in Pinecone with MongoDB ID in metadata")

        # Update MongoDB document with Pinecone ID and success status
        collection.update_one(
            {"_id": document["_id"]},
            {"$set": {
                "pinecone_id": pinecone_id,
                "successfully_embedding_created": True,
            }},
        )
        return True
    except Exception as e:
        print(f"Error occurred: {e}")
        # Update MongoDB document with failure status, but only when a
        # document was loaded; otherwise there is no _id to address and the
        # handler itself would raise.
        if document is not None:
            collection.update_one(
                {"_id": document["_id"]},
                {"$set": {
                    "successfully_embedding_created": False,
                }},
            )
        return False