Spaces:
Running
Running
import os
import time

from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel
# Names of the MongoDB database and collection that the helpers in this
# module operate on.
database_name: str = "airbnb_dataset"
collection_name: str = "listings_reviews"
def get_mongo_client(mongo_uri):
    """Build and return a ``MongoClient`` for the given connection string.

    Args:
        mongo_uri: MongoDB connection string passed straight to ``MongoClient``.

    Returns:
        The newly constructed ``MongoClient``.

    Note:
        The success message is printed as soon as the client object has been
        constructed; this function issues no command to confirm that the
        server is actually reachable.
    """
    # Application name reported to the server for this client.
    app_name = "devrel.deeplearningai.lesson1.python"

    # The client is the entry point for all interaction with the cluster.
    mongo_client = MongoClient(mongo_uri, appname=app_name)
    print("Connection to MongoDB successful")
    return mongo_client
# Connection string for the MongoDB cluster. It is read from the environment
# so that the name is always defined (None when the variable is unset) and
# credentials stay out of the source file.
MONGO_URI = os.environ.get("MONGO_URI")
if not MONGO_URI:
    # Only warn here: the helpers below are merely defined at import time,
    # so a missing URI surfaces when one of them is actually called.
    print("MONGO_URI not set in environment variables")
def create_db():
    """Connect to the target collection and delete every document in it.

    Opens a client with ``get_mongo_client(MONGO_URI)``, resolves
    ``database_name`` / ``collection_name`` to a collection handle, and
    empties that collection.

    Returns:
        The PyMongo collection handle that was emptied.

    Side effects:
        Rebinds the module-level name ``collection`` to the same handle.
    """
    # ingest_data() and create_vector_search_index() look up a module-level
    # ``collection``. Binding the handle only to a local (as before) left that
    # name undefined for them, so publish it at module scope as well.
    global collection

    mongo_client = get_mongo_client(MONGO_URI)

    # PyMongo handles for the database and the collection
    db = mongo_client.get_database(database_name)
    collection = db.get_collection(collection_name)

    # Delete any existing records so a re-run starts from an empty collection
    collection.delete_many({})

    return collection
def ingest_data(documents=None):
    """Bulk-insert documents into the module-level ``collection``.

    Args:
        documents: The documents to insert, passed to ``insert_many``.
            When omitted, the module-level ``listings`` is used, which is
            what this function always did before the parameter existed.

    Note:
        ``collection`` (and ``listings`` when ``documents`` is omitted) must
        already be bound at module level before this function is called.
    """
    if documents is None:
        # Backward-compatible default: fall back to the module-level data set.
        documents = listings

    # The ingestion process might take a few minutes
    collection.insert_many(documents)
    print("Data ingestion into MongoDB completed")
def create_vector_search_index():
    """Create the vector search index on the text embeddings if it is missing.

    Builds a ``SearchIndexModel`` named ``vector_index_text`` that indexes the
    ``text_embeddings`` field of the module-level ``collection`` as a
    1536-dimensional ``knnVector`` with cosine similarity, then creates it
    unless a search index with that name already exists.

    Any exception raised while looking up or creating the index is caught
    and reported on stdout; nothing is re-raised and nothing is returned.
    """
    # NOTE: This dataset contains text and image embeddings, but this lesson
    # only uses the text embeddings.
    # The field containing the text embeddings on each document
    text_embedding_field_name = "text_embeddings"
    # MongoDB Atlas Vector Search index name
    vector_search_index_name_text = "vector_index_text"

    vector_search_index_model = SearchIndexModel(
        definition={
            # describes how fields in the documents are indexed and stored
            "mappings": {
                # automatically index new fields that appear in the document
                "dynamic": True,
                # properties of the fields that will be indexed
                "fields": {
                    text_embedding_field_name: {
                        "dimensions": 1536,  # size of the vector
                        "similarity": "cosine",  # vector similarity measure
                        "type": "knnVector",
                    }
                },
            }
        },
        name=vector_search_index_name_text,  # identifier for the index
    )

    try:
        # Search indexes are not returned by list_indexes(), which only covers
        # regular indexes, so the previous check could never find an existing
        # one. Ask for search indexes, filtered by name.
        existing_indexes = list(
            collection.list_search_indexes(vector_search_index_name_text)
        )
        if existing_indexes:
            print(f"Index '{vector_search_index_name_text}' already exists.")
            return

        print("Creating index...")
        result = collection.create_search_index(model=vector_search_index_model)
        # Pause for 20 seconds to give the new index time for its initial
        # sync before anything tries to use it.
        time.sleep(20)
        print("Index created successfully:", result)
        print("Wait a few minutes before conducting search with index to ensure index intialization")
    except Exception as e:
        # Best-effort by design: report the failure instead of propagating it.
        print(f"Error creating vector search index: {str(e)}")