Spaces:

bstraehle
/

advanced-rag

Running

App Files Files Community

bstraehle committed on Jul 28, 2024

Commit

1ebe7e2

verified ·

1 Parent(s): 772ea58

Create mongodb.py

Browse files

Files changed (1) hide show

mongodb.py +77 -0

mongodb.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import os
+import time
+from pymongo.mongo_client import MongoClient
+from pymongo.operations import SearchIndexModel
+# Name of the database that create_db() opens on the cluster.
+database_name = "airbnb_dataset"
+# Name of the collection that create_db() opens inside that database.
+collection_name = "listings_reviews"
+def get_mongo_client(mongo_uri):
+    """Establish connection to the MongoDB."""
+    # gateway to interacting with a MongoDB database cluster
+    client = MongoClient(mongo_uri, appname="devrel.deeplearningai.lesson1.python")
+    print("Connection to MongoDB successful")
+    return client
+if not MONGO_URI:
+    print("MONGO_URI not set in environment variables")
+def create_db():
+    mongo_client = get_mongo_client(MONGO_URI)
+    # Pymongo client of database and collection
+    db = mongo_client.get_database(database_name)
+    collection = db.get_collection(collection_name)
+    # Delete any existing records in the collection
+    collection.delete_many({})
+def ingest_data():
+    # The ingestion process might take a few minutes
+    collection.insert_many(listings)
+    print("Data ingestion into MongoDB completed")
+def create_vector_search_index():
+    # NOTE: This dataset contains text and image embeddings, but this lessons only uses the text embeddings
+    # The field containing the text embeddings on each document within the listings_reviews collection
+    text_embedding_field_name = "text_embeddings"
+    # MongoDB Atlas Vector Search index name
+    vector_search_index_name_text = "vector_index_text"
+    vector_search_index_model = SearchIndexModel(
+        definition={
+            "mappings": { # describes how fields in the database documents are indexed and stored
+                "dynamic": True, # automatically index new fields that appear in the document
+                "fields": { # properties of the fields that will be indexed.
+                    text_embedding_field_name: {
+                        "dimensions": 1536, # size of the vector.
+                        "similarity": "cosine", # algorithm used to compute the similarity between vectors
+                        "type": "knnVector",
+                    }
+                },
+            }
+        },
+        name=vector_search_index_name_text, # identifier for the vector search index
+    )
+    # Check if the index already exists
+    index_exists = False
+    for index in collection.list_indexes():
+        print(index)
+        if index['name'] == vector_search_index_name_text:
+            index_exists = True
+            break
+    # Create the index if it doesn't exist
+    if not index_exists:
+        try:
+            result = collection.create_search_index(model=vector_search_index_model)
+            print("Creating index...")
+            time.sleep(20)  # Sleep for 20 seconds, adding sleep to ensure vector index has compeleted inital sync before utilization
+            print("Index created successfully:", result)
+            print("Wait a few minutes before conducting search with index to ensure index intialization")
+        except Exception as e:
+            print(f"Error creating vector search index: {str(e)}")
+    else:
+        print(f"Index '{vector_search_index_name_text}' already exists.")