bstraehle committed on
Commit
1ebe7e2
·
verified ·
1 Parent(s): 772ea58

Create mongodb.py

Browse files
Files changed (1) hide show
  1. mongodb.py +77 -0
mongodb.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import time

from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel
5
+
6
# Name of the MongoDB database that the helpers in this module open.
database_name = "airbnb_dataset"
# Name of the collection inside that database that is emptied, filled and indexed.
collection_name = "listings_reviews"
8
+
9
def get_mongo_client(mongo_uri):
    """Build and return a ``MongoClient`` for the given connection string.

    Args:
        mongo_uri: MongoDB connection URI handed straight to ``MongoClient``.

    Returns:
        The newly constructed ``MongoClient``.
    """
    # The client is the entry point for every database/collection handle
    # used elsewhere in this module.
    mongo_client = MongoClient(
        mongo_uri,
        appname="devrel.deeplearningai.lesson1.python",
    )
    # NOTE(review): no command is sent to the server before this message is
    # printed; presumably the client connects lazily, so the message does not
    # prove the cluster is reachable -- confirm against the PyMongo docs.
    print("Connection to MongoDB successful")
    return mongo_client
16
+
17
# Connection string for the MongoDB cluster, taken from the MONGO_URI
# environment variable. It is None when the variable is absent, so the module
# can still be imported and the problem is reported below instead of raising
# NameError on the first use of MONGO_URI.
MONGO_URI = os.environ.get("MONGO_URI")

if not MONGO_URI:
    print("MONGO_URI not set in environment variables")
19
+
20
def create_db(mongo_client=None):
    """Open the listings collection and delete every document in it.

    Args:
        mongo_client: Optional client to reuse. When omitted, a new one is
            created with ``get_mongo_client(MONGO_URI)``.

    Returns:
        The (now empty) collection handle, so callers can pass it on to the
        ingestion and index-creation helpers instead of it being discarded.
    """
    if mongo_client is None:
        mongo_client = get_mongo_client(MONGO_URI)

    # Pymongo handles for the database and collection
    db = mongo_client.get_database(database_name)
    collection = db.get_collection(collection_name)

    # Delete any existing records in the collection
    collection.delete_many({})

    return collection
29
+
30
def ingest_data(listings=None, collection=None):
    """Insert the listing documents into the MongoDB collection.

    Args:
        listings: Iterable of documents to insert. Required; the argument is
            optional only to keep the original zero-argument signature
            callable.
        collection: Collection to insert into. When omitted, the
            ``collection_name`` collection of ``database_name`` is opened via
            ``get_mongo_client(MONGO_URI)``.

    Raises:
        ValueError: If ``listings`` is not supplied.
    """
    # Previously both `listings` and `collection` were undefined names in
    # this scope, so every call failed with NameError.
    if listings is None:
        raise ValueError("listings must be provided to ingest_data()")

    if collection is None:
        collection = (
            get_mongo_client(MONGO_URI)
            .get_database(database_name)
            .get_collection(collection_name)
        )

    # The ingestion process might take a few minutes
    collection.insert_many(listings)
    print("Data ingestion into MongoDB completed")
34
+
35
def create_vector_search_index(collection=None, initial_sync_wait_seconds=20):
    """Create the vector search index on the text-embedding field if missing.

    Args:
        collection: Collection to index. When omitted, the
            ``collection_name`` collection of ``database_name`` is opened via
            ``get_mongo_client(MONGO_URI)``.
        initial_sync_wait_seconds: Seconds to sleep after requesting the
            index, giving it time to start its initial sync.

    Returns:
        None. Progress and index-creation errors are reported on stdout;
        creation errors are not re-raised.
    """
    # NOTE: This dataset contains text and image embeddings, but this lesson only uses the text embeddings
    # The field containing the text embeddings on each document within the listings_reviews collection
    text_embedding_field_name = "text_embeddings"
    # MongoDB Atlas Vector Search index name
    vector_search_index_name_text = "vector_index_text"

    if collection is None:
        collection = (
            get_mongo_client(MONGO_URI)
            .get_database(database_name)
            .get_collection(collection_name)
        )

    # Search indexes are enumerated by list_search_indexes(); list_indexes()
    # only reports regular indexes, so it never finds an existing search
    # index and the create call would be attempted again on every run.
    index_exists = any(
        index["name"] == vector_search_index_name_text
        for index in collection.list_search_indexes()
    )
    if index_exists:
        print(f"Index '{vector_search_index_name_text}' already exists.")
        return

    vector_search_index_model = SearchIndexModel(
        definition={
            "mappings": {  # describes how fields in the database documents are indexed and stored
                "dynamic": True,  # automatically index new fields that appear in the document
                "fields": {  # properties of the fields that will be indexed.
                    text_embedding_field_name: {
                        "dimensions": 1536,  # size of the vector.
                        "similarity": "cosine",  # algorithm used to compute the similarity between vectors
                        "type": "knnVector",
                    }
                },
            }
        },
        name=vector_search_index_name_text,  # identifier for the vector search index
    )

    # Deliberately best-effort: a failure to create the index is reported but
    # does not abort the caller.
    try:
        result = collection.create_search_index(model=vector_search_index_model)
        print("Creating index...")
        # Give the vector index time to complete its initial sync before it is used
        time.sleep(initial_sync_wait_seconds)
        print("Index created successfully:", result)
        print("Wait a few minutes before conducting search with index to ensure index intialization")
    except Exception as e:
        print(f"Error creating vector search index: {str(e)}")