lstetson committed on
Commit
61e6bcc
1 Parent(s): 2abed75

Add batching to db load

Browse files
Files changed (1) hide show
  1. models/etl.py +17 -9
models/etl.py CHANGED
@@ -1,6 +1,7 @@
1
  import json
2
  import chromadb
3
  from datetime import datetime
 
4
 
5
  from utils.general_utils import timeit
6
  from utils.embedding_utils import MyEmbeddingFunction
@@ -96,16 +97,23 @@ def load_data_to_db(db_path, data):
96
 
97
  collection = client.get_collection("huberman_videos")
98
 
99
- documents = [segment['text'] for segment in data]
100
- metadata = [segment['metadata'] for segment in data]
101
- ids = [segment['metadata']['segment_id'] for segment in data]
102
-
103
- collection.add(
104
- documents=documents,
105
- metadatas=metadata,
106
- ids=ids
107
- )
108
 
 
 
 
 
 
 
 
109
  print(f"Data loaded to database at {db_path}.")
110
 
111
 
 
1
  import json
2
  import chromadb
3
  from datetime import datetime
4
+ import math
5
 
6
  from utils.general_utils import timeit
7
  from utils.embedding_utils import MyEmbeddingFunction
 
97
 
98
  collection = client.get_collection("huberman_videos")
99
 
100
+ num_rows = len(data)
101
+ batch_size = 5461
102
+ num_batches = math.ceil(num_rows / batch_size)
103
+
104
+ for i in range(num_batches):
105
+ batch_data = data[i * batch_size : (i + 1) * batch_size]
106
+ documents = [segment['text'] for segment in batch_data]
107
+ metadata = [segment['metadata'] for segment in batch_data]
108
+ ids = [segment['metadata']['segment_id'] for segment in batch_data]
109
 
110
+ collection.add(
111
+ documents=documents,
112
+ metadatas=metadata,
113
+ ids=ids
114
+ )
115
+ print(f"Batch {i+1} of {num_batches} loaded to database.")
116
+
117
  print(f"Data loaded to database at {db_path}.")
118
 
119