Spaces: Runtime error
Add batching to db load
Browse files - models/etl.py +17 -9
models/etl.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import json
|
2 |
import chromadb
|
3 |
from datetime import datetime
|
|
|
4 |
|
5 |
from utils.general_utils import timeit
|
6 |
from utils.embedding_utils import MyEmbeddingFunction
|
@@ -96,16 +97,23 @@ def load_data_to_db(db_path, data):
|
|
96 |
|
97 |
collection = client.get_collection("huberman_videos")
|
98 |
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
print(f"Data loaded to database at {db_path}.")
|
110 |
|
111 |
|
|
|
1 |
import json
|
2 |
import chromadb
|
3 |
from datetime import datetime
|
4 |
+
import math
|
5 |
|
6 |
from utils.general_utils import timeit
|
7 |
from utils.embedding_utils import MyEmbeddingFunction
|
|
|
97 |
|
98 |
collection = client.get_collection("huberman_videos")
|
99 |
|
100 |
+
num_rows = len(data)
|
101 |
+
batch_size = 5461
|
102 |
+
num_batches = math.ceil(num_rows / batch_size)
|
103 |
+
|
104 |
+
for i in range(num_batches):
|
105 |
+
batch_data = data[i * batch_size : (i + 1) * batch_size]
|
106 |
+
documents = [segment['text'] for segment in batch_data]
|
107 |
+
metadata = [segment['metadata'] for segment in batch_data]
|
108 |
+
ids = [segment['metadata']['segment_id'] for segment in batch_data]
|
109 |
|
110 |
+
collection.add(
|
111 |
+
documents=documents,
|
112 |
+
metadatas=metadata,
|
113 |
+
ids=ids
|
114 |
+
)
|
115 |
+
print(f"Batch {i+1} of {num_batches} loaded to database.")
|
116 |
+
|
117 |
print(f"Data loaded to database at {db_path}.")
|
118 |
|
119 |
|