gabrielaltay committed on
Commit
6f2f718
·
1 Parent(s): b087f3c
src/legisqa_local/config/settings.py CHANGED
@@ -74,6 +74,8 @@ def setup_chromadb():
74
  # Update environment variable to point to persistent storage
75
  os.environ["CHROMA_PERSIST_DIRECTORY"] = persistent_chroma_path
76
  logger.info(f"Updated CHROMA_PERSIST_DIRECTORY to: {persistent_chroma_path}")
 
 
77
  return persistent_chroma_path
78
 
79
  # ChromaDB not found in persistent storage, try to download from HF Dataset
@@ -92,6 +94,8 @@ def setup_chromadb():
92
  # Update environment variable to point to persistent storage
93
  os.environ["CHROMA_PERSIST_DIRECTORY"] = persistent_chroma_path
94
  logger.info(f"βœ… ChromaDB download successful! Updated path to: {persistent_chroma_path}")
 
 
95
  return persistent_chroma_path
96
  else:
97
  logger.error("❌ ChromaDB download from HF Dataset failed!")
@@ -109,6 +113,8 @@ def setup_chromadb():
109
  if os.path.exists(chroma_path):
110
  if os.listdir(chroma_path):
111
  logger.info(f"βœ… ChromaDB found at {chroma_path}")
 
 
112
  return chroma_path
113
  else:
114
  logger.warning(f"ChromaDB directory exists but is empty: {chroma_path}")
@@ -172,6 +178,67 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
172
  logger.error(f"Exception type: {type(e).__name__}")
173
  return False
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  # Embedding model configuration
176
  EMBEDDING_MODEL = "sentence-transformers/static-retrieval-mrl-en-v1"
177
  EMBEDDING_DEVICE = "cpu"
 
74
  # Update environment variable to point to persistent storage
75
  os.environ["CHROMA_PERSIST_DIRECTORY"] = persistent_chroma_path
76
  logger.info(f"Updated CHROMA_PERSIST_DIRECTORY to: {persistent_chroma_path}")
77
+ # Inspect the existing ChromaDB
78
+ inspect_chromadb(persistent_chroma_path)
79
  return persistent_chroma_path
80
 
81
  # ChromaDB not found in persistent storage, try to download from HF Dataset
 
94
  # Update environment variable to point to persistent storage
95
  os.environ["CHROMA_PERSIST_DIRECTORY"] = persistent_chroma_path
96
  logger.info(f"βœ… ChromaDB download successful! Updated path to: {persistent_chroma_path}")
97
+ # Inspect the downloaded ChromaDB
98
+ inspect_chromadb(persistent_chroma_path)
99
  return persistent_chroma_path
100
  else:
101
  logger.error("❌ ChromaDB download from HF Dataset failed!")
 
113
  if os.path.exists(chroma_path):
114
  if os.listdir(chroma_path):
115
  logger.info(f"βœ… ChromaDB found at {chroma_path}")
116
+ # Inspect the fallback ChromaDB
117
+ inspect_chromadb(chroma_path)
118
  return chroma_path
119
  else:
120
  logger.warning(f"ChromaDB directory exists but is empty: {chroma_path}")
 
178
  logger.error(f"Exception type: {type(e).__name__}")
179
  return False
180
 
181
def inspect_chromadb(chroma_path: str):
    """Log a best-effort diagnostic report for the ChromaDB store at ``chroma_path``.

    Opens a ``PersistentClient`` on ``chroma_path``, lists the available
    collections, and, if the collection named by
    ``get_chroma_config()["collection_name"]`` exists, logs its item count
    plus a small sample of ids, documents and metadata.

    The function is purely informational: every failure (missing chromadb
    package, unreadable store, missing config key, ...) is logged and
    swallowed so that callers are never interrupted by the inspection.

    Args:
        chroma_path: Filesystem path of the persisted ChromaDB directory.

    Returns:
        None.
    """
    logger.info("πŸ” === ChromaDB Collection Inspection ===")

    try:
        # Imported lazily so a missing chromadb install is reported below
        # instead of breaking import of this module.
        from chromadb import PersistentClient

        # Get collection name from config
        chroma_config = get_chroma_config()
        collection_name = chroma_config["collection_name"]

        logger.info(f"πŸ“‹ Collection name: {collection_name}")
        logger.info(f"πŸ“ ChromaDB path: {chroma_path}")

        # Create ChromaDB client
        client = PersistentClient(path=chroma_path)
        logger.info("βœ… ChromaDB client created successfully")

        # List all collections. Depending on the chromadb release,
        # list_collections() yields Collection objects or plain name strings;
        # accept both, and compute the name list only once.
        available_names = [getattr(c, "name", c) for c in client.list_collections()]
        logger.info(f"πŸ“š Available collections: {available_names}")

        # Get the specific collection
        if collection_name in available_names:
            collection = client.get_collection(name=collection_name)
            logger.info(f"βœ… Collection '{collection_name}' found")

            # Get collection count
            count = collection.count()
            logger.info(f"πŸ“Š Collection count: {count} items")

            if count > 0:
                # Get a sample item
                logger.info("πŸ” Fetching sample items...")
                # "ids" is not a valid `include` value (ids are always part of
                # the result); passing it makes chromadb reject the request.
                sample = collection.get(limit=3, include=["documents", "metadatas"])

                logger.info(f"πŸ“ Sample IDs: {sample['ids']}")

                documents = sample['documents']
                if documents:
                    logger.info("πŸ“„ Sample document (first 200 chars):")
                    # An item stored without a document comes back as None.
                    preview = (documents[0] or "")[:200]
                    logger.info(f" {preview}...")

                metadatas = sample['metadatas']
                if metadatas:
                    logger.info("🏷️ Sample metadata:")
                    for i, metadata in enumerate(metadatas[:2]):
                        logger.info(f" Item {i}: {metadata}")
            else:
                logger.warning("⚠️ Collection is empty!")

        else:
            logger.error(f"❌ Collection '{collection_name}' not found!")
            logger.error(f"Available collections: {available_names}")

    except ImportError as e:
        logger.error(f"❌ Import error during ChromaDB inspection: {e}")
    except Exception as e:
        # Deliberately broad: inspection is diagnostics only and must never
        # abort the caller's setup flow.
        logger.error(f"❌ Error inspecting ChromaDB: {e}")
        logger.error(f"Exception type: {type(e).__name__}")

    logger.info("πŸ” === ChromaDB Inspection Complete ===")
241
+
242
  # Embedding model configuration
243
  EMBEDDING_MODEL = "sentence-transformers/static-retrieval-mrl-en-v1"
244
  EMBEDDING_DEVICE = "cpu"