gabrielaltay committed on
Commit
8b9c625
·
1 Parent(s): 6f2f718

chroma pathing

Browse files
Files changed (2) hide show
  1. src/legisqa_local/config/settings.py +41 -9
  2. uv.lock +2 -51
src/legisqa_local/config/settings.py CHANGED
@@ -141,10 +141,13 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
141
  logger.info(f"Downloading dataset: {dataset_repo}")
142
  logger.info("This may take several minutes for large datasets...")
143
 
 
 
 
144
  downloaded_path = snapshot_download(
145
  repo_id=dataset_repo,
146
  repo_type="dataset",
147
- local_dir=local_path,
148
  cache_dir="/tmp/hf_chromadb_cache"
149
  # Note: resume_download and local_dir_use_symlinks are now handled automatically
150
  )
@@ -152,7 +155,27 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
152
  logger.info(f"✅ ChromaDB download from HF Dataset complete!")
153
  logger.info(f"Downloaded to: {downloaded_path}")
154
 
155
- # Verify the download by checking for expected files
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  if os.path.exists(local_path) and os.listdir(local_path):
157
  file_count = sum(len(files) for _, _, files in os.walk(local_path))
158
  total_size = sum(
@@ -161,13 +184,21 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
161
  for filename in filenames
162
  ) / (1024 * 1024 * 1024) # Convert to GB
163
 
164
- logger.info(f"📊 Download verification:")
165
  logger.info(f" Files: {file_count}")
166
  logger.info(f" Total size: {total_size:.2f} GB")
167
 
168
- return True
 
 
 
 
 
 
 
 
169
  else:
170
- logger.error("❌ Download completed but no files found in target directory")
171
  return False
172
 
173
  except ImportError:
@@ -212,15 +243,16 @@ def inspect_chromadb(chroma_path: str):
212
  if count > 0:
213
  # Get a sample item
214
  logger.info("🔍 Fetching sample items...")
215
- sample = collection.get(limit=3, include=["documents", "metadatas", "ids"])
216
 
217
- logger.info(f"📝 Sample IDs: {sample['ids']}")
 
218
 
219
- if sample['documents']:
220
  logger.info(f"📄 Sample document (first 200 chars):")
221
  logger.info(f" {sample['documents'][0][:200]}...")
222
 
223
- if sample['metadatas']:
224
  logger.info(f"🏷️ Sample metadata:")
225
  for i, metadata in enumerate(sample['metadatas'][:2]):
226
  logger.info(f" Item {i}: {metadata}")
 
141
  logger.info(f"Downloading dataset: {dataset_repo}")
142
  logger.info("This may take several minutes for large datasets...")
143
 
144
+ # Download to a temporary location first
145
+ temp_download_path = f"{local_path}_temp"
146
+
147
  downloaded_path = snapshot_download(
148
  repo_id=dataset_repo,
149
  repo_type="dataset",
150
+ local_dir=temp_download_path,
151
  cache_dir="/tmp/hf_chromadb_cache"
152
  # Note: resume_download and local_dir_use_symlinks are now handled automatically
153
  )
 
155
  logger.info(f"✅ ChromaDB download from HF Dataset complete!")
156
  logger.info(f"Downloaded to: {downloaded_path}")
157
 
158
+ # The HF dataset contains a 'chromadb' subdirectory with the actual ChromaDB files
159
+ chromadb_subdir = os.path.join(temp_download_path, "chromadb")
160
+
161
+ if os.path.exists(chromadb_subdir):
162
+ logger.info(f"📁 Found ChromaDB subdirectory: {chromadb_subdir}")
163
+
164
+ # Move the ChromaDB files from the subdirectory to the target location
165
+ import shutil
166
+ if os.path.exists(local_path):
167
+ shutil.rmtree(local_path)
168
+ shutil.move(chromadb_subdir, local_path)
169
+
170
+ # Clean up the temporary download directory
171
+ shutil.rmtree(temp_download_path)
172
+
173
+ logger.info(f"✅ ChromaDB files moved to: {local_path}")
174
+ else:
175
+ logger.error(f"❌ ChromaDB subdirectory not found in downloaded data: {chromadb_subdir}")
176
+ return False
177
+
178
+ # Verify the final ChromaDB structure
179
  if os.path.exists(local_path) and os.listdir(local_path):
180
  file_count = sum(len(files) for _, _, files in os.walk(local_path))
181
  total_size = sum(
 
184
  for filename in filenames
185
  ) / (1024 * 1024 * 1024) # Convert to GB
186
 
187
+ logger.info(f"📊 ChromaDB verification:")
188
  logger.info(f" Files: {file_count}")
189
  logger.info(f" Total size: {total_size:.2f} GB")
190
 
191
+ # Check for key ChromaDB files
192
+ sqlite_file = os.path.join(local_path, "chroma.sqlite3")
193
+ if os.path.exists(sqlite_file):
194
+ sqlite_size = os.path.getsize(sqlite_file) / (1024 * 1024 * 1024)
195
+ logger.info(f" SQLite database: {sqlite_size:.2f} GB")
196
+ return True
197
+ else:
198
+ logger.error("❌ chroma.sqlite3 not found in ChromaDB directory")
199
+ return False
200
  else:
201
+ logger.error("❌ ChromaDB directory is empty after processing")
202
  return False
203
 
204
  except ImportError:
 
243
  if count > 0:
244
  # Get a sample item
245
  logger.info("🔍 Fetching sample items...")
246
+ sample = collection.get(limit=3, include=["documents", "metadatas"])
247
 
248
+ if sample.get('ids'):
249
+ logger.info(f"📝 Sample IDs: {sample['ids']}")
250
 
251
+ if sample.get('documents'):
252
  logger.info(f"📄 Sample document (first 200 chars):")
253
  logger.info(f" {sample['documents'][0][:200]}...")
254
 
255
+ if sample.get('metadatas'):
256
  logger.info(f"🏷️ Sample metadata:")
257
  for i, metadata in enumerate(sample['metadatas'][:2]):
258
  logger.info(f" Item {i}: {metadata}")
uv.lock CHANGED
@@ -191,34 +191,6 @@ wheels = [
191
  { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" },
192
  ]
193
 
194
- [[package]]
195
- name = "boto3"
196
- version = "1.40.40"
197
- source = { registry = "https://pypi.org/simple" }
198
- dependencies = [
199
- { name = "botocore" },
200
- { name = "jmespath" },
201
- { name = "s3transfer" },
202
- ]
203
- sdist = { url = "https://files.pythonhosted.org/packages/3c/12/1a31b36802d0f33bc6982ab8b7e6437d75ef3c179abe6c53d4d8f7b4248f/boto3-1.40.40.tar.gz", hash = "sha256:f384d3a0410d0f1a4d4ae7aa69c41d0549c6ca5a76667dc25fc97d50ad6db740", size = 111606, upload-time = "2025-09-26T19:23:46.923Z" }
204
- wheels = [
205
- { url = "https://files.pythonhosted.org/packages/90/69/c65566dbdaaea3af0c23f7731ab0f185a38b593fd449d2423374150dbfe0/boto3-1.40.40-py3-none-any.whl", hash = "sha256:385904de68623e1c341bdc095d94a30006843032c912adeb1e0752a343632ec6", size = 139340, upload-time = "2025-09-26T19:23:45.557Z" },
206
- ]
207
-
208
- [[package]]
209
- name = "botocore"
210
- version = "1.40.40"
211
- source = { registry = "https://pypi.org/simple" }
212
- dependencies = [
213
- { name = "jmespath" },
214
- { name = "python-dateutil" },
215
- { name = "urllib3" },
216
- ]
217
- sdist = { url = "https://files.pythonhosted.org/packages/83/5a/43a7fea503ad14fa79819f2b3103a38977fb587a3663d1ac6e958fccf592/botocore-1.40.40.tar.gz", hash = "sha256:78eb121a16a6481ed0f6e1aebe53a4f23aa121f34466846c13a5ca48fa980e31", size = 14363370, upload-time = "2025-09-26T19:23:37.853Z" }
218
- wheels = [
219
- { url = "https://files.pythonhosted.org/packages/ed/5e/3bbf6d34cbf307c1b9e58e0204ceba2d35bbc0c93b4e3b3cc895aae0a5fd/botocore-1.40.40-py3-none-any.whl", hash = "sha256:68506142b3cde93145ef3ee0268f2444f2b68ada225a151f714092bbd3d6516a", size = 14031738, upload-time = "2025-09-26T19:23:35.475Z" },
220
- ]
221
-
222
  [[package]]
223
  name = "build"
224
  version = "1.3.0"
@@ -845,15 +817,6 @@ wheels = [
845
  { url = "https://files.pythonhosted.org/packages/af/22/7ab7b4ec3a1c1f03aef376af11d23b05abcca3fb31fbca1e7557053b1ba2/jiter-0.11.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e2bbf24f16ba5ad4441a9845e40e4ea0cb9eed00e76ba94050664ef53ef4406", size = 347102, upload-time = "2025-09-15T09:20:20.16Z" },
846
  ]
847
 
848
- [[package]]
849
- name = "jmespath"
850
- version = "1.0.1"
851
- source = { registry = "https://pypi.org/simple" }
852
- sdist = { url = "https://files.pythonhosted.org/packages/00/2a/e867e8531cf3e36b41201936b7fa7ba7b5702dbef42922193f05c8976cd6/jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe", size = 25843, upload-time = "2022-06-17T18:00:12.224Z" }
853
- wheels = [
854
- { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" },
855
- ]
856
-
857
  [[package]]
858
  name = "joblib"
859
  version = "1.5.2"
@@ -1113,9 +1076,9 @@ name = "legisqa-local"
1113
  version = "0.1.0"
1114
  source = { editable = "." }
1115
  dependencies = [
1116
- { name = "boto3" },
1117
  { name = "chromadb" },
1118
  { name = "datasets" },
 
1119
  { name = "langchain" },
1120
  { name = "langchain-anthropic" },
1121
  { name = "langchain-chroma" },
@@ -1132,9 +1095,9 @@ dependencies = [
1132
 
1133
  [package.metadata]
1134
  requires-dist = [
1135
- { name = "boto3", specifier = ">=1.35.0" },
1136
  { name = "chromadb", specifier = ">=1.1.0" },
1137
  { name = "datasets", specifier = ">=3.0.0" },
 
1138
  { name = "langchain", specifier = ">=0.3.27" },
1139
  { name = "langchain-anthropic", specifier = ">=0.3.19" },
1140
  { name = "langchain-chroma", specifier = ">=0.1.4" },
@@ -2390,18 +2353,6 @@ wheels = [
2390
  { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" },
2391
  ]
2392
 
2393
- [[package]]
2394
- name = "s3transfer"
2395
- version = "0.14.0"
2396
- source = { registry = "https://pypi.org/simple" }
2397
- dependencies = [
2398
- { name = "botocore" },
2399
- ]
2400
- sdist = { url = "https://files.pythonhosted.org/packages/62/74/8d69dcb7a9efe8baa2046891735e5dfe433ad558ae23d9e3c14c633d1d58/s3transfer-0.14.0.tar.gz", hash = "sha256:eff12264e7c8b4985074ccce27a3b38a485bb7f7422cc8046fee9be4983e4125", size = 151547, upload-time = "2025-09-09T19:23:31.089Z" }
2401
- wheels = [
2402
- { url = "https://files.pythonhosted.org/packages/48/f0/ae7ca09223a81a1d890b2557186ea015f6e0502e9b8cb8e1813f1d8cfa4e/s3transfer-0.14.0-py3-none-any.whl", hash = "sha256:ea3b790c7077558ed1f02a3072fb3cb992bbbd253392f4b6e9e8976941c7d456", size = 85712, upload-time = "2025-09-09T19:23:30.041Z" },
2403
- ]
2404
-
2405
  [[package]]
2406
  name = "safetensors"
2407
  version = "0.6.2"
 
191
  { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" },
192
  ]
193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  [[package]]
195
  name = "build"
196
  version = "1.3.0"
 
817
  { url = "https://files.pythonhosted.org/packages/af/22/7ab7b4ec3a1c1f03aef376af11d23b05abcca3fb31fbca1e7557053b1ba2/jiter-0.11.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e2bbf24f16ba5ad4441a9845e40e4ea0cb9eed00e76ba94050664ef53ef4406", size = 347102, upload-time = "2025-09-15T09:20:20.16Z" },
818
  ]
819
 
 
 
 
 
 
 
 
 
 
820
  [[package]]
821
  name = "joblib"
822
  version = "1.5.2"
 
1076
  version = "0.1.0"
1077
  source = { editable = "." }
1078
  dependencies = [
 
1079
  { name = "chromadb" },
1080
  { name = "datasets" },
1081
+ { name = "huggingface-hub" },
1082
  { name = "langchain" },
1083
  { name = "langchain-anthropic" },
1084
  { name = "langchain-chroma" },
 
1095
 
1096
  [package.metadata]
1097
  requires-dist = [
 
1098
  { name = "chromadb", specifier = ">=1.1.0" },
1099
  { name = "datasets", specifier = ">=3.0.0" },
1100
+ { name = "huggingface-hub", specifier = ">=0.19.0" },
1101
  { name = "langchain", specifier = ">=0.3.27" },
1102
  { name = "langchain-anthropic", specifier = ">=0.3.19" },
1103
  { name = "langchain-chroma", specifier = ">=0.1.4" },
 
2353
  { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" },
2354
  ]
2355
 
 
 
 
 
 
 
 
 
 
 
 
 
2356
  [[package]]
2357
  name = "safetensors"
2358
  version = "0.6.2"