gabrielaltay committed
Commit 4445f91 · 1 Parent(s): 471185d

use hf dataset

Dockerfile CHANGED
@@ -6,15 +6,8 @@ RUN apt-get update && apt-get install -y \
     build-essential \
    curl \
    git \
-    unzip \
    && rm -rf /var/lib/apt/lists/*
 
-# Install AWS CLI
-RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
-    && unzip awscliv2.zip \
-    && ./aws/install \
-    && rm -rf aws awscliv2.zip
-
 # Install uv
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
 
pyproject.toml CHANGED
@@ -5,9 +5,9 @@ description = "Congressional Legislation Query and Analysis Tool"
 readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
-    "boto3>=1.35.0",
     "chromadb>=1.1.0",
     "datasets>=3.0.0",
+    "huggingface_hub>=0.19.0",
     "langchain>=0.3.27",
     "langchain-anthropic>=0.3.19",
     "langchain-chroma>=0.1.4",
src/legisqa_local/config/settings.py CHANGED
@@ -38,7 +38,7 @@ def get_chroma_config():
     }
 
 def setup_chromadb():
-    """Setup ChromaDB - use persistent storage (/data) or download from S3 if needed"""
+    """Setup ChromaDB - use persistent storage (/data) or download from HF Dataset if needed"""
     logger.info("=== ChromaDB Setup Starting ===")
 
     chroma_config = get_chroma_config()
@@ -60,30 +60,27 @@ def setup_chromadb():
         logger.info(f"Updated CHROMA_PERSIST_DIRECTORY to: {persistent_chroma_path}")
         return persistent_chroma_path
 
-    # ChromaDB not found in persistent storage, try to download from S3
-    logger.info("ChromaDB not found in persistent storage, checking S3 configuration...")
+    # ChromaDB not found in persistent storage, try to download from HF Dataset
+    logger.info("ChromaDB not found in persistent storage, checking HF Dataset configuration...")
 
-    s3_bucket = os.getenv("CHROMA_S3_BUCKET", "")
-    s3_prefix = os.getenv("CHROMA_S3_PREFIX", "")
+    dataset_repo = os.getenv("CHROMA_DATASET_REPO", "hyperdemocracy/usc-chroma-vecs-v1-chunks-v1-s8192-o512-sentence-transformers-static-retrieval-mrl-en-v1")
+    logger.info(f"HF Dataset repo: {dataset_repo}")
 
-    logger.info(f"S3 Bucket: {s3_bucket}")
-    logger.info(f"S3 Prefix: {s3_prefix}")
-
-    if s3_bucket and s3_prefix:
-        logger.info(f"📥 Downloading ChromaDB from S3 to persistent storage...")
-        logger.info(f"  Source: s3://{s3_bucket}/{s3_prefix}")
+    if dataset_repo:
+        logger.info(f"📥 Downloading ChromaDB from HF Dataset to persistent storage...")
+        logger.info(f"  Source: {dataset_repo}")
         logger.info(f"  Target: {persistent_chroma_path}")
 
-        success = download_chromadb_from_s3(s3_bucket, s3_prefix, persistent_chroma_path)
+        success = download_chromadb_from_hf_dataset(dataset_repo, persistent_chroma_path)
         if success:
             # Update environment variable to point to persistent storage
             os.environ["CHROMA_PERSIST_DIRECTORY"] = persistent_chroma_path
             logger.info(f"✅ ChromaDB download successful! Updated path to: {persistent_chroma_path}")
             return persistent_chroma_path
         else:
-            logger.error("❌ ChromaDB download from S3 failed!")
+            logger.error("❌ ChromaDB download from HF Dataset failed!")
     else:
-        logger.error("❌ No S3 configuration found (CHROMA_S3_BUCKET, CHROMA_S3_PREFIX)")
+        logger.error("❌ No HF Dataset configuration found (CHROMA_DATASET_REPO)")
         logger.info("Available environment variables:")
         for key, value in os.environ.items():
             if "CHROMA" in key:
@@ -106,92 +103,58 @@ def setup_chromadb():
     logger.info("=== ChromaDB Setup Complete ===")
     return chroma_path
 
-def download_chromadb_from_s3(bucket: str, prefix: str, local_path: str) -> bool:
-    """Download ChromaDB from S3"""
-    logger.info(f"Starting S3 download: s3://{bucket}/{prefix} -> {local_path}")
+def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> bool:
+    """Download ChromaDB from HuggingFace Dataset"""
+    logger.info(f"Starting HF Dataset download: {dataset_repo} -> {local_path}")
 
     try:
-        import subprocess
+        from huggingface_hub import snapshot_download
         import os
 
         # Ensure target directory exists
         logger.info(f"Creating target directory: {local_path}")
         os.makedirs(local_path, exist_ok=True)
 
-        # Use AWS CLI to sync from S3 (no credentials needed for public buckets)
-        s3_url = f"s3://{bucket}/{prefix}"
-        cmd = ["aws", "s3", "sync", s3_url, local_path, "--no-sign-request"]
-
-        logger.info(f"Running AWS CLI command: {' '.join(cmd)}")
-        result = subprocess.run(cmd, capture_output=True, text=True)
-
-        if result.returncode == 0:
-            logger.info("✅ ChromaDB download from S3 (AWS CLI) complete!")
-            if result.stdout:
-                logger.info(f"AWS CLI output: {result.stdout}")
+        # Download the dataset using snapshot_download
+        logger.info(f"Downloading dataset: {dataset_repo}")
+        logger.info("This may take several minutes for large datasets...")
+
+        downloaded_path = snapshot_download(
+            repo_id=dataset_repo,
+            repo_type="dataset",
+            local_dir=local_path,
+            cache_dir="/tmp/hf_chromadb_cache",
+            resume_download=True,  # Resume if interrupted
+            local_dir_use_symlinks=False  # Copy files instead of symlinks
+        )
+
+        logger.info(f"✅ ChromaDB download from HF Dataset complete!")
+        logger.info(f"Downloaded to: {downloaded_path}")
+
+        # Verify the download by checking for expected files
+        if os.path.exists(local_path) and os.listdir(local_path):
+            file_count = sum(len(files) for _, _, files in os.walk(local_path))
+            total_size = sum(
+                os.path.getsize(os.path.join(dirpath, filename))
+                for dirpath, _, filenames in os.walk(local_path)
+                for filename in filenames
+            ) / (1024 * 1024 * 1024)  # Convert to GB
+
+            logger.info(f"📊 Download verification:")
+            logger.info(f"  Files: {file_count}")
+            logger.info(f"  Total size: {total_size:.2f} GB")
+
             return True
         else:
-            logger.error(f"❌ AWS CLI failed with return code {result.returncode}")
-            logger.error(f"AWS CLI stderr: {result.stderr}")
-            if result.stdout:
-                logger.error(f"AWS CLI stdout: {result.stdout}")
+            logger.error("❌ Download completed but no files found in target directory")
             return False
 
-    except FileNotFoundError:
-        logger.warning("❌ AWS CLI not found. Trying with boto3...")
-        return download_chromadb_from_s3_boto3(bucket, prefix, local_path)
-    except Exception as e:
-        logger.error(f"❌ Error downloading from S3: {e}")
-        return False
-
-def download_chromadb_from_s3_boto3(bucket: str, prefix: str, local_path: str) -> bool:
-    """Download ChromaDB from S3 using boto3 (fallback)"""
-    try:
-        import boto3
-        from botocore import UNSIGNED
-        from botocore.config import Config
-        import os
-
-        logger.info("📦 Using boto3 for S3 download...")
-
-        # Create S3 client with no credentials (for public buckets)
-        s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
-
-        # List objects in the S3 prefix
-        logger.info(f"Listing objects in s3://{bucket}/{prefix}")
-        paginator = s3.get_paginator('list_objects_v2')
-        pages = paginator.paginate(Bucket=bucket, Prefix=prefix)
-
-        os.makedirs(local_path, exist_ok=True)
-
-        file_count = 0
-        for page in pages:
-            if 'Contents' in page:
-                for obj in page['Contents']:
-                    key = obj['Key']
-                    # Get relative path by removing prefix
-                    relative_path = key[len(prefix):].lstrip('/')
-                    if relative_path:  # Skip empty paths
-                        local_file_path = os.path.join(local_path, relative_path)
-
-                        # Create directory if needed
-                        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
-
-                        # Download file
-                        file_count += 1
-                        if file_count % 10 == 0:
-                            logger.info(f"Downloaded {file_count} files...")
-
-                        s3.download_file(bucket, key, local_file_path)
-
-        logger.info(f"✅ ChromaDB download from S3 (boto3) complete! Downloaded {file_count} files.")
-        return True
-
     except ImportError:
-        logger.error("❌ boto3 not available. Please install: pip install boto3")
+        logger.error("❌ huggingface_hub not available. Please install: pip install huggingface_hub")
         return False
     except Exception as e:
-        logger.error(f"❌ Error downloading from S3 with boto3: {e}")
+        logger.error(f"❌ Error downloading from HF Dataset: {e}")
+        logger.error(f"Exception type: {type(e).__name__}")
         return False
 
 # Embedding model configuration
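
A quick smoke test for the new download path (not shown in the diff): once setup_chromadb() has populated the persistent directory, the store should open with chromadb's PersistentClient. A rough sketch, assuming the dataset repo contains a standard Chroma persist directory at its root and using /data/chroma as a stand-in for the path setup_chromadb() returns:

import chromadb

# Open the downloaded store and list its collections as a sanity check.
client = chromadb.PersistentClient(path="/data/chroma")  # stand-in for the returned path
print(client.list_collections())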