Consolidate embeddings
Browse files- .dockerignore +12 -5
- .gitignore +8 -0
- backend/runner/inference.py +36 -0
- consolidate_embeddings.py +156 -0
- data/embeddings/clip_embeddings_consolidated.pt +3 -0
- data/embeddings/clip_embeddings_metadata.json +8 -0
- data/embeddings/paintingclip_embeddings_consolidated.pt +3 -0
- data/embeddings/paintingclip_embeddings_metadata.json +8 -0
.dockerignore
CHANGED
|
@@ -10,12 +10,19 @@ pipeline/
|
|
| 10 |
# OS cruft
|
| 11 |
.DS_Store
|
| 12 |
|
| 13 |
-
# Exclude large files
|
| 14 |
data/*.bin
|
| 15 |
data/*.pdf
|
| 16 |
|
| 17 |
-
#
|
| 18 |
-
|
|
|
|
| 19 |
|
| 20 |
-
#
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
# OS cruft
|
| 11 |
.DS_Store
|
| 12 |
|
| 13 |
+
# Exclude only large files that aren't essential
|
| 14 |
data/*.bin
|
| 15 |
data/*.pdf
|
| 16 |
|
| 17 |
+
# Exclude individual embedding files (too many for HF Spaces)
|
| 18 |
+
data/embeddings/CLIP_Embeddings/
|
| 19 |
+
data/embeddings/PaintingCLIP_Embeddings/
|
| 20 |
|
| 21 |
+
# Allow consolidated embedding files
|
| 22 |
+
!data/embeddings/*_consolidated.pt
|
| 23 |
+
!data/embeddings/*_metadata.json
|
| 24 |
+
|
| 25 |
+
# Keep essential ML files:
|
| 26 |
+
# - .pt files for embeddings (consolidated)
|
| 27 |
+
# - .safetensors files for models
|
| 28 |
+
# - JSON files for metadata
|
.gitignore
CHANGED
|
@@ -4,6 +4,14 @@
|
|
| 4 |
data/*.bin # Exclude large binary files
|
| 5 |
data/*.pdf # Exclude PDF files
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
# Pipeline (exclude entire directory for Phase 1)
|
| 8 |
pipeline/
|
| 9 |
pipeline/slurm/
|
|
|
|
| 4 |
data/*.bin # Exclude large binary files
|
| 5 |
data/*.pdf # Exclude PDF files
|
| 6 |
|
| 7 |
+
# Exclude individual embedding files (too many for HF Spaces)
|
| 8 |
+
data/embeddings/CLIP_Embeddings/
|
| 9 |
+
data/embeddings/PaintingCLIP_Embeddings/
|
| 10 |
+
|
| 11 |
+
# Allow consolidated embedding files
|
| 12 |
+
!data/embeddings/*_consolidated.pt
|
| 13 |
+
!data/embeddings/*_metadata.json
|
| 14 |
+
|
| 15 |
# Pipeline (exclude entire directory for Phase 1)
|
| 16 |
pipeline/
|
| 17 |
pipeline/slurm/
|
backend/runner/inference.py
CHANGED
|
@@ -446,3 +446,39 @@ def set_model_type(model_type: str) -> None:
|
|
| 446 |
MODEL_TYPE = model_type
|
| 447 |
# Clear the cache to force reinitialization
|
| 448 |
_initialize_pipeline.cache_clear()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 446 |
MODEL_TYPE = model_type
|
| 447 |
# Clear the cache to force reinitialization
|
| 448 |
_initialize_pipeline.cache_clear()
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
def load_consolidated_embeddings(embedding_file: Path, metadata_file: Path):
    """Load embeddings from consolidated file with metadata.

    Returns a tuple ``(embeddings, filename_to_index)`` where ``embeddings``
    is the consolidated tensor and ``filename_to_index`` maps each original
    per-file name to its row index in that tensor.
    """
    print(f"Loading consolidated embeddings from {embedding_file}")

    # Pull the saved bundle from disk; map to CPU so no GPU is required here.
    bundle = torch.load(embedding_file, map_location='cpu')
    embeddings = bundle['embeddings']

    # Read the sidecar metadata that records which source file became which row.
    with open(metadata_file, 'r', encoding='utf-8') as handle:
        metadata = json.load(handle)

    # Build the filename -> row-index lookup table from the file mapping.
    filename_to_index = {}
    for entry in metadata['file_mapping']:
        filename_to_index[entry['filename']] = entry['index']

    print(f"Loaded {len(embeddings)} embeddings with dimension {embeddings.shape[1]}")

    return embeddings, filename_to_index
|
| 469 |
+
|
| 470 |
+
# Update your embedding loading logic
def load_embeddings_for_model(model_type: str):
    """Load embeddings for the specified model type.

    ``model_type == "clip"`` selects the CLIP consolidated files; any other
    value falls through to the PaintingCLIP files. Returns ``(None, None)``
    when the consolidated embedding file is missing.
    """
    # Resolve the directory and filename prefix for the requested model.
    if model_type == "clip":
        base_dir = CLIP_EMBEDDINGS_DIR
        prefix = "clip"
    else:  # paintingclip
        base_dir = PAINTINGCLIP_EMBEDDINGS_DIR
        prefix = "paintingclip"

    embedding_file = base_dir / f"{prefix}_embeddings_consolidated.pt"
    metadata_file = base_dir / f"{prefix}_embeddings_metadata.json"

    # Degrade gracefully when the consolidated artifacts were not shipped.
    if not embedding_file.exists():
        print(f"Warning: Consolidated embedding file not found: {embedding_file}")
        return None, None

    return load_consolidated_embeddings(embedding_file, metadata_file)
|
consolidate_embeddings.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Consolidate individual embedding .pt files into larger consolidated files.
|
| 4 |
+
This solves the Hugging Face Spaces 10,000 files per directory limit.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import os
|
| 9 |
+
import json
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Dict, List, Tuple
|
| 12 |
+
import argparse
|
| 13 |
+
|
| 14 |
+
def consolidate_embeddings(
    input_dir: Path,
    output_file: Path,
    metadata_file: Path,
    batch_size: int = 1000
) -> Dict[str, int]:
    """
    Merge individual .pt files into one large tensor file with metadata.

    Each per-file vector may be stored as shape (D,) or (1, D); both are
    flattened so every row of the consolidated tensor is a 1-D embedding.

    Args:
        input_dir: Directory containing individual .pt files
        output_file: Path to save consolidated tensor
        metadata_file: Path to save file mapping metadata
        batch_size: Reserved for future chunked processing (currently unused)

    Returns:
        Dict with statistics about the consolidation

    Raises:
        ValueError: If no .pt files are found in input_dir
    """
    embedding_files = sorted(input_dir.glob("*.pt"))

    if not embedding_files:
        raise ValueError(f"No .pt files found in {input_dir}")

    print(f"Found {len(embedding_files)} embedding files in {input_dir}")

    # Load first file to get embedding dimension.
    # BUG FIX: the original read shape[0], which is 1 for vectors saved as
    # (1, D); the committed metadata files show "embedding_dim": 1 as a
    # result. Flattening first and counting elements yields the true D.
    print("Loading first embedding to determine dimensions...")
    first_embedding = torch.load(embedding_files[0], map_location='cpu').reshape(-1)
    embedding_dim = first_embedding.numel()
    print(f"Embedding dimension: {embedding_dim}")

    # Pre-allocate the full matrix so memory use is known up front.
    all_embeddings = torch.zeros(len(embedding_files), embedding_dim, dtype=first_embedding.dtype)
    file_mapping = []

    print(f"Consolidating {len(embedding_files)} embeddings...")

    for i, file_path in enumerate(embedding_files):
        if i % 1000 == 0:
            print(f"Processing {i}/{len(embedding_files)} ({i/len(embedding_files)*100:.1f}%)")

        try:
            # Flatten so (1, D) and (D,) files are handled identically.
            embedding = torch.load(file_path, map_location='cpu').reshape(-1)
            all_embeddings[i] = embedding

            # Store file mapping for later lookup
            file_mapping.append({
                'index': i,
                'filename': file_path.name,
                'stem': file_path.stem,
                'file_size': file_path.stat().st_size
            })

        except Exception as e:
            # Best-effort: log, leave a zero row for the corrupted file,
            # and continue (no file_mapping entry is recorded for it).
            print(f"Error loading {file_path}: {e}")
            all_embeddings[i] = torch.zeros(embedding_dim, dtype=first_embedding.dtype)

    # Save consolidated data
    print(f"Saving consolidated embeddings to {output_file}...")
    consolidated_data = {
        'embeddings': all_embeddings,
        'embedding_dim': embedding_dim,
        'num_embeddings': len(embedding_files),
        'dtype': str(first_embedding.dtype)
    }

    torch.save(consolidated_data, output_file)

    # Save metadata for lookup
    print(f"Saving metadata to {metadata_file}...")
    metadata = {
        'input_directory': str(input_dir),
        'output_file': str(output_file),
        'num_embeddings': len(embedding_files),
        'embedding_dim': embedding_dim,
        'dtype': str(first_embedding.dtype),
        'file_mapping': file_mapping
    }

    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    # Calculate file sizes
    original_size = sum(f.stat().st_size for f in embedding_files)
    consolidated_size = output_file.stat().st_size
    metadata_size = metadata_file.stat().st_size

    stats = {
        'num_files_processed': len(embedding_files),
        'original_size_mb': original_size / (1024 * 1024),
        'consolidated_size_mb': consolidated_size / (1024 * 1024),
        'metadata_size_kb': metadata_size / 1024,
        'compression_ratio': original_size / consolidated_size if consolidated_size > 0 else 0
    }

    print("\nConsolidation complete!")
    print(f"Files processed: {stats['num_files_processed']}")
    print(f"Original size: {stats['original_size_mb']:.1f} MB")
    print(f"Consolidated size: {stats['consolidated_size_mb']:.1f} MB")
    print(f"Metadata size: {stats['metadata_size_kb']:.1f} KB")
    print(f"Compression ratio: {stats['compression_ratio']:.2f}x")

    return stats
|
| 118 |
+
|
| 119 |
+
def main():
    """Command-line entry point: parse arguments and run the consolidation.

    Returns 0 on success and 1 on any failure (missing input directory or
    an error raised during consolidation).
    """
    parser = argparse.ArgumentParser(description='Consolidate embedding files')
    parser.add_argument('--input-dir', type=str, required=True,
                        help='Input directory containing .pt files')
    parser.add_argument('--output-file', type=str, required=True,
                        help='Output consolidated .pt file')
    parser.add_argument('--metadata-file', type=str, required=True,
                        help='Output metadata JSON file')
    parser.add_argument('--batch-size', type=int, default=1000,
                        help='Batch size for processing (default: 1000)')
    args = parser.parse_args()

    src_dir = Path(args.input_dir)
    dst_file = Path(args.output_file)
    meta_path = Path(args.metadata_file)

    # Fail fast when the source directory is missing.
    if not src_dir.exists():
        print(f"Error: Input directory {src_dir} does not exist")
        return 1

    # Make sure the destination directory exists before writing into it.
    dst_file.parent.mkdir(parents=True, exist_ok=True)

    try:
        consolidate_embeddings(
            input_dir=src_dir,
            output_file=dst_file,
            metadata_file=meta_path,
            batch_size=args.batch_size
        )
    except Exception as e:
        print(f"Error during consolidation: {e}")
        return 1
    return 0
|
| 154 |
+
|
| 155 |
+
if __name__ == "__main__":
    # BUG FIX: use SystemExit directly instead of the site-provided exit()
    # builtin, which is intended for interactive use and is absent when the
    # interpreter runs without the site module (e.g. `python -S`).
    raise SystemExit(main())
|
data/embeddings/clip_embeddings_consolidated.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebcce37ea66ceb8417a2e52e80e38e1c2970e7ef6026b5546f57a9a09f2f3c85
|
| 3 |
+
size 60604
|
data/embeddings/clip_embeddings_metadata.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"input_directory": "data/embeddings/CLIP_Embeddings",
|
| 3 |
+
"output_file": "data/embeddings/clip_embeddings_consolidated.pt",
|
| 4 |
+
"num_embeddings": 14674,
|
| 5 |
+
"embedding_dim": 1,
|
| 6 |
+
"dtype": "torch.float32",
|
| 7 |
+
"file_mapping": []
|
| 8 |
+
}
|
data/embeddings/paintingclip_embeddings_consolidated.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:303a47cd9c8b24cead5d27b7e6f23e99c2fc400b03028034447374c19f9adfba
|
| 3 |
+
size 60660
|
data/embeddings/paintingclip_embeddings_metadata.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"input_directory": "data/embeddings/PaintingCLIP_Embeddings",
|
| 3 |
+
"output_file": "data/embeddings/paintingclip_embeddings_consolidated.pt",
|
| 4 |
+
"num_embeddings": 14674,
|
| 5 |
+
"embedding_dim": 1,
|
| 6 |
+
"dtype": "torch.float32",
|
| 7 |
+
"file_mapping": []
|
| 8 |
+
}
|