samwaugh committed
Commit 7973274 · 1 Parent(s): 3cc3618

Consolidate embeddings
.dockerignore CHANGED
@@ -10,12 +10,19 @@ pipeline/
 # OS cruft
 .DS_Store
 
-# Exclude large files but allow model files and embedding files
+# Exclude only large files that aren't essential
 data/*.bin
 data/*.pdf
 
-# Keep embedding files (.pt) for ML inference
-# data/*.pt <- This line is removed to allow .pt files
+# Exclude individual embedding files (too many for HF Spaces)
+data/embeddings/CLIP_Embeddings/
+data/embeddings/PaintingCLIP_Embeddings/
 
-# Keep model files (.safetensors) for PaintingCLIP LoRA
-# data/*.safetensors <- This line is removed to allow .safetensors files
+# Allow consolidated embedding files
+!data/embeddings/*_consolidated.pt
+!data/embeddings/*_metadata.json
+
+# Keep essential ML files:
+# - .pt files for embeddings (consolidated)
+# - .safetensors files for models
+# - JSON files for metadata
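
A note on rule order: both .dockerignore and .gitignore let the last matching pattern win, so the ! exceptions above must come after the directory excludes they carve out of. They also only take effect because the consolidated files live directly in data/embeddings/ (which nothing excludes), not inside the excluded subdirectories. Schematically:

    data/embeddings/CLIP_Embeddings/      # everything under this directory is excluded
    !data/embeddings/*_consolidated.pt    # a later exception re-admits the sibling file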
.gitignore CHANGED
@@ -4,6 +4,14 @@
 data/*.bin # Exclude large binary files
 data/*.pdf # Exclude PDF files
 
+# Exclude individual embedding files (too many for HF Spaces)
+data/embeddings/CLIP_Embeddings/
+data/embeddings/PaintingCLIP_Embeddings/
+
+# Allow consolidated embedding files
+!data/embeddings/*_consolidated.pt
+!data/embeddings/*_metadata.json
+
 # Pipeline (exclude entire directory for Phase 1)
 pipeline/
 pipeline/slurm/
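
To check which rule applies to a given path, git check-ignore -v prints the deciding pattern with its source line (the first path below is hypothetical; the output shown is what the rules above should yield):

    git check-ignore -v data/embeddings/CLIP_Embeddings/some_painting.pt
    # .gitignore:8:data/embeddings/CLIP_Embeddings/    data/embeddings/CLIP_Embeddings/some_painting.pt

    git check-ignore -v data/embeddings/clip_embeddings_consolidated.pt
    # prints nothing and exits 1: the ! exception keeps the file trackable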
backend/runner/inference.py CHANGED
@@ -446,3 +446,39 @@ def set_model_type(model_type: str) -> None:
     MODEL_TYPE = model_type
     # Clear the cache to force reinitialization
     _initialize_pipeline.cache_clear()
+
+
+def load_consolidated_embeddings(embedding_file: Path, metadata_file: Path):
+    """Load embeddings from a consolidated file, plus its filename-to-index metadata."""
+    print(f"Loading consolidated embeddings from {embedding_file}")
+
+    # Load the consolidated tensor bundle written by consolidate_embeddings.py
+    consolidated_data = torch.load(embedding_file, map_location='cpu')
+    embeddings = consolidated_data['embeddings']
+
+    # Load metadata for the file mapping
+    with open(metadata_file, 'r', encoding='utf-8') as f:
+        metadata = json.load(f)
+
+    # Create the filename-to-index mapping
+    filename_to_index = {item['filename']: item['index'] for item in metadata['file_mapping']}
+
+    print(f"Loaded {len(embeddings)} embeddings with dimension {embeddings.shape[1]}")
+
+    return embeddings, filename_to_index
+
+
+def load_embeddings_for_model(model_type: str):
+    """Load embeddings for the specified model type."""
+    if model_type == "clip":
+        embedding_file = CLIP_EMBEDDINGS_DIR / "clip_embeddings_consolidated.pt"
+        metadata_file = CLIP_EMBEDDINGS_DIR / "clip_embeddings_metadata.json"
+    else:  # paintingclip
+        embedding_file = PAINTINGCLIP_EMBEDDINGS_DIR / "paintingclip_embeddings_consolidated.pt"
+        metadata_file = PAINTINGCLIP_EMBEDDINGS_DIR / "paintingclip_embeddings_metadata.json"
+
+    if not embedding_file.exists():
+        print(f"Warning: consolidated embedding file not found: {embedding_file}")
+        return None, None
+
+    return load_consolidated_embeddings(embedding_file, metadata_file)
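
For orientation, a minimal sketch of how the returned pair might be consumed for retrieval; find_most_similar and query_embedding are hypothetical names, not part of this commit:

    import torch

    def find_most_similar(query: torch.Tensor, embeddings: torch.Tensor, k: int = 5):
        # Cosine similarity between one query vector and every stored embedding
        query = query / query.norm()
        matrix = embeddings / embeddings.norm(dim=1, keepdim=True)
        scores = matrix @ query  # shape: (num_embeddings,)
        return torch.topk(scores, k)

    # embeddings, filename_to_index = load_embeddings_for_model("clip")
    # index_to_filename = {i: name for name, i in filename_to_index.items()}
    # scores, indices = find_most_similar(query_embedding, embeddings)
    # matches = [index_to_filename[i.item()] for i in indices]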
consolidate_embeddings.py ADDED
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+"""
+Consolidate individual embedding .pt files into larger consolidated files.
+This solves the Hugging Face Spaces 10,000 files per directory limit.
+"""
+
+import torch
+import os
+import json
+from pathlib import Path
+from typing import Dict, List, Tuple
+import argparse
+
+def consolidate_embeddings(
+    input_dir: Path,
+    output_file: Path,
+    metadata_file: Path,
+    batch_size: int = 1000
+) -> Dict[str, int]:
+    """
+    Merge individual .pt files into one large tensor file with metadata.
+
+    Args:
+        input_dir: Directory containing individual .pt files
+        output_file: Path to save consolidated tensor
+        metadata_file: Path to save file mapping metadata
+        batch_size: Process files in batches to manage memory
+
+    Returns:
+        Dict with statistics about the consolidation
+    """
+    embedding_files = sorted(list(input_dir.glob("*.pt")))
+
+    if not embedding_files:
+        raise ValueError(f"No .pt files found in {input_dir}")
+
+    print(f"Found {len(embedding_files)} embedding files in {input_dir}")
+
+    # Load first file to get embedding dimension
+    print("Loading first embedding to determine dimensions...")
+    first_embedding = torch.load(embedding_files[0])
+    embedding_dim = first_embedding.shape[0]
+    print(f"Embedding dimension: {embedding_dim}")
+
+    # Pre-allocate tensor
+    all_embeddings = torch.zeros(len(embedding_files), embedding_dim, dtype=first_embedding.dtype)
+    file_mapping = []
+
+    print(f"Consolidating {len(embedding_files)} embeddings...")
+
+    for i, file_path in enumerate(embedding_files):
+        if i % 1000 == 0:
+            print(f"Processing {i}/{len(embedding_files)} ({i/len(embedding_files)*100:.1f}%)")
+
+        try:
+            embedding = torch.load(file_path)
+            all_embeddings[i] = embedding
+
+            # Store file mapping for later lookup
+            file_mapping.append({
+                'index': i,
+                'filename': file_path.name,
+                'stem': file_path.stem,
+                'file_size': file_path.stat().st_size
+            })
+
+        except Exception as e:
+            print(f"Error loading {file_path}: {e}")
+            # Fill with zeros if file is corrupted
+            all_embeddings[i] = torch.zeros(embedding_dim, dtype=first_embedding.dtype)
+
+    # Save consolidated data
+    print(f"Saving consolidated embeddings to {output_file}...")
+    consolidated_data = {
+        'embeddings': all_embeddings,
+        'embedding_dim': embedding_dim,
+        'num_embeddings': len(embedding_files),
+        'dtype': str(first_embedding.dtype)
+    }
+
+    torch.save(consolidated_data, output_file)
+
+    # Save metadata for lookup
+    print(f"Saving metadata to {metadata_file}...")
+    metadata = {
+        'input_directory': str(input_dir),
+        'output_file': str(output_file),
+        'num_embeddings': len(embedding_files),
+        'embedding_dim': embedding_dim,
+        'dtype': str(first_embedding.dtype),
+        'file_mapping': file_mapping
+    }
+
+    with open(metadata_file, 'w', encoding='utf-8') as f:
+        json.dump(metadata, f, indent=2, ensure_ascii=False)
+
+    # Calculate file sizes
+    original_size = sum(f.stat().st_size for f in embedding_files)
+    consolidated_size = output_file.stat().st_size
+    metadata_size = metadata_file.stat().st_size
+
+    stats = {
+        'num_files_processed': len(embedding_files),
+        'original_size_mb': original_size / (1024 * 1024),
+        'consolidated_size_mb': consolidated_size / (1024 * 1024),
+        'metadata_size_kb': metadata_size / 1024,
+        'compression_ratio': original_size / consolidated_size if consolidated_size > 0 else 0
+    }
+
+    print(f"\nConsolidation complete!")
+    print(f"Files processed: {stats['num_files_processed']}")
+    print(f"Original size: {stats['original_size_mb']:.1f} MB")
+    print(f"Consolidated size: {stats['consolidated_size_mb']:.1f} MB")
+    print(f"Metadata size: {stats['metadata_size_kb']:.1f} KB")
+    print(f"Compression ratio: {stats['compression_ratio']:.2f}x")
+
+    return stats
+
+def main():
+    parser = argparse.ArgumentParser(description='Consolidate embedding files')
+    parser.add_argument('--input-dir', type=str, required=True,
+                        help='Input directory containing .pt files')
+    parser.add_argument('--output-file', type=str, required=True,
+                        help='Output consolidated .pt file')
+    parser.add_argument('--metadata-file', type=str, required=True,
+                        help='Output metadata JSON file')
+    parser.add_argument('--batch-size', type=int, default=1000,
+                        help='Batch size for processing (default: 1000)')
+
+    args = parser.parse_args()
+
+    input_dir = Path(args.input_dir)
+    output_file = Path(args.output_file)
+    metadata_file = Path(args.metadata_file)
+
+    if not input_dir.exists():
+        print(f"Error: Input directory {input_dir} does not exist")
+        return 1
+
+    # Create output directory if it doesn't exist
+    output_file.parent.mkdir(parents=True, exist_ok=True)
+
+    try:
+        stats = consolidate_embeddings(
+            input_dir=input_dir,
+            output_file=output_file,
+            metadata_file=metadata_file,
+            batch_size=args.batch_size
+        )
+        return 0
+    except Exception as e:
+        print(f"Error during consolidation: {e}")
+        return 1
+
+if __name__ == "__main__":
+    exit(main())
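
The metadata files committed below record the arguments this script was run with; the equivalent invocation for the CLIP embeddings, with paths taken from that metadata, would be:

    python consolidate_embeddings.py \
        --input-dir data/embeddings/CLIP_Embeddings \
        --output-file data/embeddings/clip_embeddings_consolidated.pt \
        --metadata-file data/embeddings/clip_embeddings_metadata.json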
data/embeddings/clip_embeddings_consolidated.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebcce37ea66ceb8417a2e52e80e38e1c2970e7ef6026b5546f57a9a09f2f3c85
+size 60604
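
What is committed here is a Git LFS pointer, not the tensor itself; the binary is fetched separately. A typical setup, assuming LFS tracking was configured with a pattern like the one below (the actual .gitattributes entry is not shown in this commit):

    git lfs track "data/embeddings/*_consolidated.pt"   # records the pattern in .gitattributes
    git lfs pull                                        # replaces pointers with the real objects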
data/embeddings/clip_embeddings_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "input_directory": "data/embeddings/CLIP_Embeddings",
+  "output_file": "data/embeddings/clip_embeddings_consolidated.pt",
+  "num_embeddings": 14674,
+  "embedding_dim": 1,
+  "dtype": "torch.float32",
+  "file_mapping": []
+}
data/embeddings/paintingclip_embeddings_consolidated.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:303a47cd9c8b24cead5d27b7e6f23e99c2fc400b03028034447374c19f9adfba
+size 60660
data/embeddings/paintingclip_embeddings_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "input_directory": "data/embeddings/PaintingCLIP_Embeddings",
+  "output_file": "data/embeddings/paintingclip_embeddings_consolidated.pt",
+  "num_embeddings": 14674,
+  "embedding_dim": 1,
+  "dtype": "torch.float32",
+  "file_mapping": []
+}