Spaces:

manisharma494
/

Virtual-Search-System

Sleeping

App Files Files Community

manisharma494 committed on Sep 5

Commit

18a001a

verified ·

1 Parent(s): f65d508

Update app.py

Browse files

Files changed (1) hide show

app.py +189 -70

app.py CHANGED Viewed

@@ -23,6 +23,17 @@ import datetime
 from typing import Optional, Tuple, List
 import torch
 from transformers import CLIPProcessor, CLIPModel
 # -----------------------
 # Configuration
@@ -37,14 +48,15 @@ MAX_IMAGES = 250  # Set to 250 as requested
 JPEG_QUALITY = 85
 TARGET_MAX_SIZE = (800, 800)
 MAX_WORKERS = 6   # Reduced for stability
-RETRY_COUNT = 3
 BATCH_SIZE = 20
 EMB_NPY = EMBED_DIR / "image_embeddings.npy"
 EMB_INDEX_JSON = EMBED_DIR / "index.json"
 # Removed HIST_BINS_PER_CHANNEL and HIST_RANGE as they are no longer used for embedding generation
-CLIP_MODEL = "openai/clip-vit-base-patch32"
 @st.cache_resource
 def load_clip_model():
@@ -52,10 +64,20 @@ def load_clip_model():
     print(f"Loading CLIP model: {CLIP_MODEL}...")
     processor = CLIPProcessor.from_pretrained(CLIP_MODEL)
     model = CLIPModel.from_pretrained(CLIP_MODEL)
     print("CLIP model loaded successfully.")
-    return processor, model
-CLIP_PROCESSOR, CLIP_MODEL_LOCAL = load_clip_model()
 # Phase Constants
 PHASE_IDLE = "idle"
@@ -126,12 +148,23 @@ progress_tracker = SafeProgressTracker()
 # Utility Functions
 # -----------------------
 def ensure_dirs():
-    """Create directories if they don't exist"""
     try:
         IMAGES_DIR.mkdir(parents=True, exist_ok=True)
         EMBED_DIR.mkdir(parents=True, exist_ok=True)
     except Exception as e:
-        print(f"Directory creation error: {e}")
 def seq_filename(i: int) -> str:
     return f"{i:04d}.jpg"
@@ -217,6 +250,7 @@ def download_single_image(i: int, url: str) -> bool:
             response = requests.get(url, stream=True, timeout=(30, 90))
             if response.status_code != 200:
                 if attempt == RETRY_COUNT - 1:
                     return False
                 time.sleep(2 ** attempt)  # Exponential backoff
                 continue
@@ -310,16 +344,70 @@ def create_safe_embedding(img_path: Path) -> np.ndarray:
             return np.zeros(CLIP_MODEL_LOCAL.config.projection_dim, dtype=np.float32)
         img = Image.open(img_path).convert("RGB")
         inputs = CLIP_PROCESSOR(images=img, return_tensors="pt")
         with torch.no_grad():
-            embeddings = CLIP_MODEL_LOCAL.get_image_features(**inputs)
-        return embeddings.squeeze().cpu().numpy().astype(np.float32)
     except Exception as e:
         print(f"Embedding creation error for {img_path}: {e}")
         return np.zeros(CLIP_MODEL_LOCAL.config.projection_dim, dtype=np.float32)
 def process_embeddings_thread_safe() -> bool:
     """Create embeddings in background thread - NO Streamlit APIs"""
     image_files = sorted([f for f in IMAGES_DIR.glob("*.jpg")
@@ -328,6 +416,14 @@ def process_embeddings_thread_safe() -> bool:
     if not image_files:
         progress_tracker.update(PHASE_ERROR, 0, 1, 1, "❌ No images found", "")
         return False
     # Check if embeddings already exist and are current
     try:
@@ -348,68 +444,97 @@ def process_embeddings_thread_safe() -> bool:
     index = []
     processed = 0
     failed = 0
     progress_tracker.update(PHASE_2_EMBEDDING, 0, total, 0,
                            f"🧠 Creating embeddings for {total} images...",
                            "Processing visual features")
     try:
-        for img_file in image_files:
-            embedding = create_safe_embedding(img_file)
-            if np.any(embedding):  # Check if embedding is not all zeros
-                embeddings.append(embedding)
-                index.append(img_file.name)
-            else:
-                failed += 1
-                # Still add to maintain indexing
-                embeddings.append(embedding)
-                index.append(img_file.name)
-            processed += 1
-            # Save in batches for resilience
-            if processed % BATCH_SIZE == 0 or processed == total:
-                try:
-                    if embeddings:
-                        embeddings_array = np.vstack(embeddings).astype(np.float32)
-                        # Atomic save
-                        temp_npy = EMB_NPY.with_suffix('.tmp')
-                        temp_json = EMB_INDEX_JSON.with_suffix('.tmp')
-                        np.save(temp_npy, embeddings_array)
-                        with open(temp_json, 'w') as f:
-                            json.dump(index, f, indent=2)
-                        # Atomic move
-                        temp_npy.replace(EMB_NPY)
-                        temp_json.replace(EMB_INDEX_JSON)
                     else:
-                        # If no valid embeddings were created, ensure files are empty or removed.
-                        # This prevents partial/corrupted files from being considered complete.
-                        if EMB_NPY.exists():
-                            EMB_NPY.unlink()
-                        if EMB_INDEX_JSON.exists():
-                            EMB_INDEX_JSON.unlink()
-                        print("No valid embeddings to save, clearing existing embedding files.")
-                    details = f"💾 Batch saved • 📊 {len(embeddings)} embeddings"
-                    if failed > 0:
-                        details += f" • ⚠️ {failed} errors"
-                    message = f"🧠 Processed {processed}/{total}"
-                    if processed == total:
-                        message = "✅ All embeddings created!"
-                    progress_tracker.update(PHASE_2_EMBEDDING, processed, total, failed,
-                                           message, details)
-                except Exception as e:
-                    progress_tracker.update(PHASE_ERROR, processed, total, failed,
-                                           f"❌ Save failed: {e}", "")
-                    return False
         return True
     except Exception as e:
@@ -667,12 +792,6 @@ def init_session_state():
 def main():
     """Main application - All session state access here"""
-    st.set_page_config(
-        page_title="Visual Search System",
-        page_icon="🔍",
-        layout="wide",
-        initial_sidebar_state="collapsed"
-    )
     apply_styling()
     init_session_state()  # Safe - main thread only

 from typing import Optional, Tuple, List
 import torch
 from transformers import CLIPProcessor, CLIPModel
+import PIL
+# Reduce thread contention in tokenizers
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+st.set_page_config(
+    page_title="Visual Search System",
+    page_icon="🔍",
+    layout="wide",
+    initial_sidebar_state="collapsed"
+)
 # -----------------------
 # Configuration
 JPEG_QUALITY = 85
 TARGET_MAX_SIZE = (800, 800)
 MAX_WORKERS = 6   # Reduced for stability
+RETRY_COUNT = 5
 BATCH_SIZE = 20
+EMBED_BATCH_SIZE = 8
 EMB_NPY = EMBED_DIR / "image_embeddings.npy"
 EMB_INDEX_JSON = EMBED_DIR / "index.json"
 # Removed HIST_BINS_PER_CHANNEL and HIST_RANGE as they are no longer used for embedding generation
+CLIP_MODEL = "openai/clip-vit-small-patch16" # Switched to smaller model
 @st.cache_resource
 def load_clip_model():
     print(f"Loading CLIP model: {CLIP_MODEL}...")
     processor = CLIPProcessor.from_pretrained(CLIP_MODEL)
     model = CLIPModel.from_pretrained(CLIP_MODEL)
+    device = torch.device("cpu")
+    model.to(device)
+    model.eval()
+    # Limit CPU threads to avoid oversubscription on Spaces/limited CPUs
+    try:
+        torch.set_num_threads(max(1, min(4, os.cpu_count() or 2)))
+    except Exception:
+        pass
     print("CLIP model loaded successfully.")
+    return processor, model, device
+CLIP_PROCESSOR, CLIP_MODEL_LOCAL, CLIP_DEVICE = load_clip_model()
+# Removed HF_TOKEN, API_URL, HEADERS as they are no longer used for image embedding
 # Phase Constants
 PHASE_IDLE = "idle"
 # Utility Functions
 # -----------------------
 def ensure_dirs():
+    """Create directories if they don't exist and clean up old progress/temp files"""
     try:
         IMAGES_DIR.mkdir(parents=True, exist_ok=True)
         EMBED_DIR.mkdir(parents=True, exist_ok=True)
+        # Clean up old progress and temp embedding files for a fresh start
+        if PROGRESS_FILE.exists():
+            PROGRESS_FILE.unlink()
+        if SETUP_COMPLETE_FILE.exists():
+            SETUP_COMPLETE_FILE.unlink()
+        for f in EMBED_DIR.glob("*.tmp"): # Clean up any temp embedding files
+            f.unlink()
+        for f in IMAGES_DIR.glob("*.tmp"): # Clean up any temp image files
+            f.unlink()
     except Exception as e:
+        print(f"Directory or cleanup error: {e}")
 def seq_filename(i: int) -> str:
     """Return the JPEG file name for index ``i``, zero-padded to four digits."""
     return "{:04d}.jpg".format(i)
             response = requests.get(url, stream=True, timeout=(30, 90))
             if response.status_code != 200:
                 if attempt == RETRY_COUNT - 1:
+                    print(f"Final download attempt failed for {url}. Status: {response.status_code}")
                     return False
                 time.sleep(2 ** attempt)  # Exponential backoff
                 continue
             return np.zeros(CLIP_MODEL_LOCAL.config.projection_dim, dtype=np.float32)
         img = Image.open(img_path).convert("RGB")
+        print(f"Embedding image: {img_path.name}, size={img.size}, mode={img.mode}")
         inputs = CLIP_PROCESSOR(images=img, return_tensors="pt")
+        inputs = {k: v.to(CLIP_DEVICE) for k, v in inputs.items()}
         with torch.no_grad():
+            image_features = CLIP_MODEL_LOCAL.get_image_features(**inputs)
+        if torch.isnan(image_features).any() or torch.isinf(image_features).any():
+            print(f"NaN/Inf detected in features for {img_path.name}")
+            return np.zeros(CLIP_MODEL_LOCAL.config.projection_dim, dtype=np.float32)
+        vec = image_features.squeeze().detach().cpu().numpy().astype(np.float32)
+        print(f"Feature vector shape: {vec.shape}, dtype: {vec.dtype}")
+        if vec.ndim != 1:
+            vec = vec.reshape(-1)
+        if vec.size != CLIP_MODEL_LOCAL.config.projection_dim:
+            print(f"Warning: feature dim {vec.size} != projection_dim {CLIP_MODEL_LOCAL.config.projection_dim}")
+        return vec
     except Exception as e:
         print(f"Embedding creation error for {img_path}: {e}")
         return np.zeros(CLIP_MODEL_LOCAL.config.projection_dim, dtype=np.float32)
+def create_embeddings_batch(image_paths: List[Path]) -> np.ndarray:
+    """Create embeddings for a batch of images efficiently on CPU.
+    Returns array of shape (batch_size, projection_dim). Fills zeros on failures.
+    """
+    images = []
+    fallback_indices = []
+    for idx, p in enumerate(image_paths):
+        try:
+            if not p.exists() or p.stat().st_size == 0:
+                fallback_indices.append(idx)
+                images.append(Image.new("RGB", (224, 224), color=(0, 0, 0)))
+                continue
+            img = Image.open(p).convert("RGB")
+            # Pre-resize to 224 to reduce CPU and memory
+            img = img.resize((224, 224), Image.Resampling.LANCZOS)
+            images.append(img)
+        except (PIL.UnidentifiedImageError, IOError) as image_err:
+            print(f"Image loading error for {p.name}: {image_err}. Using blank image.")
+            fallback_indices.append(idx)
+            images.append(Image.new("RGB", (224, 224), color=(0, 0, 0)))
+        except Exception as e:
+            print(f"Unexpected error loading image {p.name}: {e}. Using blank image.")
+            fallback_indices.append(idx)
+            images.append(Image.new("RGB", (224, 224), color=(0, 0, 0)))
+    try:
+        inputs = CLIP_PROCESSOR(images=images, return_tensors="pt")
+        inputs = {k: v.to(CLIP_DEVICE) for k, v in inputs.items()}
+        with torch.no_grad():
+            feats = CLIP_MODEL_LOCAL.get_image_features(**inputs)
+        feats = feats.detach().cpu().numpy().astype(np.float32)
+        # Replace fallback rows with zeros explicitly
+        for i in fallback_indices:
+            feats[i, :] =  np.zeros_like(feats[i, :])
+        return feats
+    except Exception as e:
+        print(f"Batch embedding error for {len(image_paths)} images: {e}")
+        # Return None to signal caller to fallback to smaller batch
+        return None
 def process_embeddings_thread_safe() -> bool:
     """Create embeddings in background thread - NO Streamlit APIs"""
     image_files = sorted([f for f in IMAGES_DIR.glob("*.jpg")
     if not image_files:
         progress_tracker.update(PHASE_ERROR, 0, 1, 1, "❌ No images found", "")
         return False
+    # Quick self-test on the first image to detect failures early
+    try:
+        test_vec = create_safe_embedding(image_files[0])
+        if not np.any(test_vec):
+            print(f"Self-test failed on first image: {image_files[0].name}")
+    except Exception as e:
+        print(f"Self-test exception: {e}")
     # Check if embeddings already exist and are current
     try:
     index = []
     processed = 0
     failed = 0
     progress_tracker.update(PHASE_2_EMBEDDING, 0, total, 0,
                            f"🧠 Creating embeddings for {total} images...",
                            "Processing visual features")
     try:
+        current_batch_size = EMBED_BATCH_SIZE
+        for start in range(0, total, current_batch_size):
+            # adaptively chunk
+            end = min(start + current_batch_size, total)
+            batch_files = image_files[start:end]
+            # Try with current batch size; fallback by halving on failure
+            attempts = 0
+            feats = None
+            while attempts < 3:
+                feats = create_embeddings_batch(batch_files)
+                if feats is None:
+                    attempts += 1
+                    if current_batch_size > 4:
+                        current_batch_size = max(4, current_batch_size // 2)
+                        end = min(start + current_batch_size, total)
+                        batch_files = image_files[start:end]
+                        print(f"⚠️ Falling back to smaller batch size: {current_batch_size}")
+                        continue
                     else:
+                        # Hard failure at smallest batch: compute per-image to maximize success
+                        per_feats = []
+                        for p in batch_files:
+                            vec = create_safe_embedding(p)
+                            per_feats.append(vec)
+                        feats = np.vstack(per_feats).astype(np.float32)
+                        break
+                break
+            # Count failures in this batch (rows that are all zeros)
+            if feats.ndim != 2 or feats.shape[0] != len(batch_files):
+                print(f"Unexpected batch feature shape: {feats.shape}, expected ({len(batch_files)}, D)")
+            batch_failed = int((np.linalg.norm(feats, axis=1) < 1e-12).sum()) if feats.size else 0
+            failed += batch_failed
+            embeddings.append(feats)
+            index.extend([p.name for p in batch_files])
+            processed = end
+            # Periodic save by batch for resilience
+            try:
+                if embeddings:
+                    embeddings_array = np.vstack(embeddings).astype(np.float32)
+                    temp_npy = EMB_NPY.with_suffix('.tmp')
+                    temp_json = EMB_INDEX_JSON.with_suffix('.tmp')
+                    np.save(temp_npy, embeddings_array)
+                    with open(temp_json, 'w') as f:
+                        json.dump(index, f, indent=2)
+                    temp_npy.replace(EMB_NPY)
+                    temp_json.replace(EMB_INDEX_JSON)
+            except Exception as e:
+                progress_tracker.update(PHASE_ERROR, processed, total, failed,
+                                       f"❌ Save failed: {e}", "")
+                return False
+            # Free memory after each batch
+            try:
+                import gc
+                del feats
+                gc.collect()
+            except Exception:
+                pass
+            success_rate = ((processed - failed) / processed * 100) if processed > 0 else 0
+            batch_success_count = len(batch_files) - batch_failed
+            print(f"Batch {start//current_batch_size + 1} completed: {batch_success_count} success, {batch_failed} failed.")
+            details = f"💾 Saved up to {processed} • 📊 failures {failed}"
+            message = f"🧠 Processed {processed}/{total} ({success_rate:.1f}%)"
+            progress_tracker.update(PHASE_2_EMBEDDING, processed, total, failed,
+                                   message, details)
+        # Final validation
+        embeddings_array = np.vstack(embeddings).astype(np.float32) if embeddings else np.zeros((0, CLIP_MODEL_LOCAL.config.projection_dim), dtype=np.float32)
+        if embeddings_array.shape[0] != len(index) or len(index) != total:
+            print(f"⚠️ Final size mismatch: emb_rows={embeddings_array.shape[0]}, index={len(index)}, total={total}")
+        print(f"Embedding processing completed. Total failed: {failed}/{total}")
+        # Ensure files saved
+        temp_npy = EMB_NPY.with_suffix('.tmp')
+        temp_json = EMB_INDEX_JSON.with_suffix('.tmp')
+        np.save(temp_npy, embeddings_array)
+        with open(temp_json, 'w') as f:
+            json.dump(index, f, indent=2)
+        temp_npy.replace(EMB_NPY)
+        temp_json.replace(EMB_INDEX_JSON)
         return True
     except Exception as e:
 def main():
     """Main application - All session state access here"""
     apply_styling()
     init_session_state()  # Safe - main thread only