Spaces · Build error

Commit 9f89b03
1 Parent(s): 10fc621

Add Gradio UI logic

Files changed:
- app.py +270 -1
- configs.py +21 -1
- dataset_utils.py +90 -1
- encoders.py +139 -1
- index_builder.py +168 -1
app.py
CHANGED

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from __future__ import annotations

import os
import time
from typing import Callable, Optional, Tuple

import gradio as gr
from PIL import Image

from configs import (
    DEFAULT_SPLIT,
    DEFAULT_TOP_K,
    DEFAULT_MAX_SAMPLES,
    DEFAULT_IMAGE_COL,
    DEFAULT_TEXT_COL,
    DEFAULT_INDEX_DIR,
    EXAMPLE_QUERIES,
    DATASET_NAME,
)
from dataset_utils import load_fashion_dataset, SampleAccessor
from encoders import SiglipEncoder
from index_builder import (
    IndexStatus,
    ensure_index,
    load_faiss_index,
    search_faiss,
    index_signature_from_env,
)


# --------------------------
# Globals initialized lazily
# --------------------------
_encoder: Optional[SiglipEncoder] = None
_accessor: Optional[SampleAccessor] = None
_index_ref = {"index": None, "sig": None, "dim": None}


def _get_encoder(log: Callable[[str], None]) -> SiglipEncoder:
    global _encoder
    if _encoder is None:
        ckpt = os.getenv("SIGLIP_CHECKPOINT_DIR", "siglip_checkpoint")
        log(f"Loading SigLIP checkpoint from: {ckpt}")
        _encoder = SiglipEncoder.from_checkpoint_dir(ckpt, log=log)
        log(f"Device: {_encoder.device}, dtype: {_encoder.dtype}")
    return _encoder


def _get_accessor(split: str, image_col: str, text_col: str, max_samples: int, log: Callable[[str], None]) -> SampleAccessor:
    global _accessor
    # Always create a new accessor matching the current UI state
    _accessor = load_fashion_dataset(
        dataset_name=DATASET_NAME,
        split=split,
        image_col=image_col,
        text_col=text_col,
        max_samples=max_samples,
        log=log,
    )
    log(f"Dataset ready: {len(_accessor)} samples.")
    return _accessor


def _maybe_load_index(sig: str, index_dir: str, log: Callable[[str], None]):
    """Try to load an existing FAISS index matching `sig`."""
    faiss_index, dim = load_faiss_index(index_dir=index_dir, signature=sig, log=log)
    _index_ref["index"] = faiss_index
    _index_ref["sig"] = sig
    _index_ref["dim"] = dim
    return faiss_index, dim


def _build_index(
    split: str,
    image_col: str,
    text_col: str,
    max_samples: int,
    index_dir: str,
    log: Callable[[str], None],
) -> Tuple[IndexStatus, str]:
    """Ensure an index exists and return its status plus signature string."""
    encoder = _get_encoder(log)
    accessor = _get_accessor(split, image_col, text_col, max_samples, log)

    sig = index_signature_from_env(
        dataset_name=DATASET_NAME,
        split=split,
        max_samples=max_samples,
        ckpt_dir=encoder.ckpt_dir,
        image_col=image_col,
        text_col=text_col,
    )
    status = ensure_index(
        accessor=accessor,
        encoder=encoder,
        index_dir=index_dir,
        signature=sig,
        log=log,
    )
    return status, sig


def _log_to_console(msg: str):
    print(msg, flush=True)


def ui_rebuild_index(split, image_col, text_col, max_samples, index_dir):
    logs = []

    def log(s: str):
        logs.append(s)
        _log_to_console(s)

    status, sig = _build_index(split, image_col, text_col, max_samples, index_dir, log)
    _maybe_load_index(sig, index_dir, log)

    footer = f"Index status: {status.value} | signature: {sig}"
    if _index_ref["index"] is not None:
        footer += f" | dim={_index_ref['dim']}"

    # Single return value: the status textbox is the only output component.
    return "\n".join(logs + [footer])


def ui_search(
    query_text: str,
    split: str,
    image_col: str,
    text_col: str,
    max_samples: int,
    top_k: int,
    index_dir: str,
):
    if not query_text or not query_text.strip():
        return [], "Please enter a non-empty query."

    # Prepare a logger to capture build/search messages for the status box
    logs = []

    def log(s: str):
        logs.append(s)
        _log_to_console(s)

    # Make sure the encoder, dataset accessor, and index are aligned with the UI state
    encoder = _get_encoder(log)
    accessor = _get_accessor(split, image_col, text_col, max_samples, log)

    sig = index_signature_from_env(
        dataset_name=DATASET_NAME,
        split=split,
        max_samples=max_samples,
        ckpt_dir=encoder.ckpt_dir,
        image_col=image_col,
        text_col=text_col,
    )
    if _index_ref["index"] is None or _index_ref["sig"] != sig:
        # Try to load from disk; if not present, build
        idx, _ = _maybe_load_index(sig, index_dir, log)
        if idx is None:
            log("Index not found on disk. Building now...")
            status, sig = _build_index(split, image_col, text_col, max_samples, index_dir, log)
            log(f"Index status after build: {status.value}")
            _maybe_load_index(sig, index_dir, log)

    if _index_ref["index"] is None:
        return [], "Index is unavailable. Check logs."

    # Encode the query
    tic = time.time()
    q_emb = encoder.encode_texts([query_text])  # (1, D), already L2-normalized
    encode_ms = (time.time() - tic) * 1000.0

    # Search
    tic = time.time()
    scores, ids = search_faiss(_index_ref["index"], q_emb, top_k=top_k)
    search_ms = (time.time() - tic) * 1000.0

    # Prepare gallery entries: (image, caption) pairs with rank and score
    results = []
    for rank, (idx, score) in enumerate(zip(ids[0], scores[0]), start=1):
        if idx < 0:  # FAISS pads with -1 when fewer than top_k hits exist
            continue
        sample = accessor.get(int(idx))
        img: Image.Image = sample.image
        cap: str = sample.text
        caption = f"#{rank} | score={score:.4f}\n{cap}"
        results.append((img, caption))

    footer = f"Encoded in {encode_ms:.1f} ms, searched in {search_ms:.1f} ms | idx sig: {sig}"
    if logs:
        footer += "\n" + "\n".join(logs)

    return results, footer


def build_ui():
    with gr.Blocks(css="footer {visibility: hidden}") as demo:
        gr.Markdown(
            """
            # 🔎 Text → Image Retrieval (SigLIP + FAISS)
            Dataset: `tomytjandra/h-and-m-fashion-caption` • Index is cached on disk • Works on CPU (default) and uses GPU FAISS if available.
            """
        )

        with gr.Row():
            with gr.Column(scale=2):
                query = gr.Textbox(
                    label="Enter a text query",
                    placeholder="e.g., 'women's red floral dress with long sleeves'",
                )
                gr.Examples(
                    examples=[[q] for q in EXAMPLE_QUERIES],
                    inputs=[query],
                    label="Examples",
                )
                top_k = gr.Slider(1, 50, value=DEFAULT_TOP_K, step=1, label="Top-K")
                search_btn = gr.Button("Search", variant="primary")

            with gr.Column(scale=1):
                split = gr.Dropdown(
                    choices=["train", "validation", "test"],
                    value=DEFAULT_SPLIT,
                    label="Dataset split",
                )
                image_col = gr.Textbox(value=DEFAULT_IMAGE_COL, label="IMAGE_COL")
                text_col = gr.Textbox(value=DEFAULT_TEXT_COL, label="TEXT_COL")
                max_samples = gr.Slider(
                    minimum=100, maximum=200_000, value=DEFAULT_MAX_SAMPLES, step=100,
                    label="MAX_SAMPLES (cap for demo)",
                )
                index_dir = gr.Textbox(value=DEFAULT_INDEX_DIR, label="INDEX_DIR")
                rebuild_btn = gr.Button("(Re)Build Index")

        with gr.Row():
            gallery = gr.Gallery(
                label="Results",
                columns=5,
                height=520,
                preview=True,
                show_label=True,
            )
        status = gr.Textbox(label="Status / Logs", interactive=False)

        # Wire actions
        search_btn.click(
            ui_search,
            inputs=[query, split, image_col, text_col, max_samples, top_k, index_dir],
            outputs=[gallery, status],
        )
        rebuild_btn.click(
            ui_rebuild_index,
            inputs=[split, image_col, text_col, max_samples, index_dir],
            outputs=[status],
        )

        gr.Markdown(
            """
            **Notes**
            - Set `SIGLIP_CHECKPOINT_DIR` to your local SigLIP checkpoint folder (uploaded to this Space).
            - The first run builds an index and caches it under `INDEX_DIR`.
            - Cosine similarity is computed via L2-normalized embeddings on `IndexFlatIP`. If GPU FAISS is available, it is used automatically.
            """
        )
    return demo


if __name__ == "__main__":
    build_ui().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
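For quick verification outside the Space UI, a minimal headless smoke test of the search path could look like the sketch below. The checkpoint path and environment variable value are assumptions; adjust them to your setup.

# smoke_test.py -- hedged sketch, not part of this commit.
# Assumes a SigLIP checkpoint folder exists at ./siglip_checkpoint.
import os

os.environ.setdefault("SIGLIP_CHECKPOINT_DIR", "./siglip_checkpoint")

from app import ui_search
from configs import (
    DEFAULT_SPLIT, DEFAULT_IMAGE_COL, DEFAULT_TEXT_COL,
    DEFAULT_MAX_SAMPLES, DEFAULT_TOP_K, DEFAULT_INDEX_DIR,
)

# Drive the same function the Search button calls, with the configs defaults.
results, status = ui_search(
    "red floral summer dress",
    DEFAULT_SPLIT, DEFAULT_IMAGE_COL, DEFAULT_TEXT_COL,
    DEFAULT_MAX_SAMPLES, DEFAULT_TOP_K, DEFAULT_INDEX_DIR,
)
print(status)
print(f"{len(results)} gallery entries returned")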
configs.py
CHANGED

from __future__ import annotations

DATASET_NAME = "tomytjandra/h-and-m-fashion-caption"

# Default dataset/view options (can be changed in the UI)
DEFAULT_SPLIT = "validation"  # "train" | "validation" | "test"
DEFAULT_IMAGE_COL = "image"   # change if your dataset variant differs
DEFAULT_TEXT_COL = "caption"  # change if your dataset variant differs

DEFAULT_MAX_SAMPLES = 5000  # cap for demo builds; adjustable in the UI
DEFAULT_TOP_K = 12

# Index cache directory inside the Space's persistent storage
DEFAULT_INDEX_DIR = "./index_cache"

EXAMPLE_QUERIES = [
    "red floral summer dress",
    "men's black leather jacket",
    "white sneakers with chunky sole",
    "blue denim jeans for women",
    "kids' yellow raincoat",
]
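These constants are the single source of truth for the demo defaults. If you wanted Space-level overrides without touching the UI, one hypothetical pattern (not part of this commit) is to read environment variables with these values as fallbacks:

# Hypothetical override pattern -- a sketch only; the committed code does not do this.
import os

from configs import DEFAULT_SPLIT, DEFAULT_MAX_SAMPLES

split = os.getenv("SPLIT", DEFAULT_SPLIT)              # e.g. a Space variable
max_samples = int(os.getenv("MAX_SAMPLES", str(DEFAULT_MAX_SAMPLES)))
print(f"Effective defaults: split={split}, max_samples={max_samples}")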
dataset_utils.py
CHANGED

from __future__ import annotations

from dataclasses import dataclass
from typing import Callable, Optional, List

from datasets import load_dataset, Dataset
from PIL import Image


@dataclass
class Sample:
    idx: int
    image: Image.Image
    text: str


class SampleAccessor:
    """A thin wrapper for random access into a loaded HF dataset with normalized columns."""

    def __init__(self, hf_ds: Dataset, image_col: str, text_col: str):
        self.ds = hf_ds
        self.image_col = image_col
        self.text_col = text_col

    def __len__(self) -> int:
        return len(self.ds)

    @staticmethod
    def _to_rgb(img) -> Image.Image:
        if not isinstance(img, Image.Image):
            # datasets may decode to a numpy array depending on features
            img = Image.fromarray(img)
        if img.mode != "RGB":
            img = img.convert("RGB")  # drop alpha/palette for encoders
        return img

    def get(self, i: int) -> Sample:
        row = self.ds[i]
        img = self._to_rgb(row[self.image_col])
        text = str(row[self.text_col])
        return Sample(idx=i, image=img, text=text)

    def batched_images(self, start: int, end: int) -> List[Image.Image]:
        # Slicing an HF Dataset returns a dict of column -> list of values,
        # so iterate the image column directly rather than over "rows".
        cols = self.ds[start:end]
        return [self._to_rgb(img) for img in cols[self.image_col]]

    def texts(self, start: int, end: int) -> List[str]:
        cols = self.ds[start:end]
        return [str(t) for t in cols[self.text_col]]


def load_fashion_dataset(
    dataset_name: str,
    split: str,
    image_col: str,
    text_col: str,
    max_samples: int,
    log: Optional[Callable[[str], None]] = None,
) -> SampleAccessor:
    """Load and normalize the H&M fashion caption dataset.

    Dataset versions may vary in column names, so user-specified columns are accepted.
    """
    if log:
        log(f"Loading dataset: {dataset_name} [{split}] (max_samples={max_samples})")
    ds = load_dataset(dataset_name, split=split, streaming=False)
    total = len(ds)
    if log:
        log(f"Dataset size (split={split}): {total}")

    # Trim to max_samples for the demo
    if max_samples is not None and max_samples < total:
        ds = ds.select(range(max_samples))

    # Validate columns
    for col in (image_col, text_col):
        if col not in ds.column_names:
            raise KeyError(
                f"Column '{col}' not found in dataset. Available: {ds.column_names}. "
                "Adjust IMAGE_COL/TEXT_COL in the UI."
            )

    return SampleAccessor(hf_ds=ds, image_col=image_col, text_col=text_col)
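A minimal sketch of exercising SampleAccessor directly, assuming the configs.py default column names (adjust image_col/text_col if your copy of the dataset differs, as the docstring above notes), with a tiny sample cap so it runs quickly:

# Hedged usage sketch for dataset_utils; column names are the configs defaults.
from dataset_utils import load_fashion_dataset

acc = load_fashion_dataset(
    dataset_name="tomytjandra/h-and-m-fashion-caption",
    split="train",
    image_col="image",
    text_col="caption",
    max_samples=8,
    log=print,
)
s = acc.get(0)
print(s.idx, s.image.size, s.image.mode, s.text[:60])
print(len(acc.batched_images(0, 4)), "images;", acc.texts(0, 4))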
encoders.py
CHANGED

from __future__ import annotations

import contextlib
import os
from dataclasses import dataclass
from typing import List, Optional, Callable

import torch
import numpy as np
from PIL import Image
from transformers import AutoModel, AutoProcessor


def _pick_device() -> torch.device:
    if torch.cuda.is_available():
        return torch.device("cuda")
    # Apple Silicon
    if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")


def _pick_dtype(device: torch.device) -> torch.dtype:
    if device.type == "cuda":
        # Prefer bf16 if supported; else fp16
        return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    # mps/cpu: stay in float32 for accuracy
    return torch.float32


@dataclass
class SiglipEncoder:
    model: AutoModel
    processor: AutoProcessor
    device: torch.device
    dtype: torch.dtype
    ckpt_dir: str

    @classmethod
    def from_checkpoint_dir(cls, ckpt_dir: str, log: Optional[Callable[[str], None]] = None) -> "SiglipEncoder":
        if not os.path.isdir(ckpt_dir):
            raise FileNotFoundError(
                f"SIGLIP_CHECKPOINT_DIR not found: {ckpt_dir}. "
                "Upload your SigLIP checkpoint folder to the Space and set the env var."
            )
        device = _pick_device()
        dtype = _pick_dtype(device)

        if log:
            log(f"Loading processor/model from {ckpt_dir} (device={device}, dtype={dtype})")

        processor = AutoProcessor.from_pretrained(ckpt_dir, trust_remote_code=True)
        model = AutoModel.from_pretrained(ckpt_dir, trust_remote_code=True)

        model.to(device)
        model.eval()
        return cls(model=model, processor=processor, device=device, dtype=dtype, ckpt_dir=ckpt_dir)

    # ---------- Embedding helpers ----------

    def _maybe_autocast(self):
        # CUDA AMP context; a no-op context on mps/cpu
        if self.device.type == "cuda" and self.dtype in (torch.float16, torch.bfloat16):
            return torch.autocast(device_type="cuda", dtype=self.dtype)
        return contextlib.nullcontext()

    def _normalize(self, x: np.ndarray) -> np.ndarray:
        norms = np.linalg.norm(x, axis=1, keepdims=True) + 1e-12
        return x / norms

    def _pool_mean(self, last_hidden_state: torch.Tensor, attention_mask: Optional[torch.Tensor]) -> torch.Tensor:
        # Mean pooling with attention mask
        if attention_mask is None:
            return last_hidden_state.mean(dim=1)
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
        summed = (last_hidden_state * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-6)
        return summed / counts

    def _forward_image(self, pixel_values: torch.Tensor) -> torch.Tensor:
        # Try common signatures: get_image_features or forward(...).image_embeds;
        # fall back to mean-pooling the vision tower's last_hidden_state.
        if hasattr(self.model, "get_image_features"):
            return self.model.get_image_features(pixel_values=pixel_values)
        out = self.model(pixel_values=pixel_values)
        if hasattr(out, "image_embeds") and out.image_embeds is not None:
            return out.image_embeds
        if hasattr(out, "last_hidden_state"):
            return out.last_hidden_state.mean(dim=1)
        raise RuntimeError("Unable to extract image embeddings from model outputs.")

    def _forward_text(self, **text_inputs) -> torch.Tensor:
        if hasattr(self.model, "get_text_features"):
            return self.model.get_text_features(**text_inputs)
        out = self.model(**text_inputs)
        if hasattr(out, "text_embeds") and out.text_embeds is not None:
            return out.text_embeds
        if hasattr(out, "last_hidden_state"):
            return self._pool_mean(out.last_hidden_state, text_inputs.get("attention_mask"))
        raise RuntimeError("Unable to extract text embeddings from model outputs.")

    @torch.no_grad()
    def encode_images(self, images: List[Image.Image], batch_size: int = 64) -> np.ndarray:
        """Encode a list of PIL images to L2-normalized embeddings."""
        feats: List[np.ndarray] = []
        with self._maybe_autocast():
            for i in range(0, len(images), batch_size):
                batch = images[i : i + batch_size]
                # Ensure RGB
                batch = [im.convert("RGB") if im.mode != "RGB" else im for im in batch]
                inputs = self.processor(images=batch, return_tensors="pt")
                pixel_values = inputs["pixel_values"].to(
                    self.device, dtype=self.dtype if self.device.type == "cuda" else torch.float32
                )
                embs = self._forward_image(pixel_values)  # (B, D)
                feats.append(embs.float().cpu().numpy())
        feats_np = np.concatenate(feats, axis=0)
        return self._normalize(feats_np)

    @torch.no_grad()
    def encode_texts(self, texts: List[str], batch_size: int = 128) -> np.ndarray:
        """Encode a list of texts to L2-normalized embeddings."""
        feats: List[np.ndarray] = []
        with self._maybe_autocast():
            for i in range(0, len(texts), batch_size):
                batch = texts[i : i + batch_size]
                inputs = self.processor(text=batch, return_tensors="pt", padding=True, truncation=True)
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                embs = self._forward_text(**inputs)  # (B, D)
                feats.append(embs.float().cpu().numpy())
        feats_np = np.concatenate(feats, axis=0)
        return self._normalize(feats_np)
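Since both encode paths L2-normalize their outputs, inner products between embeddings are cosine similarities. A small sanity-check sketch, assuming a valid checkpoint at ./siglip_checkpoint (an illustrative path):

# Hedged sketch: verify unit-norm embeddings, so dot product == cosine similarity.
import numpy as np

from encoders import SiglipEncoder

enc = SiglipEncoder.from_checkpoint_dir("./siglip_checkpoint", log=print)
embs = enc.encode_texts(["red dress", "black leather jacket"])
print(embs.shape)                    # (2, D)
print(np.linalg.norm(embs, axis=1))  # both norms ~1.0
print(float(embs[0] @ embs[1]))      # cosine similarity in [-1, 1]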
index_builder.py
CHANGED

from __future__ import annotations

import os
import json
import time
import hashlib
from enum import Enum
from typing import Optional, Callable

import numpy as np

try:
    import faiss  # type: ignore
except Exception as e:
    raise RuntimeError(
        "Failed to import faiss. Ensure 'faiss-cpu' is in requirements.txt."
    ) from e

from dataset_utils import SampleAccessor
from encoders import SiglipEncoder


class IndexStatus(Enum):
    CREATED = "CREATED"
    LOADED = "LOADED"
    SKIPPED_FOUND = "SKIPPED_FOUND"
    UPDATED = "UPDATED"


def index_signature_from_env(
    dataset_name: str,
    split: str,
    max_samples: int,
    ckpt_dir: str,
    image_col: str,
    text_col: str,
) -> str:
    """Create a stable signature for the on-disk index cache."""
    # Include a hash of the checkpoint's config.json if it exists
    cfg_path = os.path.join(ckpt_dir, "config.json")
    cfg_hash = "nocfg"
    if os.path.isfile(cfg_path):
        try:
            with open(cfg_path, "rb") as f:
                cfg_hash = hashlib.md5(f.read()).hexdigest()[:10]
        except Exception:
            pass
    base = json.dumps(
        {
            "dataset": dataset_name,
            "split": split,
            "max_samples": int(max_samples),
            "ckpt": os.path.basename(os.path.abspath(ckpt_dir)),
            "cfg": cfg_hash,
            "image_col": image_col,
            "text_col": text_col,
        },
        sort_keys=True,
    )
    return hashlib.sha1(base.encode("utf-8")).hexdigest()[:16]


def _index_paths(index_dir: str, signature: str):
    os.makedirs(index_dir, exist_ok=True)
    idx_path = os.path.join(index_dir, f"{signature}.faiss")
    meta_path = os.path.join(index_dir, f"{signature}.meta.json")
    return idx_path, meta_path


def _maybe_gpu(index):
    """If FAISS GPU is available, move the index to GPU; else return it as-is."""
    try:
        if faiss.get_num_gpus() > 0:
            res = faiss.StandardGpuResources()
            return faiss.index_cpu_to_gpu(res, 0, index)
    except Exception:
        pass
    return index


def _normalize_rows(x: np.ndarray) -> np.ndarray:
    norms = np.linalg.norm(x, axis=1, keepdims=True) + 1e-12
    return x / norms


def ensure_index(
    accessor: SampleAccessor,
    encoder: SiglipEncoder,
    index_dir: str,
    signature: str,
    log: Optional[Callable[[str], None]] = None,
) -> IndexStatus:
    """Create the FAISS index if not present; otherwise leave it untouched."""
    idx_path, meta_path = _index_paths(index_dir, signature)

    if os.path.isfile(idx_path) and os.path.isfile(meta_path):
        if log:
            log(f"Index already exists at {idx_path}")
        return IndexStatus.SKIPPED_FOUND

    # Encode all images in batches
    n = len(accessor)
    if log:
        log(f"Encoding {n} images to build index ...")

    batch = 512
    feats = []
    t0 = time.time()
    for start in range(0, n, batch):
        end = min(n, start + batch)
        imgs = accessor.batched_images(start, end)
        emb = encoder.encode_images(imgs)  # (B, D), L2-normalized
        feats.append(emb)
        if log:
            pct = (end / n) * 100.0
            log(f"Progress: {end}/{n} ({pct:.1f}%)")

    feats_np = np.concatenate(feats, axis=0).astype("float32", copy=False)
    dim = feats_np.shape[1]

    # Cosine similarity via inner product on normalized vectors
    cpu_index = faiss.IndexFlatIP(dim)
    cpu_index.add(feats_np)

    # Save to disk (CPU index for compatibility)
    faiss.write_index(cpu_index, idx_path)

    # Save meta information
    meta = {
        "signature": signature,
        "size": int(n),
        "dim": int(dim),
        "created_at": time.time(),
        "index_path": os.path.basename(idx_path),
        "notes": "Embeddings are L2-normalized; cosine == inner product.",
    }
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

    if log:
        log(f"Index built in {(time.time() - t0):.2f}s. Saved to {idx_path}")
    return IndexStatus.CREATED


def load_faiss_index(index_dir: str, signature: str, log: Optional[Callable[[str], None]] = None):
    idx_path, meta_path = _index_paths(index_dir, signature)
    if not (os.path.isfile(idx_path) and os.path.isfile(meta_path)):
        return None, None

    with open(meta_path, "r", encoding="utf-8") as f:
        meta = json.load(f)
    idx = faiss.read_index(idx_path)
    dim = int(meta.get("dim", idx.d))
    # Try moving to GPU
    idx = _maybe_gpu(idx)
    if log:
        log(f"Loaded FAISS index: {idx_path} (dim={dim})")
    return idx, dim


def search_faiss(index, query_embs: np.ndarray, top_k: int = 10):
    """Search FAISS (inner product) with L2-normalized query embeddings."""
    assert query_embs.ndim == 2
    # Ensure L2-normalized
    q = _normalize_rows(query_embs.astype("float32", copy=False))
    scores, ids = index.search(q, int(top_k))
    return scores, ids
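Putting the pieces together, a hedged end-to-end sketch of the build-then-search flow these helpers implement (the checkpoint path, split, sample cap, and column names are illustrative; app.py drives the same calls from the UI):

# End-to-end sketch: compute the cache signature, build or reuse the index, search it.
from dataset_utils import load_fashion_dataset
from encoders import SiglipEncoder
from index_builder import (
    index_signature_from_env, ensure_index, load_faiss_index, search_faiss,
)

encoder = SiglipEncoder.from_checkpoint_dir("./siglip_checkpoint", log=print)
accessor = load_fashion_dataset(
    dataset_name="tomytjandra/h-and-m-fashion-caption",
    split="train", image_col="image", text_col="caption",
    max_samples=500, log=print,
)
sig = index_signature_from_env(
    dataset_name="tomytjandra/h-and-m-fashion-caption", split="train",
    max_samples=500, ckpt_dir=encoder.ckpt_dir,
    image_col="image", text_col="caption",
)
status = ensure_index(accessor, encoder, "./index_cache", sig, log=print)
index, dim = load_faiss_index("./index_cache", sig, log=print)

q = encoder.encode_texts(["white sneakers with chunky sole"])
scores, ids = search_faiss(index, q, top_k=5)
for score, i in zip(scores[0], ids[0]):
    print(f"{score:.4f}  {accessor.get(int(i)).text[:70]}")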