Spaces:

fkonrad
/

ViT-Visualizer

Starting

App Files Files Community

Felix Konrad committed on Sep 9

Commit

2ec5753

1 Parent(s): 57c8491

Added proper Cosine-Similarity Computation + Visualization

Browse files

Files changed (1) hide show

app.py +70 -17

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import gradio as gr
 from transformers import AutoModel, AutoImageProcessor
@@ -7,29 +8,68 @@ import torch
 # Global state to store loaded model + processor
 state = {
     "model": None,
     "processor": None,
     "repo_id": None,
 }
def plot_similarity_heatmap(sim_array: np.ndarray):
    """
    Render a 2D similarity map as a heatmap image with a colorbar.

    sim_array: 2D numpy array of shape (h, w)
    Returns an (H, W, 3) uint8 RGB numpy array (not a PIL image) that can be
    displayed in Gradio, e.g. as the value of a gr.Image.
    """
    fig, ax = plt.subplots(figsize=(5, 5))
    try:
        cax = ax.imshow(sim_array, cmap="viridis")
        ax.set_xticks([])
        ax.set_yticks([])
        fig.colorbar(cax)
        fig.canvas.draw()
        # FigureCanvas.tostring_rgb() was deprecated in Matplotlib 3.8 and
        # removed in later releases. buffer_rgba() already exposes the pixels
        # as (H, W, 4), so no manual reshape from get_width_height() is needed.
        # NOTE(review): assumes an Agg-based canvas (the headless default).
        rgba = np.asarray(fig.canvas.buffer_rgba())
        # Drop the alpha channel and copy so the result does not alias the
        # canvas buffer, which is released when the figure is closed.
        img = rgba[..., :3].copy()
    finally:
        # Always release the figure, even if rendering failed.
        plt.close(fig)
    return img
 def load_model(repo_id: str, revision: str = None):
@@ -44,6 +84,8 @@ def load_model(repo_id: str, revision: str = None):
             model.to("cuda")
         else:
             model.to("cpu")
         # Store in global state
         state["model"] = model
         state["processor"] = processor
@@ -58,10 +100,21 @@ def display_image(image: Image):
     """
     return image
 # Build the Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# Dynamic ViT Loader Template")
     with gr.Row():
         repo_input = gr.Textbox(label="Hugging Face model repo ID", placeholder="e.g. google/vit-base-patch16-224")
         revision_input = gr.Textbox(label="Revision (optional)", placeholder="branch, tag, or commit hash")
@@ -72,13 +125,13 @@ with gr.Blocks() as demo:
     image_output = gr.Image(label="Displayed Image")
     # cos-sim visualization:
-    # sim_array is your (h, w) numpy array
-    sim_array = np.random.normal((128, 128))
-    heatmap_img = plot_similarity_heatmap(sim_array)
-    gr.Image(value=heatmap_img, label="Cosine Similarity Heatmap")
     # Button clicks / image upload handlers
     load_btn.click(fn=load_model, inputs=[repo_input, revision_input], outputs=load_status)
     image_input.change(fn=display_image, inputs=image_input, outputs=image_output)
 demo.launch()

 import matplotlib.pyplot as plt
+import matplotlib.cm as cm
 import numpy as np
 import gradio as gr
 from transformers import AutoModel, AutoImageProcessor
 # Global state to store loaded model + processor
 state = {
+    "model_type": None,
     "model": None,
     "processor": None,
     "repo_id": None,
 }
def similarity_heatmap(image):
    """
    Compute the cosine similarity between the CLS token and every patch token.

    Uses the model and processor currently stored in the global ``state``.

    image: input image in a format accepted by the loaded processor
    Returns a float32 numpy array of shape (H_patch, W_patch).
    Raises RuntimeError if no model/processor is loaded, and ValueError if the
    model output has too few tokens for the expected patch grid.
    """
    model, processor = state["model"], state["processor"]
    if model is None or processor is None:
        raise RuntimeError("No model loaded; load a model before computing the heatmap.")

    inputs = processor(images=image, return_tensors="pt")
    pixel_values = inputs["pixel_values"].to(model.device)  # shape: (1, 3, H, W)

    # get ViT patch size (from model config); accept an int or an (h, w) pair
    patch_size = model.config.patch_size  # usually 16
    if isinstance(patch_size, int):
        patch_h = patch_w = patch_size
    else:
        patch_h, patch_w = patch_size

    # Compute patch grid (needed for resizing later)
    H_patch = pixel_values.shape[2] // patch_h
    W_patch = pixel_values.shape[3] // patch_w
    num_patches = H_patch * W_patch

    with torch.no_grad():
        outputs = model(pixel_values)
    last_hidden_state = outputs.last_hidden_state  # (1, seq_len, hidden_dim)

    seq_len = last_hidden_state.shape[1]
    if num_patches == 0 or seq_len < num_patches + 1:
        raise ValueError(
            f"Expected a CLS token plus {num_patches} patch tokens "
            f"({H_patch}x{W_patch} grid), but the model returned {seq_len} tokens."
        )

    cls_token = last_hidden_state[:, :1, :]  # shape: (1, 1, hidden_dim)
    # Take the trailing num_patches tokens rather than everything after index 0,
    # so extra special tokens (distillation / register tokens) do not break the
    # reshape below.
    # NOTE(review): assumes special tokens precede the patch tokens - confirm
    # for architectures other than ViT-style models.
    patch_tokens = last_hidden_state[:, -num_patches:, :]  # (1, num_patches, hidden_dim)

    # cosine_similarity clamps the norms with a small eps, so an all-zero token
    # yields 0 instead of NaN.
    cos_sim = torch.nn.functional.cosine_similarity(patch_tokens, cls_token, dim=-1)
    cos_grid = cos_sim[0].reshape(H_patch, W_patch)

    # Move to CPU before converting: numpy cannot read CUDA tensors directly.
    return cos_grid.detach().float().cpu().numpy()
def overlay_cosine_grid_on_image(cos_grid: np.ndarray, image: Image.Image, alpha=0.5, colormap="viridis"):
    """
    Blend a colour-mapped cosine-similarity grid over an image.

    cos_grid: (H_patch, W_patch) numpy array of cosine similarities
    image: PIL.Image (a numpy image array is also accepted and converted)
    alpha: blending factor passed to Image.blend (0 = image only, 1 = heatmap only)
    colormap: matplotlib colormap name
    Returns the blended RGBA PIL image, at the size of the input image.
    """
    # NOTE(review): gr.Image may hand over a numpy array depending on its
    # `type` setting (not visible here); convert so .size/.convert work.
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    # Normalize cosine values to [0, 1] for colormap; the epsilon keeps a
    # constant grid from dividing by zero.
    grid_min, grid_max = cos_grid.min(), cos_grid.max()
    norm_grid = (cos_grid - grid_min) / (grid_max - grid_min + 1e-8)

    # Apply colormap. matplotlib.cm.get_cmap was removed in Matplotlib 3.9;
    # pyplot.get_cmap is available across versions.
    cmap = plt.get_cmap(colormap)
    heatmap_rgba = cmap(norm_grid)  # shape: (H_patch, W_patch, 4)

    # Convert to RGB 0-255
    heatmap_rgb = (heatmap_rgba[:, :, :3] * 255).astype(np.uint8)
    heatmap_img = Image.fromarray(heatmap_rgb)

    # Resize heatmap to match original image size
    heatmap_resized = heatmap_img.resize(image.size, resample=Image.BILINEAR)

    # Blend with original image; both inputs must share mode and size
    return Image.blend(image.convert("RGBA"), heatmap_resized.convert("RGBA"), alpha=alpha)
 def load_model(repo_id: str, revision: str = None):
             model.to("cuda")
         else:
             model.to("cpu")
+        model.eval()
         # Store in global state
         state["model"] = model
         state["processor"] = processor
     """
     return image
def visualize_cosine_heatmap(image: Image.Image):
    """
    Gradio callback: compute the CLS-to-patch cosine heatmap and overlay it.

    image: the image to analyse, as delivered by the Gradio image input
    Returns the blended overlay image, or None when no model has been loaded
    or no image has been provided yet.
    """
    # Nothing to compute until both a model and an image are available
    # (e.g. the button was clicked before uploading an image).
    if state["model"] is None or image is None:
        return None  # or placeholder image
    cos_grid = similarity_heatmap(image)
    return overlay_cosine_grid_on_image(cos_grid, image)
 # Build the Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# Dynamic ViT Loader Template")
+    # TODO: Add a drop-down menu (or similar) that lets the user choose the model type (e.g. DINOv2, Google ViT-Base, etc.)
+    # ...
     with gr.Row():
         repo_input = gr.Textbox(label="Hugging Face model repo ID", placeholder="e.g. google/vit-base-patch16-224")
         revision_input = gr.Textbox(label="Revision (optional)", placeholder="branch, tag, or commit hash")
     image_output = gr.Image(label="Displayed Image")
     # cos-sim visualization:
+    heatmap_output = gr.Image(label="Cosine Similarity Heatmap")
     # Button clicks / image upload handlers
     load_btn.click(fn=load_model, inputs=[repo_input, revision_input], outputs=load_status)
     image_input.change(fn=display_image, inputs=image_input, outputs=image_output)
+    compute_btn = gr.Button("Compute Heatmap")
+    compute_btn.click(fn=visualize_cosine_heatmap, inputs=image_input, outputs=heatmap_output)
 demo.launch()