WalidBouss committed be1ec96 (parent: 55fd6e9): Initial commit :tada:

Files changed:
- .gitattributes                    +1 -0
- LICENSE                           +21 -0
- app.py                            +84 -0
- assets/cats_remote_control.jpeg   +3 -0
- assets/elon_jeff_mark.jpeg        +3 -0
- gem/__init__.py                   +1 -0
- gem/gem.py                        +188 -0
- gem/gem_utils.py                  +194 -0
- gem/gem_wrapper.py                +123 -0
- requirements.txt                  +15 -0
- setup.py                          +57 -0
- test_examples.py                  +60 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Walid Bousselham, Felix Petersen, Vittorio Ferrari, Hilde Kuehne.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
app.py
ADDED
@@ -0,0 +1,84 @@
from PIL import Image
import numpy as np
import cv2
import torch
import requests

import gradio as gr

import gem


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# OpenCLIP
model_name = 'ViT-B-16-quickgelu'
pretrained = 'metaclip_400m'
preprocess = gem.get_gem_img_transform()
# global gem_model
gem_model = gem.create_gem_model(model_name=model_name, pretrained=pretrained, device=device)
image_source = "image"
_MODELS = {
    "OpenAI": ('ViT-B-16', 'openai'),
    "MetaCLIP": ('ViT-B-16-quickgelu', 'metaclip_400m'),
    "OpenCLIP": ('ViT-B-16', 'laion400m_e32')
}

def change_weights(pretrained_weights):
    """Handle changing the model's weights, triggered by a Dropdown change."""
    curr_model = pretrained_weights
    _new_model = _MODELS[pretrained_weights]
    print(_new_model)
    global gem_model
    gem_model = gem.create_gem_model(model_name=_new_model[0], pretrained=_new_model[1], device=device)

def change_to_url(url):
    img_pil = Image.open(requests.get(url, stream=True).raw).convert('RGB')
    return img_pil

def viz_func(url, image, text, model_weights):
    image_torch = preprocess(image).unsqueeze(0).to(device)
    with torch.no_grad():
        logits = gem_model(image_torch, [text])
        logits = logits[0].detach().cpu().numpy()

    img_cv = cv2.cvtColor(np.array(image.resize((448, 448))), cv2.COLOR_RGB2BGR)
    logit_cs_viz = (logits * 255).astype('uint8')
    heat_maps_cs = [cv2.applyColorMap(logit, cv2.COLORMAP_JET) for logit in logit_cs_viz]

    vizs = [0.4 * img_cv + 0.6 * heat_map for heat_map in heat_maps_cs]
    vizs = [cv2.cvtColor(viz.astype('uint8'), cv2.COLOR_BGR2RGB) for viz in vizs]
    return vizs[0]

inputs = [
    gr.Textbox(label="url to the image"),
    gr.Image(type="pil"),
    gr.Textbox(label="Text Prompt"),
    gr.Dropdown(["OpenAI", "MetaCLIP", "OpenCLIP"], label="Pretrained Weights", value="MetaCLIP",
                info='It can take a few seconds for the model to be updated.'),
]

with gr.Blocks() as demo:
    inputs[-1].change(fn=change_weights, inputs=[inputs[-1]])
    inputs[0].change(fn=change_to_url, outputs=inputs[1], inputs=inputs[0])

    interact = gr.Interface(
        title="GEM: Grounding Everything Module (link to paper/code)",
        description="Grounding Everything: Emerging Localization Properties in Vision-Language Transformers",
        fn=viz_func,
        inputs=inputs,
        outputs=["image"],
    )

    gr.Examples(
        [
            ["assets/cats_remote_control.jpeg", "cat"],
            ["assets/cats_remote_control.jpeg", "remote control"],
            ["assets/elon_jeff_mark.jpeg", "elon musk"],
            ["assets/elon_jeff_mark.jpeg", "mark zuckerberg"],
            ["assets/elon_jeff_mark.jpeg", "jeff bezos"],
        ],
        [inputs[1], inputs[2]]
    )

# demo.launch(server_port=5152)
demo.launch()
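Usage note (not part of the commit): the snippet below is a minimal sketch of how the heat-map pipeline in viz_func could be exercised without starting the Gradio server, for example by temporarily replacing the final demo.launch() call in app.py. It assumes the module-level objects defined above (preprocess, gem_model) have already been created; the prompt "cat" and the bundled asset path are only illustrative.

# Minimal sketch, assuming app.py's preprocess/gem_model/viz_func are already defined.
img = Image.open("assets/cats_remote_control.jpeg").convert("RGB")
overlay = viz_func(url="", image=img, text="cat", model_weights="MetaCLIP")  # url/model_weights are unused by viz_func
print(overlay.shape, overlay.dtype)  # expected: (448, 448, 3) uint8 RGB heat-map overlay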
assets/cats_remote_control.jpeg
ADDED (Git LFS)

assets/elon_jeff_mark.jpeg
ADDED (Git LFS)
gem/__init__.py
ADDED
@@ -0,0 +1 @@
from .gem import *
gem/gem.py
ADDED
@@ -0,0 +1,188 @@
import logging
from typing import Any, Union, List, Optional, Tuple, Dict
import open_clip
from open_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD

import torch
from torchvision import transforms
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import cv2

from .gem_wrapper import GEMWrapper


_MODELS = {
    # B/32
    "ViT-B/32": [
        "openai",
        "laion400m_e31",
        "laion400m_e32",
        "laion2b_e16",
        "laion2b_s34b_b79k",
    ],
    "ViT-B/32-quickgelu": [
        "metaclip_400m",
        "metaclip_fullcc"
    ],
    # B/16
    "ViT-B/16": [
        "openai",
        "laion400m_e31",
        "laion400m_e32",
        "laion2b_s34b_b88k",
    ],
    "ViT-B/16-quickgelu": [
        "metaclip_400m",
        "metaclip_fullcc",
    ],
    "ViT-B/16-plus-240": [
        "laion400m_e31",
        "laion400m_e32"
    ],
    # L/14
    "ViT-L/14": [
        "openai",
        "laion400m_e31",
        "laion400m_e32",
        "laion2b_s32b_b82k",
    ],
    "ViT-L/14-quickgelu": [
        "metaclip_400m",
        "metaclip_fullcc"
    ],
    "ViT-L/14-336": [
        "openai",
    ]
}

def available_models() -> str:
    """Return a formatted string listing the available GEM-VL models and their pretrained weights."""
    _str = "".join([": ".join([key + " " * (20 - len(key)), value]) + "\n" for key, values in _MODELS.items() for value in values])
    return _str

def get_tokenizer(
        model_name: str = '',
        context_length: Optional[int] = None,
        **kwargs,
):
    """Wrapper around the open_clip get_tokenizer function."""
    return open_clip.get_tokenizer(model_name=model_name, context_length=context_length, **kwargs)


def get_gem_img_transform(
        img_size: Union[int, Tuple[int, int]] = (448, 448),
        mean: Optional[Tuple[float, ...]] = None,
        std: Optional[Tuple[float, ...]] = None,
):
    mean = mean or OPENAI_DATASET_MEAN
    std = std or OPENAI_DATASET_STD
    transform = transforms.Compose([
        transforms.Resize(size=img_size, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    return transform


def create_gem_model(
        model_name: str,
        pretrained: Optional[str] = None,
        gem_depth: int = 7,
        ss_attn_iter: int = 1,
        ss_attn_temp: Optional[float] = None,
        precision: str = 'fp32',
        device: Union[str, torch.device] = 'cpu',
        jit: bool = False,
        force_quick_gelu: bool = False,
        force_custom_text: bool = False,
        force_patch_dropout: Optional[float] = None,
        force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
        force_preprocess_cfg: Optional[Dict[str, Any]] = None,
        pretrained_image: bool = False,
        pretrained_hf: bool = True,
        cache_dir: Optional[str] = None,
        output_dict: Optional[bool] = None,
        require_pretrained: bool = False,
        **model_kwargs,
):
    model_name = model_name.replace("/", "-")
    logging.info(f'Loading pretrained {model_name} from pretrained weights {pretrained}...')
    open_clip_model = open_clip.create_model(model_name, pretrained, precision, device, jit, force_quick_gelu, force_custom_text,
                                             force_patch_dropout, force_image_size, force_preprocess_cfg, pretrained_image,
                                             pretrained_hf, cache_dir, output_dict, require_pretrained, **model_kwargs)
    tokenizer = open_clip.get_tokenizer(model_name=model_name)

    gem_model = GEMWrapper(model=open_clip_model, tokenizer=tokenizer, depth=gem_depth,
                           ss_attn_iter=ss_attn_iter, ss_attn_temp=ss_attn_temp)
    logging.info(f'Loaded GEM-{model_name} from pretrained weights {pretrained}!')
    return gem_model

def create_model_and_transforms(
        model_name: str,
        pretrained: Optional[str] = None,
        gem_depth: int = 7,
        precision: str = 'fp32',
        device: Union[str, torch.device] = 'cpu',
        jit: bool = False,
        force_quick_gelu: bool = False,
        force_custom_text: bool = False,
        force_patch_dropout: Optional[float] = None,
        force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
        force_preprocess_cfg: Optional[Dict[str, Any]] = None,
        pretrained_image: bool = False,
        pretrained_hf: bool = True,
        cache_dir: Optional[str] = None,
        output_dict: Optional[bool] = None,
        require_pretrained: bool = False,
        **model_kwargs,
):
    # Pass arguments by keyword so that precision/device/jit are not shifted into the
    # ss_attn_iter/ss_attn_temp slots of create_gem_model's signature.
    gem_model = create_gem_model(model_name=model_name, pretrained=pretrained, gem_depth=gem_depth,
                                 precision=precision, device=device, jit=jit,
                                 force_quick_gelu=force_quick_gelu, force_custom_text=force_custom_text,
                                 force_patch_dropout=force_patch_dropout, force_image_size=force_image_size,
                                 force_preprocess_cfg=force_preprocess_cfg, pretrained_image=pretrained_image,
                                 pretrained_hf=pretrained_hf, cache_dir=cache_dir, output_dict=output_dict,
                                 require_pretrained=require_pretrained, **model_kwargs)

    transform = get_gem_img_transform(**model_kwargs)
    return gem_model, transform

def visualize(image, text, logits, alpha=0.6, save_path=None):
    W, H = logits.shape[-2:]
    if isinstance(image, Image.Image):
        image = image.resize((W, H))
    elif isinstance(image, torch.Tensor):
        if image.ndim > 3:
            image = image.squeeze(0)
        image_unormed = (image.detach().cpu() * torch.Tensor(OPENAI_DATASET_STD)[:, None, None]) \
                        + torch.Tensor(OPENAI_DATASET_MEAN)[:, None, None]  # undo the normalization
        image = Image.fromarray((image_unormed.permute(1, 2, 0).numpy() * 255).astype('uint8'))  # convert to PIL
    else:
        raise TypeError(f'image should be either of type PIL.Image.Image or torch.Tensor but found {type(image)}')

    # plot image
    plt.imshow(image)
    plt.axis('off')
    plt.tight_layout()
    plt.show()

    if logits.ndim > 3:
        logits = logits.squeeze(0)
    logits = logits.detach().cpu().numpy()

    img_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    logits = (logits * 255).astype('uint8')
    heat_maps = [cv2.applyColorMap(logit, cv2.COLORMAP_JET) for logit in logits]

    vizs = [(1 - alpha) * img_cv + alpha * heat_map for heat_map in heat_maps]
    for viz, cls_name in zip(vizs, text):
        viz = cv2.cvtColor(viz.astype('uint8'), cv2.COLOR_BGR2RGB)
        plt.imshow(viz)
        plt.title(cls_name)
        plt.axis('off')
        plt.tight_layout()
        plt.show()
        if save_path is not None:
            plt.savefig(f'heatmap_{cls_name}.png')
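Usage note (not part of the commit): a minimal sketch of create_model_and_transforms, which returns the GEM-wrapped model together with the default 448x448 transform; running it downloads the chosen open_clip weights. The model name, URL, and prompts below are illustrative only.

import torch
import requests
from PIL import Image
from gem import create_model_and_transforms

device = "cuda" if torch.cuda.is_available() else "cpu"
gem_model, transform = create_model_and_transforms('ViT-B/16', pretrained='openai', device=device)
gem_model.eval()

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # cat & remote control
image = transform(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device)
with torch.no_grad():
    heatmaps = gem_model(image, ['cat', 'remote control'])
print(heatmaps.shape)  # torch.Size([1, 2, 448, 448])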
gem/gem_utils.py
ADDED
@@ -0,0 +1,194 @@
from typing import Optional, List
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from open_clip.transformer import _expand_token, to_2tuple


def resample_abs_pos_embed(
        posemb,
        new_size: List[int],
        old_size: Optional[List[int]] = None,
        num_prefix_tokens: int = 1,
        interpolation: str = 'bicubic',
        antialias: bool = True
):
    # sort out sizes, assume square if old size not provided
    new_size = to_2tuple(new_size)
    new_ntok = new_size[0] * new_size[1]
    if not old_size:
        old_size = int(math.sqrt(posemb.shape[1] - num_prefix_tokens))
    old_size = to_2tuple(old_size)
    if new_size == old_size:  # might not both be same container type
        return posemb

    if num_prefix_tokens:
        posemb_prefix, posemb = posemb[:, :num_prefix_tokens], posemb[:, num_prefix_tokens:]
    else:
        posemb_prefix, posemb = None, posemb

    # do the interpolation
    posemb = posemb.reshape(1, old_size[0], old_size[1], -1).permute(0, 3, 1, 2)
    posemb = F.interpolate(posemb, size=new_size, mode=interpolation, antialias=antialias)
    posemb = posemb.permute(0, 2, 3, 1).reshape(1, new_ntok, -1)

    # add back extra (class, etc.) prefix tokens
    if posemb_prefix is not None:
        posemb = torch.cat([posemb_prefix, posemb], dim=1)
    return posemb


class SelfSelfAttention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., ss_attn_iter=1,
                 ss_attn_temp=None):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5
        self.ss_attn_iter = ss_attn_iter
        self.ss_attn_temp = ss_attn_temp

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, attn_bias=None, prev_attn=None):
        x = x.transpose(0, 1)
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        self.v_values = v
        # original self-attention for the original path
        attn_ori_return = (q @ k.transpose(-2, -1)) * self.scale
        attn_ori = attn_ori_return.softmax(dim=-1)
        attn_ori = self.attn_drop(attn_ori)

        x_ori = (attn_ori @ v).transpose(1, 2).reshape(B, N, C)
        x_ori = self.proj_drop(self.proj(x_ori))

        # GEM
        xs1 = v
        xs2 = k
        xs3 = q

        if self.ss_attn_temp is None:
            pre_norm = torch.norm(x, dim=-1).mean(dim=-1, keepdim=True).unsqueeze(1).unsqueeze(-1)
            inv_temp = pre_norm * self.scale
        else:
            inv_temp = self.ss_attn_temp

        for it in range(self.ss_attn_iter):
            xs1 = F.normalize(xs1, dim=-1)
            xs2 = F.normalize(xs2, dim=-1)
            xs3 = F.normalize(xs3, dim=-1)

            attn_return1 = (xs1 @ xs1.transpose(-2, -1)) * inv_temp
            attn_return2 = (xs2 @ xs2.transpose(-2, -1)) * inv_temp
            attn_return3 = (xs3 @ xs3.transpose(-2, -1)) * inv_temp

            attn1 = (attn_return1).softmax(dim=-1)
            attn2 = (attn_return2).softmax(dim=-1)
            attn3 = (attn_return3).softmax(dim=-1)

            xs1 = attn1 @ xs1
            xs2 = attn2 @ xs2
            xs3 = attn3 @ xs3

        # Assignment to V
        xs1 = F.normalize(xs1, dim=-1)
        xs2 = F.normalize(xs2, dim=-1)
        xs3 = F.normalize(xs3, dim=-1)

        attn_return1 = (xs1 @ xs1.transpose(-2, -1)) * inv_temp
        attn_return2 = (xs2 @ xs2.transpose(-2, -1)) * inv_temp
        attn_return3 = (xs3 @ xs3.transpose(-2, -1)) * inv_temp

        attn1 = (attn_return1).softmax(dim=-1)
        attn2 = (attn_return2).softmax(dim=-1)
        attn3 = (attn_return3).softmax(dim=-1)

        xs1 = attn1 @ v
        xs2 = attn2 @ v
        xs3 = attn3 @ v
        xs = (xs1 + xs2 + xs3) / 3

        x = xs.transpose(1, 2).reshape(B, N, C)
        x = self.proj_drop(self.proj(x))

        return [x.transpose(0, 1), x_ori.transpose(0, 1)]


class GEMResidualBlock(nn.Module):
    def __init__(self, res_block):
        super(GEMResidualBlock, self).__init__()
        self.res_block = res_block

    def forward(self,
                q_x: torch.Tensor,
                k_x: Optional[torch.Tensor] = None,
                v_x: Optional[torch.Tensor] = None,
                attn_mask: Optional[torch.Tensor] = None,
                ):
        if isinstance(q_x, list):
            x_gem, q_x = q_x
        else:
            x_gem = q_x

        x_gem_res, x_ori_res = self.res_block.attn(x=self.res_block.ln_1(q_x))
        x_gem_res, x_ori_res = self.res_block.ls_1(x_gem_res), self.res_block.ls_1(x_ori_res)
        # Original
        x_ori = q_x + x_ori_res
        x_ori = x_ori + self.res_block.ls_2(self.res_block.mlp(self.res_block.ln_2(x_ori)))
        # GEM
        x_gem = x_gem + x_gem_res
        return [x_gem, x_ori]

class GEMViT(nn.Module):
    # Note: unused wrapper class; modified_vit_forward below is bound directly onto the
    # open_clip VisionTransformer in gem_wrapper.py.
    def __init__(self, vit):
        self.vit = vit

def modified_vit_forward(self, x: torch.Tensor):
    x = self.conv1(x)  # shape = [*, width, grid, grid]
    grid_h, grid_w = x.shape[2:]
    x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
    x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]

    # class embeddings and positional embeddings
    x = torch.cat([_expand_token(self.class_embedding, x.shape[0]).to(x.dtype), x], dim=1)
    # shape = [*, grid ** 2 + 1, width]

    if x.shape[1] != self.positional_embedding.shape[1]:
        pos_emb = resample_abs_pos_embed(self.positional_embedding.unsqueeze(0),
                                         new_size=[grid_h, grid_w],
                                         # old_size=list(self.grid_size),
                                         num_prefix_tokens=1,
                                         interpolation='bicubic',
                                         antialias=True)
    else:
        pos_emb = self.positional_embedding

    x = x + pos_emb.to(x.dtype)
    # x = x + self.positional_embedding.to(x.dtype)

    x = self.patch_dropout(x)
    x = self.ln_pre(x)

    x = x.permute(1, 0, 2)  # NLD -> LND
    x_gem, x = self.transformer(x)
    x = x.permute(1, 0, 2)  # LND -> NLD
    x_gem = x_gem.permute(1, 0, 2)  # LND -> NLD

    # Apply proj
    x = self.ln_post(x)
    x_gem = self.ln_post(x_gem)
    if self.proj is not None:
        x = x @ self.proj
        x_gem = x_gem @ self.proj

    return [x_gem, x]
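Illustrative sketch (not part of the commit): resample_abs_pos_embed resizing a CLIP-style positional embedding from a 14x14 patch grid to a 28x28 grid, which is what modified_vit_forward triggers when the input resolution differs from the pretraining resolution. The tensor sizes are hypothetical.

import torch
from gem.gem_utils import resample_abs_pos_embed

posemb = torch.randn(1, 1 + 14 * 14, 768)  # [1, class token + 14*14 patch tokens, dim]
resized = resample_abs_pos_embed(posemb, new_size=[28, 28], num_prefix_tokens=1)
print(resized.shape)  # torch.Size([1, 785, 768]) = [1, 1 + 28*28, 768]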
gem/gem_wrapper.py
ADDED
@@ -0,0 +1,123 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from open_clip.transformer import VisionTransformer

from .gem_utils import SelfSelfAttention, GEMResidualBlock, modified_vit_forward


class GEMWrapper(nn.Module):
    def __init__(self, model, tokenizer, depth=7, ss_attn_iter=1, ss_attn_temp=None):
        super(GEMWrapper, self).__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.depth = depth
        self.ss_attn_iter = ss_attn_iter
        self.ss_attn_temp = ss_attn_temp
        self.patch_size = self.model.visual.patch_size[0]
        self.apply_gem()

    def apply_gem(self):
        for i in range(1, self.depth):
            # Extract info from the original ViT
            num_heads = self.model.visual.transformer.resblocks[-i].attn.num_heads
            dim = int(self.model.visual.transformer.resblocks[-i].attn.head_dim * num_heads)
            qkv_bias = True
            # Init the self-self attention layer
            ss_attn = SelfSelfAttention(dim=dim, num_heads=num_heads, qkv_bias=qkv_bias,
                                        ss_attn_iter=self.ss_attn_iter, ss_attn_temp=self.ss_attn_temp)
            # Copy necessary weights
            ss_attn.qkv.weight.data = self.model.visual.transformer.resblocks[-i].attn.in_proj_weight.clone()
            ss_attn.qkv.bias.data = self.model.visual.transformer.resblocks[-i].attn.in_proj_bias.clone()
            ss_attn.proj.weight.data = self.model.visual.transformer.resblocks[-i].attn.out_proj.weight.clone()
            ss_attn.proj.bias.data = self.model.visual.transformer.resblocks[-i].attn.out_proj.bias.clone()
            # Swap the original Attention with our SelfSelfAttention
            self.model.visual.transformer.resblocks[-i].attn = ss_attn
            # Wrap Residual block to handle SelfSelfAttention outputs
            self.model.visual.transformer.resblocks[-i] = GEMResidualBlock(self.model.visual.transformer.resblocks[-i])
        # Modify ViT's forward function
        self.model.visual.forward = modified_vit_forward.__get__(self.model.visual, VisionTransformer)
        return

    def encode_text(self, text: list):
        prompts = [f'a photo of a {cls}.' for cls in text]
        tokenized_prompts = self.tokenizer(prompts).to(self.model.visual.proj.device)
        text_embedding = self.model.encode_text(tokenized_prompts)
        text_embedding = F.normalize(text_embedding, dim=-1)
        return text_embedding.unsqueeze(0)

    def min_max(self, logits):
        B, num_prompt = logits.shape[:2]
        logits_min = logits.reshape(B, num_prompt, -1).min(dim=-1, keepdim=True)[0].unsqueeze(-1)
        logits_max = logits.reshape(B, num_prompt, -1).max(dim=-1, keepdim=True)[0].unsqueeze(-1)
        logits = (logits - logits_min) / (logits_max - logits_min)
        return logits

    def forward(self, image: torch.Tensor, text: list, normalize: bool = True, return_ori: bool = False):
        """
        :param image: torch.Tensor [1, 3, H, W]
        :param text: list[str]
        :param normalize: bool - if True performs min-max normalization
        :param return_ori: bool - if True uses the features from the original visual encoder
        """
        # Image
        W, H = image.shape[-2:]
        feat_gem, feat_ori = self.model.visual(image)
        image_feat = feat_ori if return_ori else feat_gem
        image_feat = F.normalize(image_feat, dim=-1)  # [1, N, dim]

        # Text
        text_embeddings = self.encode_text(text)  # [1, num_prompt, dim]

        # Image-Text matching
        img_txt_matching = image_feat[:, 1:] @ text_embeddings.transpose(-1, -2)  # [1, N, num_prompt]
        img_txt_matching = rearrange(img_txt_matching, 'b (w h) c -> b c w h',
                                     w=W // self.patch_size, h=H // self.patch_size)  # [1, num_prompt, w, h]

        # Interpolate
        img_txt_matching = F.interpolate(img_txt_matching, size=(W, H), mode='bilinear')  # [1, num_prompt, W, H]

        # Heat Maps
        if normalize:
            img_txt_matching = self.min_max(img_txt_matching)
        return img_txt_matching

    def batched_forward(self, image: torch.Tensor, text: list, normalize: bool = True, return_ori: bool = False):
        """
        :param image: torch.Tensor [B, 3, H, W]
        :param text: list[list[str]]
        :param normalize: bool - if True performs min-max normalization
        :param return_ori: bool - if True uses the features from the original visual encoder
        """
        L = len(text)
        cumm_idx = np.cumsum([len(t) for t in text]).tolist()
        B, _, W, H = image.shape
        assert B == L, f'Number of prompts L: {L} should be the same as number of images B: {B}.'

        # Image
        feat_gem, feat_ori = self.model.visual(image)
        image_feat = feat_ori if return_ori else feat_gem
        image_feat = F.normalize(image_feat, dim=-1)  # [B, N, dim]

        # Text
        flatten_text = [t for sub_text in text for t in sub_text]
        text_embeddings = self.encode_text(flatten_text)  # [B, num_prompt, dim]

        # Image-Text matching
        img_txt_matching = 100 * image_feat[:, 1:] @ text_embeddings.transpose(-1, -2)  # [B, N, num_prompt]
        img_txt_matching = rearrange(img_txt_matching, 'b (w h) c -> b c w h',
                                     w=W // self.patch_size, h=H // self.patch_size)  # [B, num_prompt, w, h]

        # Interpolate
        img_txt_matching = F.interpolate(img_txt_matching, size=(W, H), mode='bilinear')  # [B, num_prompt, W, H]

        # Heat Maps
        if normalize:
            img_txt_matching = self.min_max(img_txt_matching)  # [B, num_prompt, W, H]

        # unflatten
        img_txt_matching = torch.tensor_split(img_txt_matching, cumm_idx[:-1], dim=1)
        img_txt_matching = [itm[i] for i, itm in enumerate(img_txt_matching)]
        return img_txt_matching
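Illustrative sketch (not part of the commit): a standalone re-implementation of the min_max step used above, showing that each [W, H] map is rescaled to [0, 1] independently per image and per prompt. The shapes are hypothetical.

import torch

logits = torch.randn(2, 3, 448, 448)  # [B, num_prompt, W, H]
B, num_prompt = logits.shape[:2]
flat = logits.reshape(B, num_prompt, -1)
logits_min = flat.min(dim=-1, keepdim=True)[0].unsqueeze(-1)  # per-map minimum
logits_max = flat.max(dim=-1, keepdim=True)[0].unsqueeze(-1)  # per-map maximum
normed = (logits - logits_min) / (logits_max - logits_min)
print(normed.min().item(), normed.max().item())  # 0.0 1.0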
requirements.txt
ADDED
@@ -0,0 +1,15 @@
torch>=1.9.0
torchvision
regex
ftfy
tqdm
huggingface_hub
sentencepiece
protobuf
timm
einops
open_clip_torch
opencv-python
matplotlib
numpy
requests
setup.py
ADDED
@@ -0,0 +1,57 @@
""" Setup
Adapted from https://github.com/mlfoundations/open_clip
"""
from setuptools import setup, find_packages
from codecs import open
from os import path

here = path.abspath(path.dirname(__file__))

# Get the long description from the README file
with open(path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

def _read_reqs(relpath):
    fullpath = path.join(path.dirname(__file__), relpath)
    with open(fullpath) as f:
        return [s.strip() for s in f.readlines() if (s.strip() and not s.startswith("#"))]

REQUIREMENTS = _read_reqs("requirements.txt")

setup(
    name='gem_torch',
    version="1.0",
    description='GEM',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/WalBouss/GEM',
    author='Walid Bousselham, Felix Petersen, Vittorio Ferrari, Hilde Kuehne',
    author_email='',
    classifiers=[
        # How mature is this project? Common values are
        #   3 - Alpha
        #   4 - Beta
        #   5 - Production/Stable
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Education',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Topic :: Scientific/Engineering',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'Topic :: Software Development',
        'Topic :: Software Development :: Libraries',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],

    # Note that this is a string of words separated by whitespace, not a list.
    keywords='CLIP pretrained',
    py_modules=["gem"],
    packages=find_packages(exclude=["assets*"]),
    include_package_data=True,
    install_requires=REQUIREMENTS,
    python_requires='>=3.7',
)
test_examples.py
ADDED
@@ -0,0 +1,60 @@
from PIL import Image
from gem import create_gem_model, get_gem_img_transform, visualize, available_models
import torch
import requests


print(available_models())

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model_name = 'ViT-B-16-quickgelu'
pretrained = 'metaclip_400m'
gem_model = create_gem_model(model_name=model_name, pretrained=pretrained, device=device)
gem_model.eval()

###########################
# Single Image
###########################

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # cat & remote control
text = ['remote control', 'cat']
# image_path = 'path/to/image'  # <-- uncomment to use a local path

image_pil = Image.open(requests.get(url, stream=True).raw)
# image_pil = Image.open(image_path)  # <-- uncomment to use a local path

gem_img_transform = get_gem_img_transform()
image = gem_img_transform(image_pil).unsqueeze(0).to(device)

with torch.no_grad():
    logits = gem_model(image, text)
    visualize(image, text, logits)
    print(logits.shape)  # torch.Size([1, 2, 448, 448])
    # visualize(image_pil, text, logits)  # <-- works with torch.Tensor and PIL.Image

###########################
# Batch of Images
###########################
urls = [
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    "https://cdn.vietnambiz.vn/171464876016439296/2021/7/11/headshots16170695297430-1626006880779826347793.jpg",
    "https://preview.redd.it/do-you-think-joker-should-be-unpredictable-enough-to-put-up-v0-6a2ax4ngtlaa1.jpg?auto=webp&s=f8762e6a1b40642bcae5900bac184fc597131503",
]
texts = [
    ['remote control', 'cat'],
    ['elon musk', 'mark zuckerberg', 'jeff bezos', 'bill gates'],
    ['batman', 'joker', 'shoe', 'belt', 'purple suit'],
]  # note that the number of prompts per image can differ

# download images + convert to PIL.Image
images_pil = [Image.open(requests.get(url, stream=True).raw) for url in urls]
images = torch.stack([gem_img_transform(img) for img in images_pil]).to(device)

with torch.no_grad():
    # returns a list with logits of size [num_prompt, W, H] per image
    logits_list = gem_model.batched_forward(images, texts)
    print(logits_list[0].shape)  # torch.Size([2, 448, 448])
    print(logits_list[1].shape)  # torch.Size([4, 448, 448])
    print(logits_list[2].shape)  # torch.Size([5, 448, 448])
    for i, _logits in enumerate(logits_list):
        visualize(images[i], texts[i], _logits)  # (optional visualization)