Commit 74e4bcd: Create app
Author: Vivien
Parent(s): 383bcb1

Files changed:
- .gitattributes +1 -0
- .gitignore +1 -0
- README.md +6 -5
- app.py +199 -0
- bpe_simple_vocab_16e6.txt.gz +3 -0
- data.csv +0 -0
- data2.csv +0 -0
- embeddings.npy +3 -0
- embeddings2.npy +3 -0
- embeddings2_slip_large.npy +3 -0
- embeddings_slip_large.npy +3 -0
- losses.py +132 -0
- models.py +331 -0
- requirements.txt +6 -0
- tokenizer.py +157 -0
- utils.py +213 -0
.gitattributes
CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
.vscode/
README.md
CHANGED
@@ -1,11 +1,12 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Comparing CLIP and SLIP
+emoji: 🖼️
+colorFrom: indigo
+colorTo: blue
 sdk: streamlit
+sdk_version: 1.0.0
 app_file: app.py
-pinned:
+pinned: true
 ---

 # Configuration
app.py
ADDED
@@ -0,0 +1,199 @@
import os
import urllib.request
from collections import OrderedDict
from html import escape

import pandas as pd
import numpy as np

import torch
import torchvision.transforms as transforms

from transformers import CLIPProcessor, CLIPModel
import tokenizers
import regex

import streamlit as st

import models
from tokenizer import SimpleTokenizer

cuda_available = torch.cuda.is_available()

model_url = "https://dl.fbaipublicfiles.com/slip/slip_large_100ep.pt"
model_filename = "slip_large_100ep.pt"


def get_model(model):
    if isinstance(model, torch.nn.DataParallel) or isinstance(
        model, torch.nn.parallel.DistributedDataParallel
    ):
        return model.module
    else:
        return model


@st.cache(
    show_spinner=False,
    hash_funcs={
        CLIPModel: lambda _: None,
        CLIPProcessor: lambda _: None,
        dict: lambda _: None,
    },
)
def load():
    # Load SLIP model from Facebook AI Research
    if model_filename not in os.listdir():
        urllib.request.urlretrieve(model_url, model_filename)
    ckpt = torch.load("slip_large_100ep.pt", map_location="cpu")
    state_dict = OrderedDict()
    for k, v in ckpt["state_dict"].items():
        state_dict[k.replace("module.", "")] = v
    old_args = ckpt["args"]
    slip_model = getattr(models, "SLIP_VITL16")(
        rand_embed=False,
        ssl_mlp_dim=old_args.ssl_mlp_dim,
        ssl_emb_dim=old_args.ssl_emb_dim,
    )
    if cuda_available:
        slip_model.cuda()
    slip_model.load_state_dict(state_dict, strict=True)
    slip_model = get_model(slip_model)
    tokenizer = SimpleTokenizer()
    del ckpt
    del state_dict
    # Load CLIP model from HuggingFace
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    # Load images' descriptions and embeddings
    df = {0: pd.read_csv("data.csv"), 1: pd.read_csv("data2.csv")}
    embeddings = {0: np.load("embeddings.npy"), 1: np.load("embeddings2.npy")}
    slip_embeddings = {
        0: np.load("embeddings_slip_large.npy"),
        1: np.load("embeddings2_slip_large.npy"),
    }
    for k in [0, 1]:
        embeddings[k] = np.divide(
            embeddings[k], np.sqrt(np.sum(embeddings[k] ** 2, axis=1, keepdims=True))
        )
    return model, processor, slip_model, tokenizer, df, embeddings, slip_embeddings


model, processor, slip_model, tokenizer, df, embeddings, slip_embeddings = load()

source = {0: "\nSource: Unsplash", 1: "\nSource: The Movie Database (TMDB)"}


def get_html(url_list, url_list_slip, height=150):
    html = "<div style='display: flex; flex-wrap: wrap; justify-content: space-evenly;'>"
    html += "<span style='margin-top: 20px; max-width: 1200px; display: flex; align-content: flex-start; flex-wrap: wrap; justify-content: space-evenly; width: 50%'>"
    html += "<div style='width: 100%; text-align: center;'><b>CLIP</b> (<a href='https://arxiv.org/abs/2103.00020'>Arxiv</a>, <a href='https://github.com/openai/CLIP'>GitHub</a>) from OpenAI</div>"
    for url, title, link in url_list:
        html2 = f"<img title='{escape(title)}' style='height: {height}px; margin: 5px' src='{escape(url)}'>"
        if len(link) > 0:
            html2 = f"<a href='{escape(link)}' target='_blank'>" + html2 + "</a>"
        html = html + html2
    html += "</span>"
    html += "<span style='margin-top: 20px; max-width: 1200px; display: flex; align-content: flex-start; flex-wrap: wrap; justify-content: space-evenly; width: 50%; border-left: solid; border-color: #ffc423; border-width: thin;'>"
    html += "<div style='width: 100%; text-align: center;'><b>SLIP</b> (<a href='https://arxiv.org/abs/2112.12750'>Arxiv</a>, <a href='https://github.com/facebookresearch/SLIP'>GitHub</a>) from Meta AI</div>"
    for url, title, link in url_list_slip:
        html2 = f"<img title='{escape(title)}' style='height: {height}px; margin: 5px' src='{escape(url)}'>"
        if len(link) > 0:
            html2 = f"<a href='{escape(link)}' target='_blank'>" + html2 + "</a>"
        html = html + html2
    html += "</span></div>"
    return html


def compute_text_embeddings(list_of_strings):
    inputs = processor(text=list_of_strings, return_tensors="pt", padding=True)
    return model.get_text_features(**inputs)


def compute_text_embeddings_slip(list_of_strings):
    texts = tokenizer(list_of_strings)
    if cuda_available:
        texts = texts.cuda(non_blocking=True)
    texts = texts.view(-1, 77).contiguous()
    return slip_model.encode_text(texts)


def image_search(query, corpus, n_results=24):
    text_embeddings = compute_text_embeddings([query]).detach().numpy()
    text_embeddings_slip = compute_text_embeddings_slip([query]).detach().numpy()
    k = 0 if corpus == "Unsplash" else 1
    results = np.argsort((embeddings[k] @ text_embeddings.T)[:, 0])[
        -1 : -n_results - 1 : -1
    ]
    results_slip = np.argsort((slip_embeddings[k] @ text_embeddings_slip.T)[:, 0])[
        -1 : -n_results - 1 : -1
    ]
    return (
        [
            (
                df[k].iloc[i]["path"],
                df[k].iloc[i]["tooltip"] + source[k],
                df[k].iloc[i]["link"],
            )
            for i in results
        ],
        [
            (
                df[k].iloc[i]["path"],
                df[k].iloc[i]["tooltip"] + source[k],
                df[k].iloc[i]["link"],
            )
            for i in results_slip
        ],
    )


description = """
# Comparing CLIP and SLIP side by side

**Enter your query and hit enter**

CLIP and SLIP are ML models that encode images and texts as vectors so that the vectors of an image and its caption are similar. They can notably be used for zero-shot image classification, text-based image retrieval or image generation.

*Built with OpenAI's [CLIP](https://openai.com/blog/clip/) model, Meta AI's [SLIP](https://github.com/facebookresearch/SLIP) model, 🤗 Hugging Face's [transformers library](https://huggingface.co/transformers/), [Streamlit](https://streamlit.io/), 25k images from [Unsplash](https://unsplash.com/) and 8k images from [The Movie Database (TMDB)](https://www.themoviedb.org/)*
"""


st.markdown(
    """
    <style>
    .block-container{
        max-width: 1200px;
    }
    div.row-widget.stRadio > div{
        flex-direction:row;
        display: flex;
        justify-content: center;
    }
    div.row-widget.stRadio > div > label{
        margin-left: 5px;
        margin-right: 5px;
    }
    section.main>div:first-child {
        padding-top: 0px;
    }
    section:not(.main)>div:first-child {
        padding-top: 30px;
    }
    div.reportview-container > section:first-child{
        max-width: 320px;
    }
    #MainMenu {
        visibility: hidden;
    }
    footer {
        visibility: hidden;
    }
    </style>""",
    unsafe_allow_html=True,
)
st.sidebar.markdown(description)
_, c, _ = st.columns((1, 3, 1))
query = c.text_input("", value="clouds at sunset")
corpus = st.radio("", ["Unsplash", "Movies"])
if len(query) > 0:
    results, results_slip = image_search(query, corpus)
    st.markdown(get_html(results, results_slip), unsafe_allow_html=True)
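Note: the ranking inside image_search above is a plain dot-product search: the image embeddings are L2-normalized once in load(), the query is embedded by the text encoder, and np.argsort keeps the top n_results indices. A minimal standalone sketch of that step, using made-up random arrays instead of the app's precomputed embeddings (not part of the commit):

import numpy as np

rng = np.random.default_rng(0)
image_embeddings = rng.normal(size=(1000, 512))        # stand-in for embeddings.npy
image_embeddings /= np.linalg.norm(image_embeddings, axis=1, keepdims=True)
text_embedding = rng.normal(size=(1, 512))             # stand-in for compute_text_embeddings(query)

scores = (image_embeddings @ text_embedding.T)[:, 0]   # similarity score per image
top_k = np.argsort(scores)[-1:-25:-1]                  # indices of the 24 best matches, best first
print(top_k[:5])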
bpe_simple_vocab_16e6.txt.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
size 1356917
data.csv
ADDED
The diff for this file is too large to render.
data2.csv
ADDED
The diff for this file is too large to render.
embeddings.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9f8c171e32276739be6b020592edc8a2c06e029ff6505a9d1d4efe3cafa073bd
size 51200128
embeddings2.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9664e980f31e81c4a34e07833539fea32795d83a4262c9828ceae445fa2e412a
size 16732288
embeddings2_slip_large.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5632813e4a27062f2a7bc3f2db23ac3f62d946b53d3b9144c1d5c7e8f9865f90
size 16732288
embeddings_slip_large.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:98fd7411e6874bfd703c134470b9e5a82c0a7a403bb1cf1cac5851dc3871498f
size 51200128
losses.py
ADDED
@@ -0,0 +1,132 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import torch
import torch.nn as nn
import torch.nn.functional as F

import utils


class CLIPLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.labels = None
        self.last_local_batch_size = None

    def forward(self, outputs):
        image_embed = outputs['image_embed']
        text_embed = outputs['text_embed']
        logit_scale = outputs['logit_scale']
        local_batch_size = image_embed.size(0)

        if local_batch_size != self.last_local_batch_size:
            self.labels = local_batch_size * utils.get_rank() + torch.arange(
                local_batch_size, device=image_embed.device
            )
            self.last_local_batch_size = local_batch_size

        # normalized features
        image_embed = F.normalize(image_embed, dim=-1, p=2)
        text_embed = F.normalize(text_embed, dim=-1, p=2)

        # gather features from all GPUs
        image_embed_all, text_embed_all = \
            utils.all_gather_batch([image_embed, text_embed])

        # cosine similarity as logits
        logits_per_image = logit_scale * image_embed @ text_embed_all.t()
        logits_per_text = logit_scale * text_embed @ image_embed_all.t()

        loss = (F.cross_entropy(logits_per_image, self.labels) + \
            F.cross_entropy(logits_per_text, self.labels)) / 2

        # compute accuracy
        with torch.no_grad():
            pred = torch.argmax(logits_per_image, dim=-1)
            correct = pred.eq(self.labels).sum()
            acc = 100 * correct / local_batch_size

        return {'loss': loss, 'clip_loss': loss, 'clip_acc': acc}


class SIMCLRLoss(nn.Module):
    """
    This is the SimCLR loss in https://arxiv.org/abs/2002.05709
    The embedding vectors are assumed to have size (2 x batch_size, embedding_dim) and
    the memory layout that can be reshaped into shape (2, batch_size, embedding_dim).
    This memory layout is consistent with the SimCLR collator in
    https://github.com/facebookresearch/vissl/blob/master/vissl/data/collators/simclr_collator.py
    Config params:
        temperature (float): the temperature to be applied on the logits
    """

    def __init__(self, temperature=0.1):
        super().__init__()
        self.tau = temperature
        self.labels = None
        self.masks = None
        self.last_local_batch_size = None

    def forward(self, outputs):
        q_a = outputs['aug1_embed']
        q_b = outputs['aug2_embed']

        q_a = F.normalize(q_a, dim=-1, p=2)
        q_b = F.normalize(q_b, dim=-1, p=2)

        local_batch_size = q_a.size(0)

        k_a, k_b = utils.all_gather_batch_with_grad([q_a, q_b])

        if local_batch_size != self.last_local_batch_size:
            self.labels = local_batch_size * utils.get_rank() + torch.arange(
                local_batch_size, device=q_a.device
            )
            total_batch_size = local_batch_size * utils.get_world_size()
            self.masks = F.one_hot(self.labels, total_batch_size) * 1e9
            self.last_local_batch_size = local_batch_size

        logits_aa = torch.matmul(q_a, k_a.transpose(0, 1)) / self.tau
        logits_aa = logits_aa - self.masks
        logits_bb = torch.matmul(q_b, k_b.transpose(0, 1)) / self.tau
        logits_bb = logits_bb - self.masks
        logits_ab = torch.matmul(q_a, k_b.transpose(0, 1)) / self.tau
        logits_ba = torch.matmul(q_b, k_a.transpose(0, 1)) / self.tau

        loss_a = F.cross_entropy(torch.cat([logits_ab, logits_aa], dim=1), self.labels)
        loss_b = F.cross_entropy(torch.cat([logits_ba, logits_bb], dim=1), self.labels)
        loss = (loss_a + loss_b) / 2  # divide by 2 to average over all samples

        # compute accuracy
        with torch.no_grad():
            pred = torch.argmax(torch.cat([logits_ab, logits_aa], dim=1), dim=-1)
            correct = pred.eq(self.labels).sum()
            acc = 100 * correct / local_batch_size

        return {'loss': loss, 'ssl_loss': loss, 'ssl_acc': acc}


class SLIPLoss(nn.Module):
    def __init__(self, ssl_loss, ssl_scale):
        super().__init__()
        self.clip_loss = CLIPLoss()
        self.ssl_loss = ssl_loss
        self.ssl_scale = ssl_scale

    def forward(self, outputs):
        clip_loss_dict = self.clip_loss(outputs)
        clip_loss = clip_loss_dict['clip_loss']
        clip_acc = clip_loss_dict['clip_acc']

        ssl_loss_dict = self.ssl_loss(outputs)
        ssl_loss = ssl_loss_dict['ssl_loss']
        ssl_acc = ssl_loss_dict['ssl_acc']

        return {'loss': clip_loss + self.ssl_scale * ssl_loss,
                'clip_loss': clip_loss,
                'clip_acc': clip_acc,
                'ssl_loss': ssl_loss,
                'ssl_acc': ssl_acc}
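Note: CLIPLoss can be exercised on its own in a single process, because utils.get_rank() returns 0 and utils.all_gather_batch passes tensors through unchanged when no process group is initialized. A small sketch with random embeddings (not part of the commit; the logit scale value is illustrative):

import torch
from losses import CLIPLoss

criterion = CLIPLoss()
batch = {
    'image_embed': torch.randn(8, 512),   # fake image features
    'text_embed': torch.randn(8, 512),    # fake text features
    'logit_scale': torch.tensor(14.3),    # roughly exp(log(1/0.07)), as in the models
}
out = criterion(batch)
print(out['clip_loss'].item(), out['clip_acc'].item())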
models.py
ADDED
@@ -0,0 +1,331 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Modified from github.com/openai/CLIP
from collections import OrderedDict

import numpy as np
import timm
import torch
from torch import nn

import losses


class LayerNorm(nn.LayerNorm):
    """Subclass torch's LayerNorm to handle fp16."""

    def forward(self, x: torch.Tensor):
        orig_type = x.dtype
        ret = super().forward(x.type(torch.float32))
        return ret.type(orig_type)


class QuickGELU(nn.Module):
    def forward(self, x: torch.Tensor):
        return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):
    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
        super().__init__()

        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)
        self.mlp = nn.Sequential(OrderedDict([
            ("c_fc", nn.Linear(d_model, d_model * 4)),
            ("gelu", QuickGELU()),
            ("c_proj", nn.Linear(d_model * 4, d_model))
        ]))
        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor):
        self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

    def forward(self, x: torch.Tensor):
        x = x + self.attention(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


class Transformer(nn.Module):
    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
        super().__init__()
        self.width = width
        self.layers = layers
        self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])

    def forward(self, x: torch.Tensor):
        return self.resblocks(x)


class CLIP(nn.Module):
    def __init__(self,
                 embed_dim: int,
                 # vision
                 vision_width: int,
                 vision_model: nn.Module,
                 # text
                 context_length: int,
                 vocab_size: int,
                 transformer_width: int,
                 transformer_heads: int,
                 transformer_layers: int,
                 **kwargs,
                 ):
        super().__init__()

        self.context_length = context_length
        self.vision_width = vision_width

        self.visual = vision_model

        self.transformer = Transformer(
            width=transformer_width,
            layers=transformer_layers,
            heads=transformer_heads,
            attn_mask=self.build_attention_mask(),
        )

        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
        self.ln_final = LayerNorm(transformer_width)

        self.image_projection = nn.Parameter(torch.empty(vision_width, embed_dim))
        self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.initialize_parameters()

    def initialize_parameters(self):
        nn.init.normal_(self.token_embedding.weight, std=0.02)
        nn.init.normal_(self.positional_embedding, std=0.01)

        proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
        attn_std = self.transformer.width ** -0.5
        fc_std = (2 * self.transformer.width) ** -0.5
        for block in self.transformer.resblocks:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

        nn.init.normal_(self.image_projection, std=self.vision_width ** -0.5)
        nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)

    def build_attention_mask(self):
        # lazily create causal attention mask, with full attention between the vision tokens
        # pytorch uses additive attention mask; fill with -inf
        mask = torch.empty(self.context_length, self.context_length)
        mask.fill_(float("-inf"))
        mask.triu_(1)  # zero out the lower diagonal
        return mask

    def encode_image(self, image):
        x = self.visual(image)
        x = x @ self.image_projection

        return x

    def encode_text(self, text):
        x = self.token_embedding(text)  # [batch_size, n_ctx, d_model]
        x = x + self.positional_embedding
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x)

        # x.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection

        return x

    def forward(self, image, text):
        image_embed = self.encode_image(image)
        text_embed = self.encode_text(text)

        return {'image_embed': image_embed,
                'text_embed': text_embed,
                'logit_scale': self.logit_scale.exp()}


class SIMCLR(nn.Module):
    def __init__(self,
                 # vision
                 vision_width: int,
                 vision_model: nn.Module,
                 # ssl
                 ssl_mlp_dim: int,
                 ssl_emb_dim: int,
                 **kwargs,
                 ):
        super().__init__()

        self.vision_width = vision_width
        self.visual = vision_model

        self.image_mlp = self._build_mlp(in_dim=vision_width, mlp_dim=ssl_mlp_dim, out_dim=ssl_emb_dim)

    def _build_mlp(self, in_dim, mlp_dim, out_dim):
        return nn.Sequential(OrderedDict([
            ("layer1", nn.Linear(in_dim, mlp_dim)),
            ("bn1", nn.SyncBatchNorm(mlp_dim)),
            ("relu1", nn.ReLU(inplace=True)),
            ("layer2", nn.Linear(mlp_dim, mlp_dim)),
            ("bn2", nn.SyncBatchNorm(mlp_dim)),
            ("relu2", nn.ReLU(inplace=True)),
            ("layer3", nn.Linear(mlp_dim, out_dim)),
        ]))

    def encode_image(self, image):
        x = self.visual(image)

        return x

    def forward(self, aug1, aug2):
        h1 = self.visual(aug1)
        h2 = self.visual(aug2)

        aug1_embed = self.image_mlp(h1)
        aug2_embed = self.image_mlp(h2)

        return {'aug1_embed': aug1_embed,
                'aug2_embed': aug2_embed}


class SLIP(CLIP):
    def __init__(self,
                 ssl_mlp_dim: int,
                 ssl_emb_dim: int,
                 **kwargs,
                 ):
        super().__init__(**kwargs)

        self.image_mlp = self._build_mlp(in_dim=self.vision_width, mlp_dim=ssl_mlp_dim, out_dim=ssl_emb_dim)

    def _build_mlp(self, in_dim, mlp_dim, out_dim):
        return nn.Sequential(OrderedDict([
            ("layer1", nn.Linear(in_dim, mlp_dim)),
            ("bn1", nn.SyncBatchNorm(mlp_dim)),
            ("relu1", nn.ReLU(inplace=True)),
            ("layer2", nn.Linear(mlp_dim, mlp_dim)),
            ("bn2", nn.SyncBatchNorm(mlp_dim)),
            ("relu2", nn.ReLU(inplace=True)),
            ("layer3", nn.Linear(mlp_dim, out_dim)),
        ]))

    def forward(self, image, text, aug1, aug2):
        aug1_embed = self.image_mlp(self.visual(aug1))
        aug2_embed = self.image_mlp(self.visual(aug2))

        image_embed = self.encode_image(image)
        text_embed = self.encode_text(text)

        return {'image_embed': image_embed,
                'text_embed': text_embed,
                'logit_scale': self.logit_scale.exp(),
                'aug1_embed': aug1_embed,
                'aug2_embed': aug2_embed}


def get_loss(model, ssl_temp, ssl_scale):
    if model.startswith('SLIP'):
        ssl_loss = losses.SIMCLRLoss(temperature=ssl_temp)
        return losses.SLIPLoss(ssl_loss, ssl_scale)
    if model.startswith('CLIP'):
        return losses.CLIPLoss()
    if model.startswith('SIMCLR'):
        return losses.SIMCLRLoss(temperature=ssl_temp)


def get_metric_names(model):
    if model.startswith('SLIP'):
        return ['loss', 'clip_loss', 'ssl_loss', 'clip_acc', 'ssl_acc']
    elif model.startswith('CLIP'):
        return ['loss', 'clip_loss', 'clip_acc']
    else:
        return ['loss', 'ssl_loss', 'ssl_acc']


@timm.models.registry.register_model
def vit_small_mocov3_patch16_224(**kwargs):
    model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=12, **kwargs)
    model = timm.models.vision_transformer._create_vision_transformer('vit_small_patch16_224', **model_kwargs)

    return model


def CLIP_VITS16(**kwargs):
    vision_model = timm.create_model('vit_small_mocov3_patch16_224', num_classes=0)
    model = CLIP(embed_dim=512, vision_width=384, vision_model=vision_model, context_length=77, vocab_size=49408,
                 transformer_width=512, transformer_heads=8, transformer_layers=12, **kwargs)

    return model


def SIMCLR_VITS16(**kwargs):
    vision_model = timm.create_model('vit_small_mocov3_patch16_224', num_classes=0)
    model = SIMCLR(vision_width=384, vision_model=vision_model, **kwargs)

    return model


def SLIP_VITS16(**kwargs):
    vision_model = timm.create_model('vit_small_mocov3_patch16_224', num_classes=0)
    model = SLIP(embed_dim=512, vision_width=384, vision_model=vision_model, context_length=77, vocab_size=49408,
                 transformer_width=512, transformer_heads=8, transformer_layers=12, **kwargs)

    return model


def CLIP_VITB16(**kwargs):
    vision_model = timm.create_model('vit_base_patch16_224', num_classes=0)
    model = CLIP(embed_dim=512, vision_width=768, vision_model=vision_model, context_length=77, vocab_size=49408,
                 transformer_width=512, transformer_heads=8, transformer_layers=12, **kwargs)

    return model


def SIMCLR_VITB16(**kwargs):
    vision_model = timm.create_model('vit_base_patch16_224', num_classes=0)
    model = SIMCLR(vision_width=768, vision_model=vision_model, **kwargs)

    return model


def SLIP_VITB16(**kwargs):
    vision_model = timm.create_model('vit_base_patch16_224', num_classes=0)
    model = SLIP(embed_dim=512, vision_width=768, vision_model=vision_model, context_length=77, vocab_size=49408,
                 transformer_width=512, transformer_heads=8, transformer_layers=12, **kwargs)

    return model


def CLIP_VITL16(**kwargs):
    vision_model = timm.create_model('vit_large_patch16_224', num_classes=0)
    model = CLIP(embed_dim=512, vision_width=1024, vision_model=vision_model, context_length=77, vocab_size=49408,
                 transformer_width=512, transformer_heads=8, transformer_layers=12, **kwargs)

    return model


def SIMCLR_VITL16(**kwargs):
    vision_model = timm.create_model('vit_large_patch16_224', num_classes=0)
    model = SIMCLR(vision_width=1024, vision_model=vision_model, **kwargs)

    return model


def SLIP_VITL16(**kwargs):
    vision_model = timm.create_model('vit_large_patch16_224', num_classes=0)
    model = SLIP(embed_dim=512, vision_width=1024, vision_model=vision_model, context_length=77, vocab_size=49408,
                 transformer_width=512, transformer_heads=8, transformer_layers=12, **kwargs)

    return model
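Note: as a quick smoke test (not part of the commit, and assuming a timm version compatible with the model registration above), the constructors can be exercised with random inputs; the ssl_mlp_dim and ssl_emb_dim values here are illustrative, while the app itself uses SLIP_VITL16 with the sizes stored in the released checkpoint:

import torch
import models

# Small SLIP variant; the SSL head sizes are arbitrary for this sketch.
model = models.SLIP_VITS16(ssl_mlp_dim=4096, ssl_emb_dim=256)
model.eval()

images = torch.randn(2, 3, 224, 224)        # fake image batch
tokens = torch.randint(0, 49408, (2, 77))   # fake token ids, context length 77

with torch.no_grad():
    image_embed = model.encode_image(images)
    text_embed = model.encode_text(tokens)
print(image_embed.shape, text_embed.shape)  # both torch.Size([2, 512])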
requirements.txt
ADDED
@@ -0,0 +1,6 @@
torchvision
transformers
numpy
pandas
timm
ftfy
tokenizer.py
ADDED
@@ -0,0 +1,157 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Modified from github.com/openai/CLIP
import gzip
import html
import os
from functools import lru_cache

import ftfy
import regex as re
import torch


@lru_cache()
def default_bpe():
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")


@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    """Return set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def basic_clean(text):
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()


def whitespace_clean(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text


class SimpleTokenizer(object):
    def __init__(self, bpe_path: str = default_bpe()):
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
        merges = merges[1:49152-256-2+1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v+'</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
        self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token+'</w>'

        while True:
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
        return text

    def __call__(self, texts, context_length=77):
        if isinstance(texts, str):
            texts = [texts]

        sot_token = self.encoder["<|startoftext|>"]
        eot_token = self.encoder["<|endoftext|>"]
        all_tokens = [[sot_token] + self.encode(text) + [eot_token] for text in texts]
        result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

        for i, tokens in enumerate(all_tokens):
            tokens = tokens[:context_length]
            result[i, :len(tokens)] = torch.tensor(tokens)

        if len(result) == 1:
            return result[0]
        return result
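Note: for reference (not part of the commit), SimpleTokenizer mirrors OpenAI's CLIP tokenizer: calling it on a single string returns a length-77 LongTensor padded with zeros, and decode(encode(...)) round-trips the text (with a trailing space from the end-of-word marker):

from tokenizer import SimpleTokenizer

tokenizer = SimpleTokenizer()           # reads bpe_simple_vocab_16e6.txt.gz next to tokenizer.py
tokens = tokenizer("clouds at sunset")  # single string -> 1-D LongTensor of length 77
print(tokens.shape)                     # torch.Size([77])
print(tokenizer.decode(tokenizer.encode("clouds at sunset")))  # "clouds at sunset "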
utils.py
ADDED
@@ -0,0 +1,213 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import os
import random
import shutil
import torch
import torch.distributed as dist
import torch.autograd as autograd

from PIL import ImageFilter


def get_model(model):
    if isinstance(model, torch.nn.DataParallel) \
            or isinstance(model, torch.nn.parallel.DistributedDataParallel):
        return model.module
    else:
        return model


def setup_for_distributed(is_master):
    """
    This function disables printing when not in master process
    """
    import builtins as __builtin__
    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    __builtin__.print = print


def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    return get_rank() == 0


def save_on_master(state, is_best, output_dir):
    if is_main_process():
        ckpt_path = f'{output_dir}/checkpoint.pt'
        best_path = f'{output_dir}/checkpoint_best.pt'
        torch.save(state, ckpt_path)
        if is_best:
            shutil.copyfile(ckpt_path, best_path)


def init_distributed_mode(args):
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
    elif 'SLURM_PROCID' in os.environ:
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)
    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                         world_size=args.world_size, rank=args.rank)
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)


def scaled_all_reduce(tensors, is_scale=True):
    """Performs the scaled all_reduce operation on the provided tensors.
    The input tensors are modified in-place. Currently supports only the sum
    reduction operator. The reduced values are scaled by the inverse size of the
    world size.
    """
    world_size = get_world_size()
    # There is no need for reduction in the single-proc case
    if world_size == 1:
        return tensors
    # Queue the reductions
    reductions = []
    for tensor in tensors:
        reduction = dist.all_reduce(tensor, async_op=True)
        reductions.append(reduction)
    # Wait for reductions to finish
    for reduction in reductions:
        reduction.wait()
    # Scale the results
    if is_scale:
        for tensor in tensors:
            tensor.mul_(1.0 / world_size)
    return tensors


def all_gather_batch(tensors):
    """
    Performs all_gather operation on the provided tensors.
    """
    # Queue the gathered tensors
    world_size = get_world_size()
    # There is no need for reduction in the single-proc case
    if world_size == 1:
        return tensors
    tensor_list = []
    output_tensor = []
    for tensor in tensors:
        tensor_all = [torch.ones_like(tensor) for _ in range(world_size)]
        dist.all_gather(
            tensor_all,
            tensor,
            async_op=False  # performance opt
        )

        tensor_list.append(tensor_all)

    for tensor_all in tensor_list:
        output_tensor.append(torch.cat(tensor_all, dim=0))
    return output_tensor


class GatherLayer(autograd.Function):
    """
    Gather tensors from all workers with support for backward propagation:
    This implementation does not cut the gradients as torch.distributed.all_gather does.
    """

    @staticmethod
    def forward(ctx, x):
        output = [torch.zeros_like(x) for _ in range(dist.get_world_size())]
        dist.all_gather(output, x)
        return tuple(output)

    @staticmethod
    def backward(ctx, *grads):
        all_gradients = torch.stack(grads)
        dist.all_reduce(all_gradients)
        return all_gradients[dist.get_rank()]


def all_gather_batch_with_grad(tensors):
    """
    Performs all_gather operation on the provided tensors.
    Graph remains connected for backward grad computation.
    """
    # Queue the gathered tensors
    world_size = get_world_size()
    # There is no need for reduction in the single-proc case
    if world_size == 1:
        return tensors
    tensor_list = []
    output_tensor = []

    for tensor in tensors:
        tensor_all = GatherLayer.apply(tensor)
        tensor_list.append(tensor_all)

    for tensor_all in tensor_list:
        output_tensor.append(torch.cat(tensor_all, dim=0))
    return output_tensor


def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warmup_epochs=0, start_warmup_value=0):
    warmup_schedule = np.array([])
    warmup_iters = warmup_epochs * niter_per_ep
    if warmup_epochs > 0:
        warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters)

    iters = np.arange(epochs * niter_per_ep - warmup_iters)
    schedule = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * iters / len(iters)))

    schedule = np.concatenate((warmup_schedule, schedule))
    assert len(schedule) == epochs * niter_per_ep
    return schedule


class GaussianBlur(object):
    """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709"""

    def __init__(self, sigma=[.1, 2.]):
        self.sigma = sigma

    def __call__(self, x):
        sigma = random.uniform(self.sigma[0], self.sigma[1])
        x = x.filter(ImageFilter.GaussianBlur(radius=sigma))
        return x
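Note: cosine_scheduler builds a per-iteration value table (linear warmup followed by a cosine decay) and is used by the upstream SLIP training code rather than by this app. A usage sketch with made-up hyperparameters (not part of the commit):

from utils import cosine_scheduler

# Hypothetical settings: 10 epochs of 100 iterations each, 1 warmup epoch.
lr_schedule = cosine_scheduler(base_value=3e-3, final_value=1e-5,
                               epochs=10, niter_per_ep=100, warmup_epochs=1)
print(len(lr_schedule))                                  # 1000, one value per training iteration
print(lr_schedule[0], lr_schedule[99], lr_schedule[-1])  # ramps up to 3e-3, then decays toward 1e-5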