Zero-shot Classification

"""
 # Copyright (c) 2022, salesforce.com, inc.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import plotly.graph_objects as go
import requests
import streamlit as st
import torch
from lavis.models import load_model
from lavis.processors import load_processor
from lavis.processors.blip_processors import BlipCaptionProcessor
from PIL import Image

from app import device, load_demo_image
from app.utils import load_blip_itm_model
from lavis.processors.clip_processors import ClipImageEvalProcessor


@st.cache()
def load_demo_image(img_url=None):
    if not img_url:
        img_url = "https://img.atlasobscura.com/yDJ86L8Ou6aIjBsxnlAy5f164w1rjTgcHZcx2yUs4mo/rt:fit/w:1200/q:81/sm:1/scp:1/ar:1/aHR0cHM6Ly9hdGxh/cy1kZXYuczMuYW1h/em9uYXdzLmNvbS91/cGxvYWRzL3BsYWNl/X2ltYWdlcy85MDll/MDRjOS00NTJjLTQx/NzQtYTY4MS02NmQw/MzI2YWIzNjk1ZGVk/MGZhMTJiMTM5MmZi/NGFfUmVhcl92aWV3/X29mX3RoZV9NZXJs/aW9uX3N0YXR1ZV9h/dF9NZXJsaW9uX1Bh/cmssX1NpbmdhcG9y/ZSxfd2l0aF9NYXJp/bmFfQmF5X1NhbmRz/X2luX3RoZV9kaXN0/YW5jZV8tXzIwMTQw/MzA3LmpwZw.jpg"
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
    return raw_image


@st.cache(
    hash_funcs={
        torch.nn.parameter.Parameter: lambda parameter: parameter.data.detach()
        .cpu()
        .numpy()
    },
    allow_output_mutation=True,
)
def load_model_cache(model_type, device):
    if model_type == "blip":
        model = load_model(
            "blip_feature_extractor", model_type="base", is_eval=True, device=device
        )
    elif model_type == "albef":
        model = load_model(
            "albef_feature_extractor", model_type="base", is_eval=True, device=device
        )
    elif model_type == "CLIP_ViT-B-32":
        model = load_model(
            "clip_feature_extractor", "ViT-B-32", is_eval=True, device=device
        )
    elif model_type == "CLIP_ViT-B-16":
        model = load_model(
            "clip_feature_extractor", "ViT-B-16", is_eval=True, device=device
        )
    elif model_type == "CLIP_ViT-L-14":
        model = load_model(
            "clip_feature_extractor", "ViT-L-14", is_eval=True, device=device
        )

    return model


def app():
    model_type = st.sidebar.selectbox(
        "Model:",
        ["ALBEF", "BLIP_Base", "CLIP_ViT-B-32", "CLIP_ViT-B-16", "CLIP_ViT-L-14"],
    )
    score_type = st.sidebar.selectbox("Score type:", ["Cosine", "Multimodal"])

    # ===== layout =====
    st.markdown(
        "<h1 style='text-align: center;'>Zero-shot Classification</h1>",
        unsafe_allow_html=True,
    )

    instructions = """Try the provided image or upload your own:"""
    file = st.file_uploader(instructions)

    st.header("Image")
    if file:
        raw_img = Image.open(file).convert("RGB")
    else:
        raw_img = load_demo_image()

    st.image(raw_img)  # , use_column_width=True)

    col1, col2 = st.columns(2)

    col1.header("Categories")

    cls_0 = col1.text_input("category 1", value="merlion")
    cls_1 = col1.text_input("category 2", value="sky")
    cls_2 = col1.text_input("category 3", value="giraffe")
    cls_3 = col1.text_input("category 4", value="fountain")
    cls_4 = col1.text_input("category 5", value="marina bay")

    cls_names = [cls_0, cls_1, cls_2, cls_3, cls_4]
    cls_names = [cls_nm for cls_nm in cls_names if len(cls_nm) > 0]

    if len(cls_names) != len(set(cls_names)):
        st.error("Please provide unique class names")
        return

    button = st.button("Submit")

    col2.header("Prediction")

    # ===== event =====

    if button:
        if model_type.startswith("BLIP"):
            text_processor = BlipCaptionProcessor(prompt="A picture of ")
            cls_prompt = [text_processor(cls_nm) for cls_nm in cls_names]

            if score_type == "Cosine":
                vis_processor = load_processor("blip_image_eval").build(image_size=224)
                img = vis_processor(raw_img).unsqueeze(0).to(device)

                feature_extractor = load_model_cache(model_type="blip", device=device)

                sample = {"image": img, "text_input": cls_prompt}

                with torch.no_grad():
                    image_features = feature_extractor.extract_features(
                        sample, mode="image"
                    ).image_embeds_proj[:, 0]
                    text_features = feature_extractor.extract_features(
                        sample, mode="text"
                    ).text_embeds_proj[:, 0]
                    sims = (image_features @ text_features.t())[
                        0
                    ] / feature_extractor.temp

            else:
                vis_processor = load_processor("blip_image_eval").build(image_size=384)
                img = vis_processor(raw_img).unsqueeze(0).to(device)

                model = load_blip_itm_model(device)

                output = model(img, cls_prompt, match_head="itm")
                sims = output[:, 1]

            sims = torch.nn.Softmax(dim=0)(sims)
            inv_sims = [sim * 100 for sim in sims.tolist()[::-1]]

        elif model_type.startswith("ALBEF"):
            vis_processor = load_processor("blip_image_eval").build(image_size=224)
            img = vis_processor(raw_img).unsqueeze(0).to(device)

            text_processor = BlipCaptionProcessor(prompt="A picture of ")
            cls_prompt = [text_processor(cls_nm) for cls_nm in cls_names]

            feature_extractor = load_model_cache(model_type="albef", device=device)

            sample = {"image": img, "text_input": cls_prompt}

            with torch.no_grad():
                image_features = feature_extractor.extract_features(
                    sample, mode="image"
                ).image_embeds_proj[:, 0]
                text_features = feature_extractor.extract_features(
                    sample, mode="text"
                ).text_embeds_proj[:, 0]

                st.write(image_features.shape)
                st.write(text_features.shape)

                sims = (image_features @ text_features.t())[0] / feature_extractor.temp

            sims = torch.nn.Softmax(dim=0)(sims)
            inv_sims = [sim * 100 for sim in sims.tolist()[::-1]]

        elif model_type.startswith("CLIP"):
            if model_type == "CLIP_ViT-B-32":
                model = load_model_cache(model_type="CLIP_ViT-B-32", device=device)
            elif model_type == "CLIP_ViT-B-16":
                model = load_model_cache(model_type="CLIP_ViT-B-16", device=device)
            elif model_type == "CLIP_ViT-L-14":
                model = load_model_cache(model_type="CLIP_ViT-L-14", device=device)
            else:
                raise ValueError(f"Unknown model type {model_type}")

            if score_type == "Cosine":
                # image_preprocess = ClipImageEvalProcessor(image_size=336)
                image_preprocess = ClipImageEvalProcessor(image_size=224)
                img = image_preprocess(raw_img).unsqueeze(0).to(device)

                sample = {"image": img, "text_input": cls_names}

                with torch.no_grad():
                    clip_features = model.extract_features(sample)

                    image_features = clip_features.image_embeds_proj
                    text_features = clip_features.text_embeds_proj

                    sims = (100.0 * image_features @ text_features.T)[0].softmax(dim=-1)
                    inv_sims = sims.tolist()[::-1]
            else:
                st.warning("CLIP does not support multimodal scoring.")
                return

        fig = go.Figure(
            go.Bar(
                x=inv_sims,
                y=cls_names[::-1],
                text=["{:.2f}".format(s) for s in inv_sims],
                orientation="h",
            )
        )
        fig.update_traces(
            textfont_size=12,
            textangle=0,
            textposition="outside",
            cliponaxis=False,
        )
        col2.plotly_chart(fig, use_container_width=True)