wenkai committed
Commit 846a3aa
1 Parent(s): c338275

Upload 12 files

app/__init__.py ADDED
@@ -0,0 +1,26 @@
"""
 # Copyright (c) 2022, salesforce.com, inc.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from PIL import Image
import requests

import streamlit as st
import torch


@st.cache()
def load_demo_image():
    img_url = (
        "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
    )
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
    return raw_image


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cache_root = "/export/home/.cache/lavis/"
app/calculate_coco_features.py ADDED
@@ -0,0 +1,87 @@
"""
 # Copyright (c) 2022, salesforce.com, inc.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from PIL import Image
import requests
import torch

import os

from lavis.common.registry import registry
from lavis.processors import *
from lavis.models import *
from lavis.common.utils import build_default_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def load_demo_image():
    img_url = (
        "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
    )
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")

    return raw_image


def read_img(filepath):
    raw_image = Image.open(filepath).convert("RGB")

    return raw_image


# model
model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base.pth"
feature_extractor = BlipFeatureExtractor(pretrained=model_url)

feature_extractor.eval()
feature_extractor = feature_extractor.to(device)

# preprocessors
vis_processor = BlipImageEvalProcessor(image_size=224)
text_processor = BlipCaptionProcessor()

# files to process
# file_root = "/export/home/.cache/lavis/coco/images/val2014"
file_root = "/export/home/.cache/lavis/coco/images/train2014"
filepaths = os.listdir(file_root)

print(len(filepaths))

caption = "dummy"

path2feat = dict()
bsz = 256

images_in_batch = []
filepaths_in_batch = []

for i, filename in enumerate(filepaths):
    if i % bsz == 0 and i > 0:
        images_in_batch = torch.cat(images_in_batch, dim=0).to(device)
        with torch.no_grad():
            image_features = feature_extractor(
                images_in_batch, caption, mode="image", normalized=True
            )[:, 0]

        for filepath, image_feat in zip(filepaths_in_batch, image_features):
            path2feat[os.path.basename(filepath)] = image_feat.detach().cpu()

        images_in_batch = []
        filepaths_in_batch = []

        print(len(path2feat), image_features.shape)
    else:
        filepath = os.path.join(file_root, filename)

        image = read_img(filepath)
        image = vis_processor(image).unsqueeze(0)

        images_in_batch.append(image)
        filepaths_in_batch.append(filepath)

torch.save(path2feat, "path2feat_coco_train2014.pth")
app/caption.py ADDED
@@ -0,0 +1,98 @@
"""
 # Copyright (c) 2022, salesforce.com, inc.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import streamlit as st
from app import device, load_demo_image
from app.utils import load_model_cache
from lavis.processors import load_processor
from PIL import Image


def app():
    # ===== layout =====
    model_type = st.sidebar.selectbox("Model:", ["BLIP_base", "BLIP_large"])

    sampling_method = st.sidebar.selectbox(
        "Sampling method:", ["Beam search", "Nucleus sampling"]
    )

    st.markdown(
        "<h1 style='text-align: center;'>Image Description Generation</h1>",
        unsafe_allow_html=True,
    )

    instructions = """Try the provided image or upload your own:"""
    file = st.file_uploader(instructions)

    use_beam = sampling_method == "Beam search"

    col1, col2 = st.columns(2)

    if file:
        raw_img = Image.open(file).convert("RGB")
    else:
        raw_img = load_demo_image()

    col1.header("Image")

    w, h = raw_img.size
    scaling_factor = 720 / w
    resized_image = raw_img.resize((int(w * scaling_factor), int(h * scaling_factor)))

    col1.image(resized_image, use_column_width=True)
    col2.header("Description")

    cap_button = st.button("Generate")

    # ==== event ====
    vis_processor = load_processor("blip_image_eval").build(image_size=384)

    if cap_button:
        if model_type.startswith("BLIP"):
            blip_type = model_type.split("_")[1].lower()
            model = load_model_cache(
                "blip_caption",
                model_type=f"{blip_type}_coco",
                is_eval=True,
                device=device,
            )

            img = vis_processor(raw_img).unsqueeze(0).to(device)
            captions = generate_caption(
                model=model, image=img, use_nucleus_sampling=not use_beam
            )

            col2.write("\n\n".join(captions), use_column_width=True)


def generate_caption(
    model, image, use_nucleus_sampling=False, num_beams=3, max_length=40, min_length=5
):
    samples = {"image": image}

    captions = []
    if use_nucleus_sampling:
        for _ in range(5):
            caption = model.generate(
                samples,
                use_nucleus_sampling=True,
                max_length=max_length,
                min_length=min_length,
                top_p=0.9,
            )
            captions.append(caption[0])
    else:
        caption = model.generate(
            samples,
            use_nucleus_sampling=False,
            num_beams=num_beams,
            max_length=max_length,
            min_length=min_length,
        )
        captions.append(caption[0])

    return captions
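The generate_caption helper above is independent of the Streamlit layer, so it can also be exercised from a plain script. A minimal, hypothetical sketch (assuming a local demo.jpg and the same LAVIS loaders used by this page; "base_coco" follows the f"{blip_type}_coco" naming above):

import torch
from PIL import Image
from lavis.models import load_model
from lavis.processors import load_processor

from app.caption import generate_caption

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hypothetical input image; any RGB image works here.
raw_img = Image.open("demo.jpg").convert("RGB")

vis_processor = load_processor("blip_image_eval").build(image_size=384)
model = load_model("blip_caption", model_type="base_coco", is_eval=True, device=device)

img = vis_processor(raw_img).unsqueeze(0).to(device)
print(generate_caption(model=model, image=img))  # beam search by default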
app/classification.py ADDED
@@ -0,0 +1,216 @@
"""
 # Copyright (c) 2022, salesforce.com, inc.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import plotly.graph_objects as go
import requests
import streamlit as st
import torch
from lavis.models import load_model
from lavis.processors import load_processor
from lavis.processors.blip_processors import BlipCaptionProcessor
from PIL import Image

from app import device, load_demo_image
from app.utils import load_blip_itm_model
from lavis.processors.clip_processors import ClipImageEvalProcessor


@st.cache()
def load_demo_image(img_url=None):
    if not img_url:
        img_url = "https://img.atlasobscura.com/yDJ86L8Ou6aIjBsxnlAy5f164w1rjTgcHZcx2yUs4mo/rt:fit/w:1200/q:81/sm:1/scp:1/ar:1/aHR0cHM6Ly9hdGxh/cy1kZXYuczMuYW1h/em9uYXdzLmNvbS91/cGxvYWRzL3BsYWNl/X2ltYWdlcy85MDll/MDRjOS00NTJjLTQx/NzQtYTY4MS02NmQw/MzI2YWIzNjk1ZGVk/MGZhMTJiMTM5MmZi/NGFfUmVhcl92aWV3/X29mX3RoZV9NZXJs/aW9uX3N0YXR1ZV9h/dF9NZXJsaW9uX1Bh/cmssX1NpbmdhcG9y/ZSxfd2l0aF9NYXJp/bmFfQmF5X1NhbmRz/X2luX3RoZV9kaXN0/YW5jZV8tXzIwMTQw/MzA3LmpwZw.jpg"
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
    return raw_image


@st.cache(
    hash_funcs={
        torch.nn.parameter.Parameter: lambda parameter: parameter.data.detach()
        .cpu()
        .numpy()
    },
    allow_output_mutation=True,
)
def load_model_cache(model_type, device):
    if model_type == "blip":
        model = load_model(
            "blip_feature_extractor", model_type="base", is_eval=True, device=device
        )
    elif model_type == "albef":
        model = load_model(
            "albef_feature_extractor", model_type="base", is_eval=True, device=device
        )
    elif model_type == "CLIP_ViT-B-32":
        model = load_model(
            "clip_feature_extractor", "ViT-B-32", is_eval=True, device=device
        )
    elif model_type == "CLIP_ViT-B-16":
        model = load_model(
            "clip_feature_extractor", "ViT-B-16", is_eval=True, device=device
        )
    elif model_type == "CLIP_ViT-L-14":
        model = load_model(
            "clip_feature_extractor", "ViT-L-14", is_eval=True, device=device
        )

    return model


def app():
    model_type = st.sidebar.selectbox(
        "Model:",
        ["ALBEF", "BLIP_Base", "CLIP_ViT-B-32", "CLIP_ViT-B-16", "CLIP_ViT-L-14"],
    )
    score_type = st.sidebar.selectbox("Score type:", ["Cosine", "Multimodal"])

    # ===== layout =====
    st.markdown(
        "<h1 style='text-align: center;'>Zero-shot Classification</h1>",
        unsafe_allow_html=True,
    )

    instructions = """Try the provided image or upload your own:"""
    file = st.file_uploader(instructions)

    st.header("Image")
    if file:
        raw_img = Image.open(file).convert("RGB")
    else:
        raw_img = load_demo_image()

    st.image(raw_img)  # , use_column_width=True)

    col1, col2 = st.columns(2)

    col1.header("Categories")

    cls_0 = col1.text_input("category 1", value="merlion")
    cls_1 = col1.text_input("category 2", value="sky")
    cls_2 = col1.text_input("category 3", value="giraffe")
    cls_3 = col1.text_input("category 4", value="fountain")
    cls_4 = col1.text_input("category 5", value="marina bay")

    cls_names = [cls_0, cls_1, cls_2, cls_3, cls_4]
    cls_names = [cls_nm for cls_nm in cls_names if len(cls_nm) > 0]

    if len(cls_names) != len(set(cls_names)):
        st.error("Please provide unique class names")
        return

    button = st.button("Submit")

    col2.header("Prediction")

    # ===== event =====

    if button:
        if model_type.startswith("BLIP"):
            text_processor = BlipCaptionProcessor(prompt="A picture of ")
            cls_prompt = [text_processor(cls_nm) for cls_nm in cls_names]

            if score_type == "Cosine":
                vis_processor = load_processor("blip_image_eval").build(image_size=224)
                img = vis_processor(raw_img).unsqueeze(0).to(device)

                feature_extractor = load_model_cache(model_type="blip", device=device)

                sample = {"image": img, "text_input": cls_prompt}

                with torch.no_grad():
                    image_features = feature_extractor.extract_features(
                        sample, mode="image"
                    ).image_embeds_proj[:, 0]
                    text_features = feature_extractor.extract_features(
                        sample, mode="text"
                    ).text_embeds_proj[:, 0]
                    sims = (image_features @ text_features.t())[
                        0
                    ] / feature_extractor.temp

            else:
                vis_processor = load_processor("blip_image_eval").build(image_size=384)
                img = vis_processor(raw_img).unsqueeze(0).to(device)

                model = load_blip_itm_model(device)

                output = model(img, cls_prompt, match_head="itm")
                sims = output[:, 1]

            sims = torch.nn.Softmax(dim=0)(sims)
            inv_sims = [sim * 100 for sim in sims.tolist()[::-1]]

        elif model_type.startswith("ALBEF"):
            vis_processor = load_processor("blip_image_eval").build(image_size=224)
            img = vis_processor(raw_img).unsqueeze(0).to(device)

            text_processor = BlipCaptionProcessor(prompt="A picture of ")
            cls_prompt = [text_processor(cls_nm) for cls_nm in cls_names]

            feature_extractor = load_model_cache(model_type="albef", device=device)

            sample = {"image": img, "text_input": cls_prompt}

            with torch.no_grad():
                image_features = feature_extractor.extract_features(
                    sample, mode="image"
                ).image_embeds_proj[:, 0]
                text_features = feature_extractor.extract_features(
                    sample, mode="text"
                ).text_embeds_proj[:, 0]

                st.write(image_features.shape)
                st.write(text_features.shape)

                sims = (image_features @ text_features.t())[0] / feature_extractor.temp

            sims = torch.nn.Softmax(dim=0)(sims)
            inv_sims = [sim * 100 for sim in sims.tolist()[::-1]]

        elif model_type.startswith("CLIP"):
            if model_type == "CLIP_ViT-B-32":
                model = load_model_cache(model_type="CLIP_ViT-B-32", device=device)
            elif model_type == "CLIP_ViT-B-16":
                model = load_model_cache(model_type="CLIP_ViT-B-16", device=device)
            elif model_type == "CLIP_ViT-L-14":
                model = load_model_cache(model_type="CLIP_ViT-L-14", device=device)
            else:
                raise ValueError(f"Unknown model type {model_type}")

            if score_type == "Cosine":
                # image_preprocess = ClipImageEvalProcessor(image_size=336)
                image_preprocess = ClipImageEvalProcessor(image_size=224)
                img = image_preprocess(raw_img).unsqueeze(0).to(device)

                sample = {"image": img, "text_input": cls_names}

                with torch.no_grad():
                    clip_features = model.extract_features(sample)

                    image_features = clip_features.image_embeds_proj
                    text_features = clip_features.text_embeds_proj

                    sims = (100.0 * image_features @ text_features.T)[0].softmax(dim=-1)
                    inv_sims = sims.tolist()[::-1]
            else:
                st.warning("CLIP does not support multimodal scoring.")
                return

        fig = go.Figure(
            go.Bar(
                x=inv_sims,
                y=cls_names[::-1],
                text=["{:.2f}".format(s) for s in inv_sims],
                orientation="h",
            )
        )
        fig.update_traces(
            textfont_size=12,
            textangle=0,
            textposition="outside",
            cliponaxis=False,
        )
        col2.plotly_chart(fig, use_container_width=True)
app/dataset_browser.py ADDED
@@ -0,0 +1,240 @@
"""
 # Copyright (c) 2022, salesforce.com, inc.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import random
from collections import OrderedDict
from functools import reduce

import streamlit as st
from lavis.common.registry import registry
from lavis.datasets.builders import dataset_zoo, load_dataset
from lavis.datasets.builders.base_dataset_builder import load_dataset_config
from PIL import Image

IMAGE_LAYOUT = 3, 4
VIDEO_LAYOUT = 1, 2

PREV_STR = "Prev"
NEXT_STR = "Next"


def sample_dataset(dataset, indices):
    samples = [dataset.displ_item(idx) for idx in indices]

    return samples


def get_concat_v(im1, im2):
    margin = 5

    canvas_size = (im1.width + im2.width + margin, max(im1.height, im2.height))
    canvas = Image.new("RGB", canvas_size, "White")
    canvas.paste(im1, (0, 0))
    canvas.paste(im2, (im1.width + margin, 0))

    return canvas


def resize_img_w(raw_img, new_w=224):
    if isinstance(raw_img, list):
        resized_imgs = [resize_img_w(img, 196) for img in raw_img]
        # concatenate images
        resized_image = reduce(get_concat_v, resized_imgs)
    else:
        w, h = raw_img.size
        scaling_factor = new_w / w
        resized_image = raw_img.resize(
            (int(w * scaling_factor), int(h * scaling_factor))
        )

    return resized_image


def get_visual_key(dataset):
    if "image" in dataset[0]:
        return "image"
    elif "image0" in dataset[0]:  # NLVR2 dataset
        return "image"
    elif "video" in dataset[0]:
        return "video"
    else:
        raise ValueError("Visual key not found.")


def gather_items(samples, exclude=[]):
    gathered = []

    for s in samples:
        ns = OrderedDict()
        for k in s.keys():
            if k not in exclude:
                ns[k] = s[k]

        gathered.append(ns)

    return gathered


@st.cache(allow_output_mutation=True)
def load_dataset_cache(name):
    return load_dataset(name)


def format_text(text):
    md = "\n\n".join([f"**{k}**: {v}" for k, v in text.items()])

    return md


def show_samples(dataset, offset=0, is_next=False):
    visual_key = get_visual_key(dataset)

    num_rows, num_cols = IMAGE_LAYOUT if visual_key == "image" else VIDEO_LAYOUT
    n_samples = num_rows * num_cols

    if not shuffle:
        if is_next:
            start = min(int(start_idx) + offset + n_samples, len(dataset) - n_samples)
        else:
            start = max(0, int(start_idx) + offset - n_samples)

        st.session_state.last_start = start
        end = min(start + n_samples, len(dataset))

        indices = list(range(start, end))
    else:
        indices = random.sample(range(len(dataset)), n_samples)
    samples = sample_dataset(dataset, indices)

    visual_info = (
        iter([resize_img_w(s[visual_key]) for s in samples])
        if visual_key == "image"
        # else iter([s[visual_key] for s in samples])
        else iter([s["file"] for s in samples])
    )
    text_info = gather_items(samples, exclude=["image", "video"])
    text_info = iter([format_text(s) for s in text_info])

    st.markdown(
        """<hr style="height:1px;border:none;color:#c7ccd4;background-color:#c7ccd4;"/> """,
        unsafe_allow_html=True,
    )
    for _ in range(num_rows):
        with st.container():
            for col in st.columns(num_cols):
                # col.text(next(text_info))
                # col.caption(next(text_info))
                try:
                    col.markdown(next(text_info))
                    if visual_key == "image":
                        col.image(next(visual_info), use_column_width=True, clamp=True)
                    elif visual_key == "video":
                        col.markdown(
                            "![Alt Text](https://media.giphy.com/media/vFKqnCdLPNOKc/giphy.gif)"
                        )
                except StopIteration:
                    break

    st.markdown(
        """<hr style="height:1px;border:none;color:#c7ccd4;background-color:#c7ccd4;"/> """,
        unsafe_allow_html=True,
    )

    st.session_state.n_display = n_samples


if __name__ == "__main__":
    st.set_page_config(
        page_title="LAVIS Dataset Explorer",
        # layout="wide",
        initial_sidebar_state="expanded",
    )

    dataset_name = st.sidebar.selectbox("Dataset:", dataset_zoo.get_names())

    function = st.sidebar.selectbox("Function:", ["Browser"], index=0)

    if function == "Browser":
        shuffle = st.sidebar.selectbox("Shuffled:", [True, False], index=0)

        dataset = load_dataset_cache(dataset_name)
        split = st.sidebar.selectbox("Split:", dataset.keys())

        dataset_len = len(dataset[split])
        st.success(
            f"Loaded {dataset_name}/{split} with **{dataset_len}** records. **Image/video directory**: {dataset[split].vis_root}"
        )

        if "last_dataset" not in st.session_state:
            st.session_state.last_dataset = dataset_name
            st.session_state.last_split = split

        if "last_start" not in st.session_state:
            st.session_state.last_start = 0

        if "start_idx" not in st.session_state:
            st.session_state.start_idx = 0

        if "shuffle" not in st.session_state:
            st.session_state.shuffle = shuffle

        if "first_run" not in st.session_state:
            st.session_state.first_run = True
        elif (
            st.session_state.last_dataset != dataset_name
            or st.session_state.last_split != split
        ):
            st.session_state.first_run = True

            st.session_state.last_dataset = dataset_name
            st.session_state.last_split = split
        elif st.session_state.shuffle != shuffle:
            st.session_state.shuffle = shuffle
            st.session_state.first_run = True

        if not shuffle:
            n_col, p_col = st.columns([0.05, 1])

            prev_button = n_col.button(PREV_STR)
            next_button = p_col.button(NEXT_STR)

        else:
            next_button = st.button(NEXT_STR)

        if not shuffle:
            start_idx = st.sidebar.text_input(f"Begin from (total {dataset_len})", 0)

            if not start_idx.isdigit():
                st.error(f"Input to 'Begin from' must be digits, found {start_idx}.")
            else:
                if int(start_idx) != st.session_state.start_idx:
                    st.session_state.start_idx = int(start_idx)
                    st.session_state.last_start = int(start_idx)

            if prev_button:
                show_samples(
                    dataset[split],
                    offset=st.session_state.last_start - st.session_state.start_idx,
                    is_next=False,
                )

        if next_button:
            show_samples(
                dataset[split],
                offset=st.session_state.last_start - st.session_state.start_idx,
                is_next=True,
            )

        if st.session_state.first_run:
            st.session_state.first_run = False

            show_samples(
                dataset[split],
                offset=st.session_state.last_start - st.session_state.start_idx,
                is_next=True,
            )
app/image_text_match.py ADDED
@@ -0,0 +1,87 @@
"""
 # Copyright (c) 2022, salesforce.com, inc.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import numpy as np
import streamlit as st
import torch
from lavis.models.blip_models.blip_image_text_matching import compute_gradcam
from lavis.processors import load_processor
from PIL import Image

from app import device, load_demo_image
from app.utils import getAttMap, init_bert_tokenizer, load_blip_itm_model


def app():
    model_type = st.sidebar.selectbox("Model:", ["BLIP_base", "BLIP_large"])

    if model_type.startswith("BLIP"):
        blip_type = model_type.split("_")[1]
        model = load_blip_itm_model(device, model_type=blip_type)

    vis_processor = load_processor("blip_image_eval").build(image_size=384)

    st.markdown(
        "<h1 style='text-align: center;'>Image Text Matching</h1>",
        unsafe_allow_html=True,
    )

    values = list(range(1, 12))
    default_layer_num = values.index(7)
    layer_num = (
        st.sidebar.selectbox("Layer number", values, index=default_layer_num) - 1
    )

    instructions = """Try the provided image or upload your own:"""
    file = st.file_uploader(instructions)

    col1, col2 = st.columns(2)
    col1.header("Image")
    col2.header("GradCam")
    if file:
        raw_img = Image.open(file).convert("RGB")
    else:
        raw_img = load_demo_image()

    w, h = raw_img.size
    scaling_factor = 720 / w
    resized_image = raw_img.resize((int(w * scaling_factor), int(h * scaling_factor)))
    col1.image(resized_image, use_column_width=True)

    col3, col4 = st.columns(2)
    col3.header("Text")
    user_question = col3.text_input(
        "Input your sentence!", "a woman sitting on the beach with a dog"
    )
    submit_button = col3.button("Submit")

    col4.header("Matching score")

    if submit_button:
        tokenizer = init_bert_tokenizer()

        img = vis_processor(raw_img).unsqueeze(0).to(device)
        text_processor = load_processor("blip_caption").build()

        qry = text_processor(user_question)

        norm_img = np.float32(resized_image) / 255

        qry_tok = tokenizer(qry, return_tensors="pt").to(device)
        gradcam, output = compute_gradcam(model, img, qry, qry_tok, block_num=layer_num)

        avg_gradcam = getAttMap(norm_img, gradcam[0][1], blur=True)

        col2.image(avg_gradcam, use_column_width=True, clamp=True)
        # output = model(img, question)
        itm_score = torch.nn.functional.softmax(output, dim=1)
        new_title = (
            '<p style="text-align: left; font-size: 25px;">\n{:.3f}%</p>'.format(
                itm_score[0][1].item() * 100
            )
        )
        col4.markdown(new_title, unsafe_allow_html=True)
app/main.py ADDED
@@ -0,0 +1,25 @@
"""
 # Copyright (c) 2022, salesforce.com, inc.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from app.multipage import MultiPage
from app import vqa, caption
from app import image_text_match as itm
from app import text_localization as tl
from app import multimodal_search as ms
from app import classification as cl


if __name__ == "__main__":
    app = MultiPage()

    app.add_page("Image Description Generation", caption.app)
    app.add_page("Multimodal Search", ms.app)
    app.add_page("Visual Question Answering", vqa.app)
    app.add_page("Image Text Matching", itm.app)
    app.add_page("Text Localization", tl.app)
    app.add_page("Classification", cl.app)
    app.run()
app/multimodal_search.py ADDED
@@ -0,0 +1,230 @@
"""
 # Copyright (c) 2022, salesforce.com, inc.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import os

import numpy as np
import streamlit as st
import torch
import torch.nn.functional as F
from app import cache_root, device
from app.utils import (
    getAttMap,
    init_bert_tokenizer,
    load_blip_itm_model,
    read_img,
    resize_img,
)
from lavis.models import load_model
from lavis.processors import load_processor


@st.cache(
    hash_funcs={
        torch.nn.parameter.Parameter: lambda parameter: parameter.data.detach()
        .cpu()
        .numpy()
    },
    allow_output_mutation=True,
)
def load_feat():
    from lavis.common.utils import download_url

    dirname = os.path.join(os.path.dirname(__file__), "assets")
    filename = "path2feat_coco_train2014.pth"
    filepath = os.path.join(dirname, filename)
    url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/path2feat_coco_train2014.pth"

    if not os.path.exists(filepath):
        download_url(url=url, root=dirname, filename="path2feat_coco_train2014.pth")

    path2feat = torch.load(filepath)
    paths = sorted(path2feat.keys())

    all_img_feats = torch.stack([path2feat[k] for k in paths], dim=0).to(device)

    return path2feat, paths, all_img_feats


@st.cache(
    hash_funcs={
        torch.nn.parameter.Parameter: lambda parameter: parameter.data.detach()
        .cpu()
        .numpy()
    },
    allow_output_mutation=True,
)
def load_feature_extractor_model(device):
    model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base.pth"

    model = load_model(
        "blip_feature_extractor", model_type="base", is_eval=True, device=device
    )
    model.load_from_pretrained(model_url)

    return model


def app():
    # === layout ===
    model_type = st.sidebar.selectbox("Model:", ["BLIP_base", "BLIP_large"])
    file_root = os.path.join(cache_root, "coco/images/train2014/")

    values = [12, 24, 48]
    default_layer_num = values.index(24)
    num_display = st.sidebar.selectbox(
        "Number of images:", values, index=default_layer_num
    )
    show_gradcam = st.sidebar.selectbox("Show GradCam:", [True, False], index=1)
    itm_ranking = st.sidebar.selectbox("Multimodal re-ranking:", [True, False], index=0)

    # st.title('Multimodal Search')
    st.markdown(
        "<h1 style='text-align: center;'>Multimodal Search</h1>", unsafe_allow_html=True
    )

    # === event ===
    vis_processor = load_processor("blip_image_eval").build(image_size=384)
    text_processor = load_processor("blip_caption")

    user_question = st.text_input(
        "Search query", "A dog running on the grass.", help="Type something to search."
    )
    user_question = text_processor(user_question)
    feature_extractor = load_feature_extractor_model(device)

    # ======= ITC =========
    sample = {"text_input": user_question}

    with torch.no_grad():
        text_feature = feature_extractor.extract_features(
            sample, mode="text"
        ).text_embeds_proj[0, 0]

    path2feat, paths, all_img_feats = load_feat()
    all_img_feats.to(device)
    all_img_feats = F.normalize(all_img_feats, dim=1)

    num_cols = 4
    num_rows = int(num_display / num_cols)

    similarities = text_feature @ all_img_feats.T
    indices = torch.argsort(similarities, descending=True)[:num_display]

    top_paths = [paths[ind.detach().cpu().item()] for ind in indices]
    sorted_similarities = [similarities[idx] for idx in indices]
    filenames = [os.path.join(file_root, p) for p in top_paths]

    # ========= ITM and GradCam ==========
    bsz = 4  # max number of images to avoid cuda oom
    if model_type.startswith("BLIP"):
        blip_type = model_type.split("_")[1]

        itm_model = load_blip_itm_model(device, model_type=blip_type)

    tokenizer = init_bert_tokenizer()
    queries_batch = [user_question] * bsz
    queries_tok_batch = tokenizer(queries_batch, return_tensors="pt").to(device)

    num_batches = int(num_display / bsz)

    avg_gradcams = []
    all_raw_images = []
    itm_scores = []

    for i in range(num_batches):
        filenames_in_batch = filenames[i * bsz : (i + 1) * bsz]
        raw_images, images = read_and_process_images(filenames_in_batch, vis_processor)
        gradcam, itm_output = compute_gradcam_batch(
            itm_model, images, queries_batch, queries_tok_batch
        )

        all_raw_images.extend([resize_img(r_img) for r_img in raw_images])
        norm_imgs = [np.float32(r_img) / 255 for r_img in raw_images]

        for norm_img, grad_cam in zip(norm_imgs, gradcam):
            avg_gradcam = getAttMap(norm_img, grad_cam[0], blur=True)
            avg_gradcams.append(avg_gradcam)

        with torch.no_grad():
            itm_score = torch.nn.functional.softmax(itm_output, dim=1)

        itm_scores.append(itm_score)

    # ========= ITM re-ranking =========
    itm_scores = torch.cat(itm_scores)[:, 1]
    if itm_ranking:
        itm_scores_sorted, indices = torch.sort(itm_scores, descending=True)

        avg_gradcams_sorted = []
        all_raw_images_sorted = []
        for idx in indices:
            avg_gradcams_sorted.append(avg_gradcams[idx])
            all_raw_images_sorted.append(all_raw_images[idx])

        avg_gradcams = avg_gradcams_sorted
        all_raw_images = all_raw_images_sorted

    if show_gradcam:
        images_to_show = iter(avg_gradcams)
    else:
        images_to_show = iter(all_raw_images)

    for _ in range(num_rows):
        with st.container():
            for col in st.columns(num_cols):
                col.image(next(images_to_show), use_column_width=True, clamp=True)


def read_and_process_images(image_paths, vis_processor):
    raw_images = [read_img(path) for path in image_paths]
    images = [vis_processor(r_img) for r_img in raw_images]
    images_tensors = torch.stack(images).to(device)

    return raw_images, images_tensors


def compute_gradcam_batch(model, visual_input, text_input, tokenized_text, block_num=6):
    model.text_encoder.base_model.base_model.encoder.layer[
        block_num
    ].crossattention.self.save_attention = True

    output = model({"image": visual_input, "text_input": text_input}, match_head="itm")
    loss = output[:, 1].sum()

    model.zero_grad()
    loss.backward()
    with torch.no_grad():
        mask = tokenized_text.attention_mask.view(
            tokenized_text.attention_mask.size(0), 1, -1, 1, 1
        )  # (bsz, 1, token_len, 1, 1)
        token_length = mask.sum() - 2
        token_length = token_length.cpu()
        # grads and cams [bsz, num_head, seq_len, image_patch]
        grads = model.text_encoder.base_model.base_model.encoder.layer[
            block_num
        ].crossattention.self.get_attn_gradients()
        cams = model.text_encoder.base_model.base_model.encoder.layer[
            block_num
        ].crossattention.self.get_attention_map()

        # assume using vit large with 576 num image patch
        cams = cams[:, :, :, 1:].reshape(visual_input.size(0), 12, -1, 24, 24) * mask
        grads = (
            grads[:, :, :, 1:].clamp(0).reshape(visual_input.size(0), 12, -1, 24, 24)
            * mask
        )

        gradcam = cams * grads
        # [enc token gradcam, average gradcam across token, gradcam for individual token]
        # gradcam = torch.cat((gradcam[0:1,:], gradcam[1:token_length+1, :].sum(dim=0, keepdim=True)/token_length, gradcam[1:, :]))
        gradcam = gradcam.mean(1).cpu().detach()
        gradcam = (
            gradcam[:, 1 : token_length + 1, :].sum(dim=1, keepdim=True) / token_length
        )

    return gradcam, output
app/multipage.py ADDED
@@ -0,0 +1,41 @@
"""
 # Copyright (c) 2022, salesforce.com, inc.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

"""
This file is the framework for generating multiple Streamlit applications
through an object-oriented framework.
"""

# Import necessary libraries
import streamlit as st


# Define the multipage class to manage the multiple apps in our program
class MultiPage:
    """Framework for combining multiple Streamlit applications."""

    def __init__(self) -> None:
        """Constructor; creates the list that stores all pages as an instance variable."""
        self.pages = []

    def add_page(self, title, func) -> None:
        """Add a page to the project.

        Args:
            title ([str]): The title of the page to add to the list of apps.

            func: Python function to render this page in Streamlit.
        """

        self.pages.append({"title": title, "function": func})

    def run(self):
        # Dropdown to select the page to run
        page = st.sidebar.selectbox(
            "Navigation", self.pages, format_func=lambda page: page["title"]
        )

        # Run the selected page's app function
        page["function"]()
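The docstrings above describe the MultiPage pattern only in prose; a minimal usage sketch (a stripped-down version of what app/main.py earlier in this commit does, registering a single page) looks like this:

from app.multipage import MultiPage
from app import caption

# Register one page and hand control to the sidebar navigation dropdown.
demo = MultiPage()
demo.add_page("Image Description Generation", caption.app)
demo.run()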
app/text_localization.py ADDED
@@ -0,0 +1,105 @@
"""
 # Copyright (c) 2022, salesforce.com, inc.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import math

import numpy as np
import streamlit as st
from lavis.models.blip_models.blip_image_text_matching import compute_gradcam
from lavis.processors import load_processor
from PIL import Image

from app import device, load_demo_image
from app.utils import getAttMap, init_bert_tokenizer, load_blip_itm_model


def app():
    model_type = st.sidebar.selectbox("Model:", ["BLIP_base", "BLIP_large"])

    values = list(range(1, 12))
    default_layer_num = values.index(7)
    layer_num = (
        st.sidebar.selectbox("Layer number", values, index=default_layer_num) - 1
    )

    st.markdown(
        "<h1 style='text-align: center;'>Text Localization</h1>", unsafe_allow_html=True
    )

    vis_processor = load_processor("blip_image_eval").build(image_size=384)
    text_processor = load_processor("blip_caption")

    tokenizer = init_bert_tokenizer()

    instructions = "Try the provided image and text, or use your own."
    file = st.file_uploader(instructions)

    query = st.text_input(
        "Try a different input.", "A girl playing with her dog on the beach."
    )

    submit_button = st.button("Submit")

    col1, col2 = st.columns(2)

    if file:
        raw_img = Image.open(file).convert("RGB")
    else:
        raw_img = load_demo_image()

    col1.header("Image")
    w, h = raw_img.size
    scaling_factor = 720 / w
    resized_image = raw_img.resize((int(w * scaling_factor), int(h * scaling_factor)))
    col1.image(resized_image, use_column_width=True)

    col2.header("GradCam")

    if submit_button:
        if model_type.startswith("BLIP"):
            blip_type = model_type.split("_")[1]
            model = load_blip_itm_model(device, model_type=blip_type)

        img = vis_processor(raw_img).unsqueeze(0).to(device)
        qry = text_processor(query)

        qry_tok = tokenizer(qry, return_tensors="pt").to(device)

        norm_img = np.float32(resized_image) / 255

        gradcam, _ = compute_gradcam(model, img, qry, qry_tok, block_num=layer_num)

        avg_gradcam = getAttMap(norm_img, gradcam[0][1], blur=True)
        col2.image(avg_gradcam, use_column_width=True, clamp=True)

        num_cols = 4.0
        num_tokens = len(qry_tok.input_ids[0]) - 2

        num_rows = int(math.ceil(num_tokens / num_cols))

        gradcam_iter = iter(gradcam[0][2:-1])
        token_id_iter = iter(qry_tok.input_ids[0][1:-1])

        for _ in range(num_rows):
            with st.container():
                for col in st.columns(int(num_cols)):
                    token_id = next(token_id_iter, None)
                    if not token_id:
                        break
                    gradcam_img = next(gradcam_iter)

                    word = tokenizer.decode([token_id])
                    gradcam_todraw = getAttMap(norm_img, gradcam_img, blur=True)

                    new_title = (
                        '<p style="text-align: center; font-size: 25px;">{}</p>'.format(
                            word
                        )
                    )
                    col.markdown(new_title, unsafe_allow_html=True)
                    # st.image(image, channels="BGR")
                    col.image(gradcam_todraw, use_column_width=True, clamp=True)
app/utils.py ADDED
@@ -0,0 +1,81 @@
"""
 # Copyright (c) 2022, salesforce.com, inc.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import numpy as np
import streamlit as st
import torch
from lavis.models import BlipBase, load_model
from matplotlib import pyplot as plt
from PIL import Image
from scipy.ndimage import filters
from skimage import transform as skimage_transform


def resize_img(raw_img):
    w, h = raw_img.size
    scaling_factor = 240 / w
    resized_image = raw_img.resize((int(w * scaling_factor), int(h * scaling_factor)))
    return resized_image


def read_img(filepath):
    raw_image = Image.open(filepath).convert("RGB")

    return raw_image


@st.cache(
    hash_funcs={
        torch.nn.parameter.Parameter: lambda parameter: parameter.data.detach()
        .cpu()
        .numpy()
    },
    allow_output_mutation=True,
)
def load_model_cache(name, model_type, is_eval, device):
    return load_model(name, model_type, is_eval, device)


@st.cache(allow_output_mutation=True)
def init_bert_tokenizer():
    tokenizer = BlipBase.init_tokenizer()
    return tokenizer


def getAttMap(img, attMap, blur=True, overlap=True):
    attMap -= attMap.min()
    if attMap.max() > 0:
        attMap /= attMap.max()
    attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant")
    if blur:
        attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2]))
        attMap -= attMap.min()
        attMap /= attMap.max()
    cmap = plt.get_cmap("jet")
    attMapV = cmap(attMap)
    attMapV = np.delete(attMapV, 3, 2)
    if overlap:
        attMap = (
            1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img
            + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV
        )
    return attMap


@st.cache(
    hash_funcs={
        torch.nn.parameter.Parameter: lambda parameter: parameter.data.detach()
        .cpu()
        .numpy()
    },
    allow_output_mutation=True,
)
def load_blip_itm_model(device, model_type="base"):
    model = load_model(
        "blip_image_text_matching", model_type, is_eval=True, device=device
    )
    return model
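getAttMap above expects an RGB image as a float array in [0, 1] plus a 2-D attention map, and returns the blended heat-map overlay at the image's resolution. A small illustrative sketch with random data (the shapes are assumptions chosen only to show the contract, not values used anywhere in the app):

import numpy as np

from app.utils import getAttMap

# Illustrative only: a 224x224 RGB image in [0, 1] and a coarse 24x24 attention map.
img = np.random.rand(224, 224, 3).astype(np.float32)
att = np.random.rand(24, 24).astype(np.float32)

overlay = getAttMap(img, att, blur=True, overlap=True)
print(overlay.shape)  # (224, 224, 3): colored attention map blended onto the image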
app/vqa.py ADDED
@@ -0,0 +1,63 @@
"""
 # Copyright (c) 2022, salesforce.com, inc.
 # All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import streamlit as st
from app import load_demo_image, device
from app.utils import load_model_cache
from lavis.processors import load_processor
from PIL import Image


def app():
    model_type = st.sidebar.selectbox("Model:", ["BLIP"])

    # ===== layout =====
    st.markdown(
        "<h1 style='text-align: center;'>Visual Question Answering</h1>",
        unsafe_allow_html=True,
    )

    instructions = """Try the provided image or upload your own:"""
    file = st.file_uploader(instructions)

    col1, col2 = st.columns(2)

    col1.header("Image")
    if file:
        raw_img = Image.open(file).convert("RGB")
    else:
        raw_img = load_demo_image()

    w, h = raw_img.size
    scaling_factor = 720 / w
    resized_image = raw_img.resize((int(w * scaling_factor), int(h * scaling_factor)))

    col1.image(resized_image, use_column_width=True)
    col2.header("Question")

    user_question = col2.text_input("Input your question!", "What are objects there?")
    qa_button = st.button("Submit")

    col2.header("Answer")

    # ===== event =====
    vis_processor = load_processor("blip_image_eval").build(image_size=480)
    text_processor = load_processor("blip_question").build()

    if qa_button:
        if model_type.startswith("BLIP"):
            model = load_model_cache(
                "blip_vqa", model_type="vqav2", is_eval=True, device=device
            )

            img = vis_processor(raw_img).unsqueeze(0).to(device)
            question = text_processor(user_question)

            vqa_samples = {"image": img, "text_input": [question]}
            answers = model.predict_answers(vqa_samples, inference_method="generate")

            col2.write("\n".join(answers), use_column_width=True)