File size: 5,921 Bytes
0241217
 
 
 
 
 
 
 
 
d80767e
c3ca2bd
d80767e
e71dad4
ae6e057
e71dad4
0241217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae6e057
 
 
d80767e
0241217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
929c841
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201e3f5
 
c3ca2bd
41478ca
 
c3ca2bd
 
 
929c841
cca85c2
 
2910b3b
 
 
 
14dfe9b
 
6d91375
 
cca85c2
6d91375
c3ca2bd
6d91375
 
201e3f5
c3ca2bd
 
6d91375
 
0241217
f0f4f09
 
 
0241217
6d91375
0241217
 
929c841
be112ad
 
 
 
 
 
 
 
 
 
 
 
d46a4b3
 
3cba57d
929c841
5d101e4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import sys
import gradio as gr

# Extend the import search path with the Transformer-MM-Explainability
# directory (relative to the current working directory). Presumably this is
# what makes `import CLIP.clip` below resolvable -- confirm against the
# repository layout.
sys.path.append("CLIP_explainability/Transformer-MM-Explainability/")

import torch
import CLIP.clip as clip

import spacy
from PIL import Image, ImageFont, ImageDraw, ImageOps

import os
import subprocess

# Best-effort install of the spaCy English model that is imported further
# down. The download runs with the interpreter executing this script
# (`sys.executable`) so the model lands in the same environment, and it is
# passed as an argument list so no shell is involved. A failure is reported
# but not raised: if the model is already installed the app can still start,
# and if it is missing the later `import en_core_web_sm` will fail.
_spacy_download = subprocess.run(
    [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
    check=False,
)
if _spacy_download.returncode != 0:
    print(
        "warning: 'spacy download en_core_web_sm' exited with status "
        f"{_spacy_download.returncode}",
        file=sys.stderr,
    )


from clip_grounding.utils.image import pad_to_square
from clip_grounding.datasets.png import (
    overlay_relevance_map_on_image,
)
from CLIP_explainability.utils import interpret, show_img_heatmap, show_heatmap_on_text

# Replace the `_MODELS` mapping of the CLIP module with the two ViT-B
# checkpoints below (model name -> checkpoint URL).
clip.clip._MODELS = dict(
    [
        (
            "ViT-B/32",
            "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
        ),
        (
            "ViT-B/16",
            "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
        ),
    ]
)

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the ViT-B/32 model and its image preprocessing callable with JIT
# disabled.
model, preprocess = clip.load("ViT-B/32", jit=False, device=device)

# Load the spaCy English pipeline through its installed package
# (`en_core_web_sm.load()`) instead of by name via `spacy.load`. The
# resulting `nlp` object is what the NER demo calls to extract entities.
import en_core_web_sm
nlp = en_core_web_sm.load()

# Gradio Section:
def run_demo(image, text):
    """Explain a CLIP image/text pair as an image overlay plus token scores.

    Args:
        image: Input image. It is padded to a square before being
            preprocessed for the model (presumably a PIL image, as the
            Gradio inputs are configured with ``type='pil'``).
        text: Prompt that is tokenized with CLIP and explained against the
            image.

    Returns:
        A 2-tuple ``(overlay, scored_tokens)``. ``overlay`` is whatever
        ``overlay_relevance_map_on_image`` returns for the original
        (unpadded) image and the computed relevance map. ``scored_tokens``
        is a list of ``(token, score)`` pairs, with the token converted to
        ``str`` and the score to ``float``.
    """
    # Model inputs: a single-image batch and a single-prompt token batch.
    padded = pad_to_square(image)
    image_batch = preprocess(padded).unsqueeze(0).to(device)
    token_batch = clip.tokenize([text]).to(device)

    text_relevance, image_relevance = interpret(
        model=model, image=image_batch, texts=token_batch, device=device
    )

    # Only the first (and only) batch entry is visualised.
    heatmap = show_img_heatmap(
        image_relevance[0], image_batch, orig_image=padded, device=device, show=False
    )
    overlay = overlay_relevance_map_on_image(image, heatmap)

    scores, decoded_tokens = show_heatmap_on_text(
        text, token_batch, text_relevance[0], show=False
    )

    scored_tokens = [
        (str(tok), float(scores[idx])) for idx, tok in enumerate(decoded_tokens)
    ]
    return overlay, scored_tokens


# Default demo: one image and one text prompt in; the relevance overlay and
# the per-token highlights returned by `run_demo` out.
input_img = gr.inputs.Image(type='pil', label="Original Image")
input_txt = "text"
inputs = [input_img, input_txt]

# The overlay is an output of `run_demo`, so it is declared with the
# output-side Image component rather than with `gr.inputs.Image`.
outputs = [gr.outputs.Image(type='pil', label="Output Image"), "highlight"]


iface = gr.Interface(fn=run_demo,
                     inputs=inputs,
                     outputs=outputs,
                     title="CLIP Grounding Explainability",
                     description="A demonstration based on the Generic Attention-model Explainability method for Interpreting Bi-Modal Transformers by Chefer et al. (2021): https://github.com/hila-chefer/Transformer-MM-Explainability.",
                     # Each example row is [image path, text prompt].
                     examples=[["example_images/London.png", "London Eye"],
                               ["example_images/London.png", "Big Ben"],
                               ["example_images/harrypotter.png", "Harry"],
                               ["example_images/harrypotter.png", "Hermione"],
                               ["example_images/harrypotter.png", "Ron"],
                               ["example_images/Amsterdam.png", "Amsterdam canal"],
                               ["example_images/Amsterdam.png", "Old buildings"],
                               ["example_images/Amsterdam.png", "Pink flowers"],
                               ["example_images/dogs_on_bed.png", "Two dogs"],
                               ["example_images/dogs_on_bed.png", "Book"],
                               ["example_images/dogs_on_bed.png", "Cat"]])

# NER demo:
def add_label_to_img(img, label):
    """Return ``img`` with a white border and ``label`` drawn in its top-left corner.

    Args:
        img: PIL image to annotate. The border is added with
            ``ImageOps.expand`` and the text is drawn on that result.
        label: Text drawn in black at position (0, 0) of the bordered image.

    Returns:
        The bordered image (45 px of white on every side) with the label
        drawn on it.
    """
    bordered = ImageOps.expand(img, border=45, fill=(255, 255, 255))

    # "arial.ttf" is not guaranteed to be installed on the host, and
    # ``ImageFont.truetype`` raises OSError when a font cannot be opened.
    # Try a second common font name, then fall back to Pillow's built-in
    # default font, so a missing font does not abort the whole NER demo.
    font = None
    for font_name in ("arial.ttf", "DejaVuSans.ttf"):
        try:
            font = ImageFont.truetype(font_name, 36)
        except OSError:
            continue
        break
    if font is None:
        font = ImageFont.load_default()

    draw = ImageDraw.Draw(bordered)
    draw.text((0, 0), label, align="center", fill=(0, 0, 0), font=font)

    return bordered

def NER_demo(image, text):
    """Run the explainability demo on the full text and on each named entity.

    Args:
        image: Input image, forwarded unchanged to ``run_demo``.
        text: Input text. Named entities are extracted from it with the
            module-level spaCy pipeline ``nlp``.

    Returns:
        A 2-tuple ``(entities, gallery)``. ``entities`` is a list of
        ``(entity_text, entity_label)`` pairs. ``gallery`` is a list of
        images: first the overlay for the complete text, then one overlay
        per entity, each labelled with the entity text.
    """
    entities = [(span.text, span.label_) for span in nlp(text).ents]

    # The first gallery image explains the complete input text.
    full_overlay, _ = run_demo(image, text)
    gallery = [full_overlay]

    # One additional, labelled overlay per extracted entity.
    for entity_text, _entity_label in entities:
        entity_overlay, _ = run_demo(image, entity_text)
        gallery.append(add_label_to_img(entity_overlay, entity_text))

    return entities, gallery

# NER demo inputs: an image and a free-text prompt.
input_img_NER = gr.inputs.Image(type='pil', label="Original Image")
input_txt_NER = "text"
inputs_NER = [input_img_NER, input_txt_NER]

# NER demo outputs: the highlighted entities and a gallery of explanations.
outputs_NER = ["highlight", gr.Gallery(type='pil', label="NER Entity explanations")]

# Example prompts grouped by image; flattened below into the
# [image path, text prompt] rows passed to the interface, in this order.
_NER_EXAMPLE_PROMPTS = (
    ("example_images/London.png", ("London Eye", "Big Ben")),
    ("example_images/harrypotter.png", ("Harry", "Hermione", "Ron")),
    ("example_images/Amsterdam.png", ("Amsterdam canal", "Old buildings", "Pink flowers")),
    ("example_images/dogs_on_bed.png", ("Two dogs", "Book", "Cat")),
)
_examples_NER = [
    [image_path, prompt]
    for image_path, prompts in _NER_EXAMPLE_PROMPTS
    for prompt in prompts
]

iface_NER = gr.Interface(
    fn=NER_demo,
    inputs=inputs_NER,
    outputs=outputs_NER,
    examples=_examples_NER,
    cache_examples=False,
)

# Expose both demos as tabs of a single app and start serving it.
demo_tabs = gr.TabbedInterface([iface, iface_NER], ["Default", "NER"])
demo_tabs.launch(debug=True)