Commit 2dec07d · paul hilders committed · Parent: c148272

Added first application file

Files changed:
- CLIP_explainability/Transformer-MM-Explainability (+1, -0)
- app.py (+57, -0)
CLIP_explainability/Transformer-MM-Explainability  ADDED
@@ -0,0 +1 @@
+Subproject commit 6a2c3c9da3fc186878e0c2bcf238c3a4c76d8af8
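Note (not part of the commit): app.py below imports the explainability code from this pinned submodule by appending its path to sys.path, so the Space only runs if the submodule has actually been checked out. A minimal, hypothetical guard, assuming only the Python standard library, could look like this sketch:

# Sketch only (not in the commit): fail early with a clear message if the
# pinned submodule is missing from the checkout.
import os
import sys

SUBMODULE = "CLIP_explainability/Transformer-MM-Explainability"  # pinned at 6a2c3c9 above

if not os.path.isdir(SUBMODULE):
    raise RuntimeError(f"Submodule '{SUBMODULE}' not found; initialise it before running app.py.")
sys.path.append(SUBMODULE)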
app.py  ADDED
@@ -0,0 +1,57 @@
+import sys
+import gradio as gr
+
+# sys.path.append("../")
+sys.path.append("CLIP_explainability/Transformer-MM-Explainability/")
+
+import torch
+import CLIP.clip as clip
+
+
+from clip_grounding.utils.image import pad_to_square
+from clip_grounding.datasets.png import (
+    overlay_relevance_map_on_image,
+)
+from CLIP_explainability.utils import interpret, show_img_heatmap, show_heatmap_on_text
+
+clip.clip._MODELS = {
+    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
+    "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
+}
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
+
+# Gradio Section:
+def run_demo(image, text):
+    orig_image = pad_to_square(image)
+    img = preprocess(orig_image).unsqueeze(0).to(device)
+    text_input = clip.tokenize([text]).to(device)
+
+    R_text, R_image = interpret(model=model, image=img, texts=text_input, device=device)
+
+    image_relevance = show_img_heatmap(R_image[0], img, orig_image=orig_image, device=device, show=False)
+    overlapped = overlay_relevance_map_on_image(image, image_relevance)
+
+    text_scores, text_tokens_decoded = show_heatmap_on_text(text, text_input, R_text[0], show=False)
+
+    highlighted_text = []
+    for i, token in enumerate(text_tokens_decoded):
+        highlighted_text.append((str(token), float(text_scores[i])))
+
+    return overlapped, highlighted_text
+
+input_img = gr.inputs.Image(type='pil', label="Original Image")
+input_txt = "text"
+inputs = [input_img, input_txt]
+
+outputs = [gr.inputs.Image(type='pil', label="Output Image"), "highlight"]
+
+
+iface = gr.Interface(fn=run_demo,
+                     inputs=inputs,
+                     outputs=outputs,
+                     title="CLIP Grounding Explainability",
+                     description="A demonstration based on the Generic Attention-model Explainability method for Interpreting Bi-Modal Transformers by Chefer et al. (2021): https://github.com/hila-chefer/Transformer-MM-Explainability.",
+                     examples=[["harrypotter.png", "Harry"], ["harrypotter.png", "Hermione"], ["harrypotter.png", "Ron"], ["Amsterdam.png", "Amsterdam canal"], ["Amsterdam.png", "Old buildings"], ["Amsterdam.png", "Pink flowers"], ["dogs_on_bed.png", "Two dogs"], ["dogs_on_bed.png", "Book"], ["dogs_on_bed.png", "Cat"], ["Solar_system.png", "Sun"], ["Solar_system.png", "Earth"]])
+iface.launch(debug=True)
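Note (not part of the commit): the interface above targets the Gradio 2.x component API, where components live under gr.inputs, and it reuses gr.inputs.Image for the output image rather than gr.outputs.Image. On Gradio 3.x or later, where that namespace was removed, a roughly equivalent interface might look like the sketch below; gr.Image, gr.Textbox, and gr.HighlightedText are assumptions about the installed Gradio version, and run_demo plus the example images are the ones defined in the file above.

# Sketch only: an approximate Gradio >= 3.x equivalent of the interface above.
# Assumes the same run_demo function and example images are available.
import gradio as gr

iface = gr.Interface(
    fn=run_demo,
    inputs=[
        gr.Image(type="pil", label="Original Image"),  # replaces gr.inputs.Image
        gr.Textbox(label="Text"),                      # replaces the "text" shortcut
    ],
    outputs=[
        gr.Image(type="pil", label="Output Image"),    # replaces gr.inputs.Image used as an output
        gr.HighlightedText(label="Text relevance"),    # replaces the "highlight" shortcut
    ],
    title="CLIP Grounding Explainability",
    description="Generic Attention-model Explainability for Interpreting Bi-Modal Transformers (Chefer et al., 2021).",
    examples=[["harrypotter.png", "Harry"], ["Amsterdam.png", "Amsterdam canal"]],
)
iface.launch(debug=True)

The (token, score) pairs returned by run_demo should map directly onto a HighlightedText output, which renders each token with an intensity proportional to its relevance score.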