paul hilders committed on
Commit
2dec07d
1 Parent(s): c148272

Added first application file

Browse files
CLIP_explainability/Transformer-MM-Explainability ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 6a2c3c9da3fc186878e0c2bcf238c3a4c76d8af8
app.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import gradio as gr
3
+
4
+ # sys.path.append("../")
5
+ sys.path.append("CLIP_explainability/Transformer-MM-Explainability/")
6
+
7
+ import torch
8
+ import CLIP.clip as clip
9
+
10
+
11
+ from clip_grounding.utils.image import pad_to_square
12
+ from clip_grounding.datasets.png import (
13
+ overlay_relevance_map_on_image,
14
+ )
15
+ from CLIP_explainability.utils import interpret, show_img_heatmap, show_heatmap_on_text
16
+
17
# Replace the CLIP module's model table with exactly the two ViT-B
# checkpoints below (model name -> download URL).
# NOTE(review): presumably `clip.load` resolves model names through this
# table -- confirm against the vendored CLIP package.
clip.clip._MODELS = {
    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
    "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
}

# Use the GPU when torch reports one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the model and its preprocessing callable once, at import time; both
# are module-level globals read by `run_demo` on every request.
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
24
+
25
+ # Gradio Section:
26
def run_demo(image, text):
    """Produce image and text relevance explanations for one image/text pair.

    Args:
        image: The input image. It is passed to ``pad_to_square`` and to
            ``overlay_relevance_map_on_image``.
        text: The text prompt. It is tokenized with ``clip.tokenize`` and
            also passed through to ``show_heatmap_on_text``.

    Returns:
        A 2-tuple ``(overlapped, highlighted_text)`` where ``overlapped`` is
        whatever ``overlay_relevance_map_on_image`` returns and
        ``highlighted_text`` is a list of ``(token, score)`` pairs with the
        token converted to ``str`` and the score converted to ``float``.
    """
    # Pad first, then preprocess and add a leading batch dimension.
    square_image = pad_to_square(image)
    image_batch = preprocess(square_image).unsqueeze(0).to(device)
    token_batch = clip.tokenize([text]).to(device)

    text_relevance, image_relevance = interpret(
        model=model, image=image_batch, texts=token_batch, device=device
    )

    # Only the first (and only) batch entry is visualised.
    relevance_map = show_img_heatmap(
        image_relevance[0],
        image_batch,
        orig_image=square_image,
        device=device,
        show=False,
    )
    # NOTE(review): the overlay is drawn on the un-padded `image`, while the
    # relevance map was computed from the padded copy -- confirm this is
    # intended for non-square inputs.
    overlapped = overlay_relevance_map_on_image(image, relevance_map)

    scores, decoded_tokens = show_heatmap_on_text(
        text, token_batch, text_relevance[0], show=False
    )

    # Pair every decoded token with the score at the same index.
    highlighted_text = [
        (str(token), float(scores[idx]))
        for idx, token in enumerate(decoded_tokens)
    ]

    return overlapped, highlighted_text
43
+
44
# Inputs: a PIL image plus a free-text prompt ("text" is Gradio's string
# shortcut for a textbox component).
input_img = gr.inputs.Image(type='pil', label="Original Image")
input_txt = "text"
inputs = [input_img, input_txt]

# Outputs: the overlaid image and the per-token highlighted text.
# The image component must come from `gr.outputs`, not `gr.inputs`: the
# legacy Gradio API expects output components in this list, and the original
# code passed an input component here.
outputs = [gr.outputs.Image(type='pil', label="Output Image"), "highlight"]

# Each example is an [image path, prompt] pair matching the order of `inputs`.
iface = gr.Interface(
    fn=run_demo,
    inputs=inputs,
    outputs=outputs,
    title="CLIP Grounding Explainability",
    description="A demonstration based on the Generic Attention-model Explainability method for Interpreting Bi-Modal Transformers by Chefer et al. (2021): https://github.com/hila-chefer/Transformer-MM-Explainability.",
    examples=[
        ["harrypotter.png", "Harry"],
        ["harrypotter.png", "Hermione"],
        ["harrypotter.png", "Ron"],
        ["Amsterdam.png", "Amsterdam canal"],
        ["Amsterdam.png", "Old buildings"],
        ["Amsterdam.png", "Pink flowers"],
        ["dogs_on_bed.png", "Two dogs"],
        ["dogs_on_bed.png", "Book"],
        ["dogs_on_bed.png", "Cat"],
        ["Solar_system.png", "Sun"],
        ["Solar_system.png", "Earth"],
    ],
)
iface.launch(debug=True)