Commit 2dec07d · paul hilders committed · Parent: c148272

Added first application file

Files changed:
- CLIP_explainability/Transformer-MM-Explainability (+1, -0)
- app.py (+57, -0)
CLIP_explainability/Transformer-MM-Explainability  ADDED
@@ -0,0 +1 @@
+Subproject commit 6a2c3c9da3fc186878e0c2bcf238c3a4c76d8af8
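Note (not part of the commit): app.py below imports the explainability code from this pinned submodule by appending its path to sys.path, so the Space only runs if the submodule has actually been checked out. A minimal, hypothetical guard, assuming only the Python standard library, could look like this sketch:

# Sketch only (not in the commit): fail early with a clear message if the
# pinned submodule is missing from the checkout.
import os
import sys

SUBMODULE = "CLIP_explainability/Transformer-MM-Explainability"  # pinned at 6a2c3c9 above

if not os.path.isdir(SUBMODULE):
    raise RuntimeError(f"Submodule '{SUBMODULE}' not found; initialise it before running app.py.")
sys.path.append(SUBMODULE)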
app.py  ADDED
@@ -0,0 +1,57 @@
+import sys
+import gradio as gr
+
+# sys.path.append("../")
+sys.path.append("CLIP_explainability/Transformer-MM-Explainability/")
+
+import torch
+import CLIP.clip as clip
+
+
+from clip_grounding.utils.image import pad_to_square
+from clip_grounding.datasets.png import (
+    overlay_relevance_map_on_image,
+)
+from CLIP_explainability.utils import interpret, show_img_heatmap, show_heatmap_on_text
+
+clip.clip._MODELS = {
+    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
+    "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
+}
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
+
+# Gradio Section:
+def run_demo(image, text):
+    orig_image = pad_to_square(image)
+    img = preprocess(orig_image).unsqueeze(0).to(device)
+    text_input = clip.tokenize([text]).to(device)
+
+    R_text, R_image = interpret(model=model, image=img, texts=text_input, device=device)
+
+    image_relevance = show_img_heatmap(R_image[0], img, orig_image=orig_image, device=device, show=False)
+    overlapped = overlay_relevance_map_on_image(image, image_relevance)
+
+    text_scores, text_tokens_decoded = show_heatmap_on_text(text, text_input, R_text[0], show=False)
+
+    highlighted_text = []
+    for i, token in enumerate(text_tokens_decoded):
+        highlighted_text.append((str(token), float(text_scores[i])))
+
+    return overlapped, highlighted_text
+
+input_img = gr.inputs.Image(type='pil', label="Original Image")
+input_txt = "text"
+inputs = [input_img, input_txt]
+
+outputs = [gr.inputs.Image(type='pil', label="Output Image"), "highlight"]
+
+
+iface = gr.Interface(fn=run_demo,
+                     inputs=inputs,
+                     outputs=outputs,
+                     title="CLIP Grounding Explainability",
+                     description="A demonstration based on the Generic Attention-model Explainability method for Interpreting Bi-Modal Transformers by Chefer et al. (2021): https://github.com/hila-chefer/Transformer-MM-Explainability.",
+                     examples=[["harrypotter.png", "Harry"], ["harrypotter.png", "Hermione"], ["harrypotter.png", "Ron"], ["Amsterdam.png", "Amsterdam canal"], ["Amsterdam.png", "Old buildings"], ["Amsterdam.png", "Pink flowers"], ["dogs_on_bed.png", "Two dogs"], ["dogs_on_bed.png", "Book"], ["dogs_on_bed.png", "Cat"], ["Solar_system.png", "Sun"], ["Solar_system.png", "Earth"]])
+iface.launch(debug=True)
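Note (not part of the commit): the interface above targets the Gradio 2.x component API, where components live under gr.inputs, and it reuses gr.inputs.Image for the output image rather than gr.outputs.Image. On Gradio 3.x or later, where that namespace was removed, a roughly equivalent interface might look like the sketch below; gr.Image, gr.Textbox, and gr.HighlightedText are assumptions about the installed Gradio version, and run_demo plus the example images are the ones defined in the file above.

# Sketch only: an approximate Gradio >= 3.x equivalent of the interface above.
# Assumes the same run_demo function and example images are available.
import gradio as gr

iface = gr.Interface(
    fn=run_demo,
    inputs=[
        gr.Image(type="pil", label="Original Image"),  # replaces gr.inputs.Image
        gr.Textbox(label="Text"),                      # replaces the "text" shortcut
    ],
    outputs=[
        gr.Image(type="pil", label="Output Image"),    # replaces gr.inputs.Image used as an output
        gr.HighlightedText(label="Text relevance"),    # replaces the "highlight" shortcut
    ],
    title="CLIP Grounding Explainability",
    description="Generic Attention-model Explainability for Interpreting Bi-Modal Transformers (Chefer et al., 2021).",
    examples=[["harrypotter.png", "Harry"], ["Amsterdam.png", "Amsterdam canal"]],
)
iface.launch(debug=True)

The (token, score) pairs returned by run_demo should map directly onto a HighlightedText output, which renders each token with an intensity proportional to its relevance score.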