Spaces:
Runtime error
Runtime error
File size: 5,970 Bytes
0241217 d80767e c3ca2bd d80767e e71dad4 ae6e057 e71dad4 0241217 ae6e057 d80767e 0241217 929c841 66dfac7 9ee9e02 66dfac7 9ee9e02 66dfac7 929c841 66dfac7 929c841 39c7251 201e3f5 c3ca2bd 39c7251 c81ac67 39c7251 c3ca2bd 929c841 cca85c2 2910b3b 14dfe9b 6d91375 cca85c2 6d91375 39c7251 6d91375 c81ac67 c3ca2bd 6d91375 0241217 f0f4f09 0241217 6d91375 0241217 929c841 be112ad 66dfac7 2a6b42b c81ac67 3cba57d 929c841 5d101e4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
# Module setup for the CLIP grounding demo: imports, path tweaks, and one-time
# loading of the CLIP model and the spaCy pipeline shared by both demo tabs.
import sys
import gradio as gr
# sys.path.append("../")
# Add the bundled Transformer-MM-Explainability directory to the import path;
# presumably this is where the `CLIP` package imported below lives, so the
# append has to stay ahead of that import.
sys.path.append("CLIP_explainability/Transformer-MM-Explainability/")
import torch
import CLIP.clip as clip
import spacy
from PIL import Image, ImageFont, ImageDraw, ImageOps
import os
# Download the small English spaCy model at startup so that the
# `import en_core_web_sm` further down can succeed.
# NOTE(review): this runs on every start, uses whichever `python` is first on
# PATH, and ignores the command's exit status.
os.system('python -m spacy download en_core_web_sm')
from clip_grounding.utils.image import pad_to_square
from clip_grounding.datasets.png import (
overlay_relevance_map_on_image,
)
from CLIP_explainability.utils import interpret, show_img_heatmap, show_heatmap_on_text
# Overwrite the CLIP module's `_MODELS` table with the two ViT-B checkpoint
# URLs below (presumably the table `clip.load` resolves model names against --
# TODO confirm).
clip.clip._MODELS = {
"ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
"ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
}
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the ViT-B/32 model together with its image preprocessing callable;
# both are module-level globals used by run_demo.
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
# nlp = spacy.load("en_core_web_sm")
# The spaCy model is loaded through its own package rather than spacy.load();
# `nlp` is the pipeline NER_demo uses to extract named entities.
import en_core_web_sm
nlp = en_core_web_sm.load()
# Gradio Section:
def run_demo(image, text):
    """Explain how CLIP relates ``text`` to ``image``.

    Args:
        image: Input image; the Gradio input is configured with ``type='pil'``.
        text: The caption / query string to ground in the image.

    Returns:
        A 2-tuple ``(overlapped, highlighted_text)`` where ``overlapped`` is
        whatever ``overlay_relevance_map_on_image`` returns for the image
        relevance map, and ``highlighted_text`` is a list of
        ``(token, score)`` pairs with ``token`` a ``str`` and ``score`` a
        ``float``.
    """
    # The model input is built from a square-padded copy of the image;
    # unsqueeze(0) adds the leading batch dimension.
    squared = pad_to_square(image)
    image_batch = preprocess(squared).unsqueeze(0).to(device)
    tokens = clip.tokenize([text]).to(device)

    # Relevance maps for the text and image sides; only the first entry of
    # each is used below because a single text/image pair is passed in.
    R_text, R_image = interpret(model=model, image=image_batch, texts=tokens, device=device)

    heatmap = show_img_heatmap(R_image[0], image_batch, orig_image=squared, device=device, show=False)
    # NOTE(review): the overlay is drawn on the original (unpadded) `image`,
    # while the heatmap was computed with the padded copy -- confirm that
    # overlay_relevance_map_on_image reconciles the two sizes.
    overlapped = overlay_relevance_map_on_image(image, heatmap)

    scores, decoded = show_heatmap_on_text(text, tokens, R_text[0], show=False)
    # Pair each decoded token with the score at the same index.
    highlighted_text = [
        (str(tok), float(scores[pos])) for pos, tok in enumerate(decoded)
    ]
    return overlapped, highlighted_text
# Default demo:
# Components of the "Default" tab: a PIL image plus a free-text box go in,
# the relevance overlay and the per-token highlighted text come out.
# `gr.Image` replaces the deprecated `gr.inputs.Image`, which was also being
# used here for an *output* component.
input_img = gr.Image(type='pil', label="Original Image")
input_txt = "text"
inputs = [input_img, input_txt]
outputs = [gr.Image(type='pil', label="Output Image"), "highlight"]
description = """A demonstration based on the Generic Attention-model Explainability method for Interpreting Bi-Modal
Transformers by Chefer et al. (2021): https://github.com/hila-chefer/Transformer-MM-Explainability.
This demo shows attributions scores on both the image and the text input when presented CLIP with a
<text,image> pair. Attributions are computed as Gradient-weighted Attention Rollout (Chefer et al.,
2021), and can be thought of as an estimate of the effective attention CLIP pays to its input when
computing a multimodal representation."""
# Each example row is [image path, query text], matching the order of `inputs`.
iface = gr.Interface(fn=run_demo,
                     inputs=inputs,
                     outputs=outputs,
                     title="CLIP Grounding Explainability",
                     description=description,
                     examples=[["example_images/London.png", "London Eye"],
                               ["example_images/London.png", "Big Ben"],
                               ["example_images/harrypotter.png", "Harry"],
                               ["example_images/harrypotter.png", "Hermione"],
                               ["example_images/harrypotter.png", "Ron"],
                               ["example_images/Amsterdam.png", "Amsterdam canal"],
                               ["example_images/Amsterdam.png", "Old buildings"],
                               ["example_images/Amsterdam.png", "Pink flowers"],
                               ["example_images/dogs_on_bed.png", "Two dogs"],
                               ["example_images/dogs_on_bed.png", "Book"],
                               ["example_images/dogs_on_bed.png", "Cat"]])
# NER demo:
def add_label_to_img(img, label, add_entity_label=True):
    """Add a white border to ``img`` and draw a caption in its top-left corner.

    Args:
        img: PIL image to decorate.
        label: Caption content; converted with ``str()`` before drawing.
        add_entity_label: When true, the caption is prefixed with
            ``"Entity: "``; otherwise ``label`` is drawn on its own.

    Returns:
        The bordered image produced by ``ImageOps.expand`` with the caption
        drawn onto it.
    """
    # The 45-pixel white border is where the caption is drawn.
    img = ImageOps.expand(img, border=45, fill=(255, 255, 255))
    draw = ImageDraw.Draw(img)
    try:
        font = ImageFont.truetype("arial.ttf", 24)
    except OSError:
        # Arial is often not installed (e.g. on Linux containers) and
        # truetype() raises OSError when the font file cannot be opened.
        # Fall back to Pillow's built-in font rather than failing the whole
        # request; the fallback may render smaller than the 24 pt Arial.
        font = ImageFont.load_default()
    caption = f"Entity: {str(label)}" if add_entity_label else str(label)
    draw.text((0, 0), caption, align="center", fill=(0, 0, 0), font=font)
    return img
def NER_demo(image, text):
    """Run the explainability demo on the full text and on each named entity.

    Named entities are extracted from ``text`` with the module-level spaCy
    pipeline ``nlp``. ``run_demo`` is then called once for the whole text and
    once per entity, and every resulting overlay is captioned.

    Args:
        image: Input image, forwarded unchanged to ``run_demo``.
        text: Sentence to analyse.

    Returns:
        A 2-tuple ``(entities, gallery)``: ``entities`` is a list of
        ``(entity_text, entity_label)`` pairs, and ``gallery`` is a list of
        captioned images whose first item is the full-text explanation,
        followed by one image per entity in extraction order.
    """
    entities = [(span.text, span.label_) for span in nlp(text).ents]

    # The first gallery item explains the complete input text; the token
    # scores returned by run_demo are not needed here.
    full_overlay, _ = run_demo(image, text)
    gallery = [
        add_label_to_img(full_overlay, "Full explanation", add_entity_label=False)
    ]

    # One further explanation per entity, using the entity text as the query.
    for name, kind in entities:
        entity_overlay, _ = run_demo(image, name)
        gallery.append(
            add_label_to_img(entity_overlay, f"{str(name)} ({str(kind)})")
        )

    return entities, gallery
# Components of the "NER" tab: a PIL image plus a free-text box go in, the
# (entity, label) highlights and a gallery of per-entity overlays come out.
# `gr.Image` replaces the deprecated `gr.inputs.Image`.
input_img_NER = gr.Image(type='pil', label="Original Image")
input_txt_NER = "text"
inputs_NER = [input_img_NER, input_txt_NER]
outputs_NER = ["highlight", gr.Gallery(type='pil', label="NER Entity explanations")]
# Example caching is switched off for this tab.
iface_NER = gr.Interface(fn=NER_demo,
                         inputs=inputs_NER,
                         outputs=outputs_NER,
                         title="Named Entity Grounding explainability using CLIP",
                         examples=[["example_images/London.png", "In this image we see Big Ben and the London Eye, on both sides of the river Thames."]],
                         cache_examples=False)
# Combine both interfaces into one tabbed app and start it in debug mode.
demo_tabs = gr.TabbedInterface([iface, iface_NER], ["Default", "NER"])
demo_tabs.launch(debug=True)