Martijn van Beers committed
Commit dc15657
Parent: 8c7ba46

Update to blocks for layer selection


Change from using gradio.Interface to gradio.Blocks so that I understand
how to add an event handler that changes the range of the layer selection
when the user chooses a different model.
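For reference, the pattern this change relies on is: build the UI inside a gr.Blocks context, then wire the model Dropdown's change event to a callback that returns gr.update(...) for the layer Slider. Below is a minimal, self-contained sketch of that pattern (the component names and the standalone launch() call are illustrative, not taken from app.py):

import gradio as gr

# Sketch of the Dropdown -> Slider wiring used in this commit.
# ViT-L/14 has 24 vision transformer layers and the ViT-B models have 12,
# so the last selectable layer index is 23 or 11 respectively.
def update_slider(model_name):
    if model_name == "ViT-L/14":
        return gr.update(maximum=23, value=23)
    return gr.update(maximum=11, value=11)

with gr.Blocks() as demo:
    model = gr.Dropdown(choices=["ViT-B/16", "ViT-B/32", "ViT-L/14"],
                        value="ViT-B/32", label="CLIP Model")
    layer = gr.Slider(minimum=0, maximum=11, step=1, value=11,
                      label="Vision start layer")
    # Re-range the slider whenever the user picks a different model.
    model.change(update_slider, inputs=model, outputs=layer)

demo.launch()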

Files changed (2)
  1. CLIP_explainability/utils.py +2 -2
  2. app.py +66 -56
CLIP_explainability/utils.py CHANGED
@@ -13,13 +13,13 @@ _tokenizer = _Tokenizer()
 
 #@title Control context expansion (number of attention layers to consider)
 #@title Number of layers for image Transformer
-start_layer = 11#@param {type:"number"}
+#start_layer = 11#@param {type:"number"}
 
 #@title Number of layers for text Transformer
 start_layer_text = 11#@param {type:"number"}
 
 
-def interpret(image, texts, model, device):
+def interpret(image, texts, model, device, start_layer):
     batch_size = texts.shape[0]
     images = image.repeat(batch_size, 1, 1, 1)
     logits_per_image, logits_per_text = model(images, texts)
app.py CHANGED
@@ -53,21 +53,29 @@ import en_core_web_sm
 nlp = en_core_web_sm.load()
 
 # Gradio Section:
+def update_slider(model):
+    if model == "ViT-L/14":
+        return gr.update(maximum=23, value=23)
+    else:
+        return gr.update(maximum=11, value=11)
+
 def run_demo(*args):
-    if len(args) == 3:
-        image, text, model_name = args
+    if len(args) == 4:
+        image, text, model_name, vision_layer = args
     elif len(args) == 2:
         image, text = args
         model_name = "ViT-B/32"
+        vision_layer = 11
     else:
         raise ValueError("Unexpected number of parameters")
 
+    vision_layer = int(vision_layer)
     model, preprocess = clip.load(model_name, device=device, jit=False)
     orig_image = pad_to_square(image)
     img = preprocess(orig_image).unsqueeze(0).to(device)
     text_input = clip.tokenize([text]).to(device)
 
-    R_text, R_image = interpret(model=model, image=img, texts=text_input, device=device)
+    R_text, R_image = interpret(model=model, image=img, texts=text_input, device=device, start_layer=vision_layer)
 
     image_relevance = show_img_heatmap(R_image[0], img, orig_image=orig_image, device=device)
     overlapped = overlay_relevance_map_on_image(image, image_relevance)
@@ -83,18 +91,6 @@ def run_demo(*args):
 
 # Default demo:
 
-default_inputs = [
-    gr.components.Image(type='pil', label="Original Image"),
-    gr.components.Textbox(label="Image description"),
-    gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-B/32"),
-]
-
-default_outputs = [
-    gr.components.Image(type='pil', label="Output Image"),
-    gr.components.HighlightedText(label="Text importance"),
-]
-
-
 description = """This demo is a copy of the demo CLIPGroundingExlainability built by Paul Hilders, Danilo de Goede and Piyush Bagad, as part of the course Interpretability and Explainability in AI (MSc AI, UvA, June 2022).
 <br> <br>
 This demo shows attributions scores on both the image and the text input when presenting CLIP with a
@@ -104,23 +100,37 @@ description = """This demo is a copy of the demo CLIPGroundingExlainability buil
 methods such as the one from this demo can only give an estimate of the real underlying behavior
 of the model."""
 
-iface = gr.Interface(fn=run_demo,
-                     inputs=default_inputs,
-                     outputs=default_outputs,
-                     title="CLIP Grounding Explainability",
-                     description=description,
-                     cache_examples=False,
-                     examples=[["example_images/London.png", "London Eye"],
-                               ["example_images/London.png", "Big Ben"],
-                               ["example_images/harrypotter.png", "Harry"],
-                               ["example_images/harrypotter.png", "Hermione"],
-                               ["example_images/harrypotter.png", "Ron"],
-                               ["example_images/Amsterdam.png", "Amsterdam canal"],
-                               ["example_images/Amsterdam.png", "Old buildings"],
-                               ["example_images/Amsterdam.png", "Pink flowers"],
-                               ["example_images/dogs_on_bed.png", "Two dogs"],
-                               ["example_images/dogs_on_bed.png", "Book"],
-                               ["example_images/dogs_on_bed.png", "Cat"]])
+with gr.Blocks(title="CLIP Grounding Explainability") as iface_default:
+    gr.Markdown(description)
+    with gr.Row():
+        with gr.Column() as inputs:
+            orig = gr.components.Image(type='pil', label="Original Image")
+            description = gr.components.Textbox(label="Image description")
+            default_model = gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-B/32")
+            default_layer = gr.Slider(label="Vision start layer", minimum=0, maximum=11, step=1, value=11)
+            submit = gr.Button("Submit")
+        with gr.Column() as outputs:
+            image = gr.components.Image(type='pil', label="Output Image")
+            text = gr.components.HighlightedText(label="Text importance")
+    gr.Examples(
+        examples=[
+            ["example_images/London.png", "London Eye"],
+            ["example_images/London.png", "Big Ben"],
+            ["example_images/harrypotter.png", "Harry"],
+            ["example_images/harrypotter.png", "Hermione"],
+            ["example_images/harrypotter.png", "Ron"],
+            ["example_images/Amsterdam.png", "Amsterdam canal"],
+            ["example_images/Amsterdam.png", "Old buildings"],
+            ["example_images/Amsterdam.png", "Pink flowers"],
+            ["example_images/dogs_on_bed.png", "Two dogs"],
+            ["example_images/dogs_on_bed.png", "Book"],
+            ["example_images/dogs_on_bed.png", "Cat"]
+        ],
+        inputs=[orig, description]
+    )
+    default_model.change(update_slider, inputs=default_model, outputs=default_layer)
+    submit.click(run_demo, inputs=[orig, description, default_model, default_layer], outputs=[image, text])
+
 
 # NER demo:
 def add_label_to_img(img, label, add_entity_label=True):
@@ -170,36 +180,36 @@ def NER_demo(image, text, model_name):
 
     return labeled_text, gallery_images
 
-inputs_NER = [
-    gr.Image(type='pil', label="Original Image"),
-    gr.components.Textbox(label="Descriptive text"),
-    gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-L/14"),
-]
-
-#colours = highlighter._style["color_map"]
-outputs_NER = [
-    gr.components.HighlightedText(show_legend=True, color_map=colour_map, label="Noun chunks"),
-    gr.components.Gallery(type='pil', label="NER Entity explanations")
-]
 
 description_NER = """Automatically generated CLIP grounding explanations for
-named entities, retrieved from the spacy NER model. <span style="color:red">Warning:</span> Note
+noun chunks, retrieved with the spaCy model. <span style="color:red">Warning:</span> Note
 that attribution methods such as the one from this demo can only give an estimate of the real
 underlying behavior of the model."""
 
-iface_NER = gr.Interface(fn=NER_demo,
-                         inputs=inputs_NER,
-                         outputs=outputs_NER,
-                         title="Named Entity Grounding explainability using CLIP",
-                         description=description_NER,
-                         examples=[
-                             ["example_images/London.png", "In this image we see Big Ben and the London Eye, on both sides of the river Thames."],
-                             ["example_images/harrypotter.png", "Hermione, Harry and Ron in their school uniform"],
-                         ],
-                         cache_examples=False)
-
-demo_tabs = gr.TabbedInterface([iface, iface_NER], ["Default", "NER"])
+with gr.Blocks(title="Entity Grounding explainability using CLIP") as iface_NER:
+    gr.Markdown(description_NER)
+    with gr.Row():
+        with gr.Column() as inputs:
+            img = gr.Image(type='pil', label="Original Image")
+            text = gr.components.Textbox(label="Descriptive text")
+            ner_model = gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-B/32")
+            ner_layer = gr.Slider(label="Vision start layer", minimum=0, maximum=11, step=1, value=11)
+            submit = gr.Button("Submit")
+        with gr.Column() as outputs:
+            text = gr.components.HighlightedText(show_legend=True, color_map=colour_map, label="Noun chunks")
+            gallery = gr.components.Gallery(type='pil', label="NER Entity explanations")
+
+    gr.Examples(
+        examples=[
+            ["example_images/London.png", "In this image we see Big Ben and the London Eye, on both sides of the river Thames."],
+            ["example_images/harrypotter.png", "Hermione, Harry and Ron in their school uniform"],
+        ],
+        inputs=[img, text],
+    )
+    ner_model.change(update_slider, inputs=ner_model, outputs=ner_layer)
+    submit.click(run_demo, inputs=[img, text, ner_model, ner_layer], outputs=[text, gallery])
 
+demo_tabs = gr.TabbedInterface([iface_default, iface_NER], ["Default", "Entities"])
 with demo_tabs:
     gr.Markdown("""
     ### Acknowledgements