Martijn van Beers committed on
Commit 1fd86da
Parent: 0c9f8df

Add model selection


Add an extra input so the user can select which CLIP model to use.

Files changed (1)
  1. app.py +18 -15
app.py CHANGED
@@ -47,14 +47,15 @@ colour_map = {
 }
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
-model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
 
 # nlp = spacy.load("en_core_web_sm")
 import en_core_web_sm
 nlp = en_core_web_sm.load()
 
 # Gradio Section:
-def run_demo(image, text):
+def run_demo(model_name, image, text):
+
+    model, preprocess = clip.load(model_name, device=device, jit=False)
     orig_image = pad_to_square(image)
     img = preprocess(orig_image).unsqueeze(0).to(device)
     text_input = clip.tokenize([text]).to(device)
@@ -76,6 +77,7 @@ def run_demo(image, text):
 # Default demo:
 
 default_inputs = [
+    gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-L/14"),
     gr.components.Image(type='pil', label="Original Image"),
     gr.components.Textbox(label="Image description"),
 ]
@@ -100,17 +102,17 @@ iface = gr.Interface(fn=run_demo,
                      outputs=default_outputs,
                      title="CLIP Grounding Explainability",
                      description=description,
-                     examples=[["example_images/London.png", "London Eye"],
-                               ["example_images/London.png", "Big Ben"],
-                               ["example_images/harrypotter.png", "Harry"],
-                               ["example_images/harrypotter.png", "Hermione"],
-                               ["example_images/harrypotter.png", "Ron"],
-                               ["example_images/Amsterdam.png", "Amsterdam canal"],
-                               ["example_images/Amsterdam.png", "Old buildings"],
-                               ["example_images/Amsterdam.png", "Pink flowers"],
-                               ["example_images/dogs_on_bed.png", "Two dogs"],
-                               ["example_images/dogs_on_bed.png", "Book"],
-                               ["example_images/dogs_on_bed.png", "Cat"]])
+                     examples=[[None, "example_images/London.png", "London Eye"],
+                               [None, "example_images/London.png", "Big Ben"],
+                               [None, "example_images/harrypotter.png", "Harry"],
+                               [None, "example_images/harrypotter.png", "Hermione"],
+                               [None, "example_images/harrypotter.png", "Ron"],
+                               [None, "example_images/Amsterdam.png", "Amsterdam canal"],
+                               [None, "example_images/Amsterdam.png", "Old buildings"],
+                               [None, "example_images/Amsterdam.png", "Pink flowers"],
+                               [None, "example_images/dogs_on_bed.png", "Two dogs"],
+                               [None, "example_images/dogs_on_bed.png", "Book"],
+                               [None, "example_images/dogs_on_bed.png", "Cat"]])
 
 # NER demo:
 def add_label_to_img(img, label, add_entity_label=True):
@@ -160,6 +162,7 @@ def NER_demo(image, text):
     return labeled_text, gallery_images
 
 inputs_NER = [
+    gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-L/14"),
     gr.Image(type='pil', label="Original Image"),
     gr.components.Textbox(label="Descriptive text"),
 ]
@@ -181,8 +184,8 @@ iface_NER = gr.Interface(fn=NER_demo,
                         title="Named Entity Grounding explainability using CLIP",
                         description=description_NER,
                         examples=[
-                            ["example_images/London.png", "In this image we see Big Ben and the London Eye, on both sides of the river Thames."],
-                            ["example_images/harrypotter.png", "Hermione, Harry and Ron in their school uniform"],
+                            [None, "example_images/London.png", "In this image we see Big Ben and the London Eye, on both sides of the river Thames."],
+                            [None, "example_images/harrypotter.png", "Hermione, Harry and Ron in their school uniform"],
                         ],
                         cache_examples=False)
 
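
For orientation, a minimal sketch of the pattern this commit introduces: the dropdown's selection reaches the handler as its first positional argument and determines which checkpoint clip.load fetches. The load_clip helper and functools.lru_cache below are illustrative assumptions, not part of the commit, which calls clip.load directly inside run_demo.

# Minimal sketch of the new wiring, not the Space's actual code. The cached
# load_clip helper is an assumption added here for illustration only.
import functools

import clip
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

@functools.lru_cache(maxsize=None)
def load_clip(model_name):
    # clip.load returns (model, preprocess) for the requested checkpoint
    return clip.load(model_name, device=device, jit=False)

def run_demo(model_name, image, text):
    # The value picked in the "CLIP Model" dropdown arrives as the first
    # positional argument because the dropdown is the first input component.
    model, preprocess = load_clip(model_name)
    ...  # padding, preprocessing and relevance computation as in app.py

Loading inside the handler keeps the interface code simple, but it reloads the weights on every request; a cached loader along the lines sketched above is one way to avoid that, if memory allows.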