nielsr HF staff committed on
Commit
61ac553
1 Parent(s): e67d9c6

Add example

Browse files
Files changed (1) hide show
  1. app.py +6 -2
app.py CHANGED
@@ -2,8 +2,11 @@ import gradio as gr
2
  from transformers import ViltProcessor, ViltForNaturalLanguageVisualReasoning
3
  import torch
4
 
 
5
  torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg', 'image1.jpg')
6
  torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg', 'image2.jpg')
 
 
7
 
8
  processor = ViltProcessor.from_pretrained("nielsr/vilt-b32-finetuned-nlvr2")
9
  model = ViltForNaturalLanguageVisualReasoning.from_pretrained("nielsr/vilt-b32-finetuned-nlvr2")
@@ -26,8 +29,9 @@ images = [gr.inputs.Image(type="pil"), gr.inputs.Image(type="pil")]
26
  text = gr.inputs.Textbox(lines=2, label="Sentence")
27
  answer = gr.outputs.Textbox(label="Predicted answer")
28
 
29
- example_sentence = "The left image contains twice the number of dogs as the right image, and at least two dogs in total are standing."
30
- examples = [["image1.jpg", "image2.jpg", example_sentence]]
 
31
 
32
  title = "Interactive demo: natural language visual reasoning with ViLT"
33
  description = "Gradio Demo for ViLT (Vision and Language Transformer), fine-tuned on NLVR2. To use it, simply upload a pair of images and type a sentence and click 'submit', or click one of the examples to load them. The model will predict whether the sentence is true or false, based on the 2 images. Read more at the links below."
 
2
  from transformers import ViltProcessor, ViltForNaturalLanguageVisualReasoning
3
  import torch
4
 
5
+ # NLVR2 example images
6
  torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg', 'image1.jpg')
7
  torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg', 'image2.jpg')
8
+ torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/acorns_1.jpg', 'image3.jpg')
9
+ torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/acorns_6.jpg', 'image4.jpg')
10
 
11
  processor = ViltProcessor.from_pretrained("nielsr/vilt-b32-finetuned-nlvr2")
12
  model = ViltForNaturalLanguageVisualReasoning.from_pretrained("nielsr/vilt-b32-finetuned-nlvr2")
 
29
  text = gr.inputs.Textbox(lines=2, label="Sentence")
30
  answer = gr.outputs.Textbox(label="Predicted answer")
31
 
32
+ example_sentence_1 = "One image contains twice the number of dogs as the other image, and at least two dogs in total are standing."
33
+ example_sentence_2 = "One image shows exactly two brown acorns in back-to-back caps on green foliage."
34
+ examples = [["image1.jpg", "image2.jpg", example_sentence_1], ["image1.jpg", "image2.jpg", example_sentence_2]]
35
 
36
  title = "Interactive demo: natural language visual reasoning with ViLT"
37
  description = "Gradio Demo for ViLT (Vision and Language Transformer), fine-tuned on NLVR2. To use it, simply upload a pair of images and type a sentence and click 'submit', or click one of the examples to load them. The model will predict whether the sentence is true or false, based on the 2 images. Read more at the links below."