Add example
Browse files
app.py
CHANGED
@@ -2,8 +2,11 @@ import gradio as gr
|
|
2 |
from transformers import ViltProcessor, ViltForNaturalLanguageVisualReasoning
|
3 |
import torch
|
4 |
|
|
|
5 |
torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg', 'image1.jpg')
|
6 |
torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg', 'image2.jpg')
|
|
|
|
|
7 |
|
8 |
processor = ViltProcessor.from_pretrained("nielsr/vilt-b32-finetuned-nlvr2")
|
9 |
model = ViltForNaturalLanguageVisualReasoning.from_pretrained("nielsr/vilt-b32-finetuned-nlvr2")
|
@@ -26,8 +29,9 @@ images = [gr.inputs.Image(type="pil"), gr.inputs.Image(type="pil")]
|
|
26 |
text = gr.inputs.Textbox(lines=2, label="Sentence")
|
27 |
answer = gr.outputs.Textbox(label="Predicted answer")
|
28 |
|
29 |
-
|
30 |
-
|
|
|
31 |
|
32 |
title = "Interactive demo: natural language visual reasoning with ViLT"
|
33 |
description = "Gradio Demo for ViLT (Vision and Language Transformer), fine-tuned on NLVR2. To use it, simply upload a pair of images and type a sentence and click 'submit', or click one of the examples to load them. The model will predict whether the sentence is true or false, based on the 2 images. Read more at the links below."
|
|
|
2 |
from transformers import ViltProcessor, ViltForNaturalLanguageVisualReasoning
|
3 |
import torch
|
4 |
|
5 |
+
# NLVR2 example images
|
6 |
torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg', 'image1.jpg')
|
7 |
torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg', 'image2.jpg')
|
8 |
+
torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/acorns_1.jpg', 'image3.jpg')
|
9 |
+
torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/acorns_6.jpg', 'image4.jpg')
|
10 |
|
11 |
processor = ViltProcessor.from_pretrained("nielsr/vilt-b32-finetuned-nlvr2")
|
12 |
model = ViltForNaturalLanguageVisualReasoning.from_pretrained("nielsr/vilt-b32-finetuned-nlvr2")
|
|
|
29 |
text = gr.inputs.Textbox(lines=2, label="Sentence")
|
30 |
answer = gr.outputs.Textbox(label="Predicted answer")
|
31 |
|
32 |
+
example_sentence_1 = "One image contains twice the number of dogs as the other image, and at least two dogs in total are standing."
|
33 |
+
example_sentence_2 = "One image shows exactly two brown acorns in back-to-back caps on green foliage."
|
34 |
+
examples = [["image1.jpg", "image2.jpg", example_sentence_1], ["image3.jpg", "image4.jpg", example_sentence_2]]
|
35 |
|
36 |
title = "Interactive demo: natural language visual reasoning with ViLT"
|
37 |
description = "Gradio Demo for ViLT (Vision and Language Transformer), fine-tuned on NLVR2. To use it, simply upload a pair of images and type a sentence and click 'submit', or click one of the examples to load them. The model will predict whether the sentence is true or false, based on the 2 images. Read more at the links below."
|