Add example
Browse files
app.py
CHANGED
@@ -2,8 +2,11 @@ import gradio as gr
|
|
2 |
from transformers import ViltProcessor, ViltForNaturalLanguageVisualReasoning
|
3 |
import torch
|
4 |
|
|
|
5 |
torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg', 'image1.jpg')
|
6 |
torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg', 'image2.jpg')
|
|
|
|
|
7 |
|
8 |
processor = ViltProcessor.from_pretrained("nielsr/vilt-b32-finetuned-nlvr2")
|
9 |
model = ViltForNaturalLanguageVisualReasoning.from_pretrained("nielsr/vilt-b32-finetuned-nlvr2")
|
@@ -26,8 +29,9 @@ images = [gr.inputs.Image(type="pil"), gr.inputs.Image(type="pil")]
|
|
26 |
text = gr.inputs.Textbox(lines=2, label="Sentence")
|
27 |
answer = gr.outputs.Textbox(label="Predicted answer")
|
28 |
|
29 |
-
|
30 |
-
|
|
|
31 |
|
32 |
title = "Interactive demo: natural language visual reasoning with ViLT"
|
33 |
description = "Gradio Demo for ViLT (Vision and Language Transformer), fine-tuned on NLVR2. To use it, simply upload a pair of images and type a sentence and click 'submit', or click one of the examples to load them. The model will predict whether the sentence is true or false, based on the 2 images. Read more at the links below."
|
|
|
2 |
from transformers import ViltProcessor, ViltForNaturalLanguageVisualReasoning
|
3 |
import torch
|
4 |
|
5 |
+
# NLVR2 example images
|
6 |
torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg', 'image1.jpg')
|
7 |
torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg', 'image2.jpg')
|
8 |
+
torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/acorns_1.jpg', 'image3.jpg')
|
9 |
+
torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/acorns_6.jpg', 'image4.jpg')
|
10 |
|
11 |
processor = ViltProcessor.from_pretrained("nielsr/vilt-b32-finetuned-nlvr2")
|
12 |
model = ViltForNaturalLanguageVisualReasoning.from_pretrained("nielsr/vilt-b32-finetuned-nlvr2")
|
|
|
29 |
text = gr.inputs.Textbox(lines=2, label="Sentence")
|
30 |
answer = gr.outputs.Textbox(label="Predicted answer")
|
31 |
|
32 |
+
example_sentence_1 = "One image contains twice the number of dogs as the other image, and at least two dogs in total are standing."
|
33 |
+
example_sentence_2 = "One image shows exactly two brown acorns in back-to-back caps on green foliage."
|
34 |
+
examples = [["image1.jpg", "image2.jpg", example_sentence_1], ["image3.jpg", "image4.jpg", example_sentence_2]]
|
35 |
|
36 |
title = "Interactive demo: natural language visual reasoning with ViLT"
|
37 |
description = "Gradio Demo for ViLT (Vision and Language Transformer), fine-tuned on NLVR2. To use it, simply upload a pair of images and type a sentence and click 'submit', or click one of the examples to load them. The model will predict whether the sentence is true or false, based on the 2 images. Read more at the links below."
|