Irpan committed
Commit 64f507f · 1 Parent(s): e98f157
Files changed (2)
  1. app.py +38 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,38 @@
+ import gradio as gr
+ from transformers import ViltProcessor, ViltForQuestionAnswering
+ import torch
+
+ torch.hub.download_url_to_file('http://images.cocodataset.org/val2017/000000039769.jpg', 'cats.jpg')
+
+ processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
+ model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
+
+ def answer_question(image, text):
+     encoding = processor(image, text, return_tensors="pt")
+
+     # forward pass
+     with torch.no_grad():
+         outputs = model(**encoding)
+
+     logits = outputs.logits
+     idx = logits.argmax(-1).item()
+     predicted_answer = model.config.id2label[idx]
+
+     return predicted_answer
+
+ image = gr.inputs.Image(type="pil")
+ question = gr.inputs.Textbox(label="Question")
+ answer = gr.outputs.Textbox(label="Predicted answer")
+ examples = [["cats.jpg", "How many cats are there?"]]
+
+ title = "Cross-lingual VQA"
+ description = "ViLT (Vision and Language Transformer), fine-tuned on VQAv2"
+
+ interface = gr.Interface(fn=answer_question,
+                          inputs=[image, question],
+                          outputs=answer,
+                          examples=examples,
+                          title=title,
+                          description=description,
+                          enable_queue=True)
+ interface.launch(debug=True)
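For reference, a minimal standalone sketch of the same VQA inference that app.py performs, assuming cats.jpg has already been downloaded by the line above. It reloads the checkpoint rather than importing app.py, since importing that module would also trigger interface.launch():

```python
# Standalone sketch: one VQA query with the same checkpoint used in app.py.
import torch
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

image = Image.open("cats.jpg")  # example image downloaded by app.py
encoding = processor(image, "How many cats are there?", return_tensors="pt")

with torch.no_grad():
    logits = model(**encoding).logits

# Map the highest-scoring logit back to an answer string
print(model.config.id2label[logits.argmax(-1).item()])
```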
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ gradio
+ torch
+ git+https://github.com/huggingface/transformers.git