sathvikparasa20 committed on
Commit 0d49808
1 Parent(s): 73ce49a

Create app.py

Files changed (1):
  1. app.py +40 -0
app.py ADDED
@@ -0,0 +1,40 @@
+ from transformers import ViltProcessor, ViltForQuestionAnswering
+ import torch
+ from PIL import Image
+ import gradio as gr
+
+ # Load the model and processor
+ processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
+ model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
+
+ def answer_question(image, text):
+     # Convert the uploaded image (NumPy array from Gradio) to PIL format
+     image = Image.fromarray(image.astype('uint8'), 'RGB')
+
+     # Process the image and question text into model inputs
+     encoding = processor(images=image, text=text, return_tensors="pt", padding=True)
+
+     # Forward pass
+     with torch.no_grad():
+         outputs = model(**encoding)
+
+     # Pick the highest-scoring answer from the model's label set
+     logits = outputs.logits
+     idx = logits.argmax(-1).item()
+     predicted_answer = model.config.id2label[idx]
+
+     # Return the predicted answer
+     return predicted_answer
+
+ # Define Gradio inputs and outputs
+ image = gr.Image(type="numpy", label="Upload Image")
+ question = gr.Textbox(lines=2, label="Question")
+ answer = gr.Textbox(label="Predicted Answer")
+
+ # Create Gradio Interface
+ gr.Interface(
+     fn=answer_question,
+     inputs=[image, question],
+     outputs=answer,
+     title="Image-Based Visual Question Answering",
+     description="A demonstration of ViLT (Vision-and-Language Transformer) fine-tuned on VQAv2 for answering questions about images, served with Gradio. Upload an image, type your question, and press Submit to get a predicted answer."
+ ).launch()
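
For a quick check of the prediction logic outside the Gradio UI, answer_question can be called directly with a NumPy image array, mirroring what gr.Image(type="numpy") passes in. The snippet below is a minimal sketch, not part of this commit; "example.jpg" and the question string are hypothetical placeholders.

# Minimal sketch (not part of the commit) for calling answer_question directly.
# Assumes app.py has already been run in the same session so the model,
# processor, and answer_question are defined; "example.jpg" is a hypothetical local file.
import numpy as np
from PIL import Image

img = np.array(Image.open("example.jpg").convert("RGB"))  # HxWx3 uint8 array, like Gradio's numpy input
print(answer_question(img, "What is in the picture?"))     # prints the top predicted answer label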