MinxuanQin committed on
Commit
de05d04
1 Parent(s): f1c6918

add application file

Browse files
Files changed (1) hide show
  1. app.py +28 -0
app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit demo: visual question answering with a fine-tuned ViLT model.

The user uploads a JPEG image and types a question; the script runs the
ViLT VQA model and shows the predicted answer.
"""
from PIL import Image
from transformers import ViltConfig, ViltProcessor, ViltForQuestionAnswering

import streamlit as st

st.title("Live demo of multimodal vqa")

config = ViltConfig.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("Minqin/carets_vqa_finetuned")

uploaded_file = st.file_uploader("Please upload one image (jpg)", type="jpg")

question = st.text_input("Type here the question")
if uploaded_file is not None:
    # FIX: the original read the upload into a numpy array with dtype
    # ``np.unit8`` (a typo — ``np.uint8`` is the real name, so this raised
    # AttributeError) and then called ``Image.fromarray`` on it. That array
    # held the *compressed JPEG byte stream*, not decoded pixels, so
    # ``fromarray`` could never yield a valid image. ``Image.open`` decodes
    # the file-like upload directly; ``convert("RGB")`` normalizes mode.
    img = Image.open(uploaded_file).convert("RGB")

    # FIX: the processor was previously fed the raw byte array
    # (``images=file_bytes``); ViltProcessor expects a PIL image.
    encoding = processor(images=img, text=question, return_tensors="pt")

    outputs = model(**encoding)
    logits = outputs.logits
    # Highest-scoring answer class, mapped back to its label string.
    idx = logits.argmax(-1).item()
    pred = model.config.id2label[idx]

    st.text(f"Answer: {pred}")