|
import functools

import gradio as gr
import torch
from transformers import ViltProcessor, ViltForQuestionAnswering
|
|
|
|
|
_VILT_CHECKPOINT = "dandelin/vilt-b32-finetuned-vqa"


@functools.lru_cache(maxsize=1)
def _load_vilt():
    """Load the ViLT VQA processor and model once and reuse them afterwards."""
    processor = ViltProcessor.from_pretrained(_VILT_CHECKPOINT)
    model = ViltForQuestionAnswering.from_pretrained(_VILT_CHECKPOINT)
    return processor, model


def getResult(query, image):
    """Answer a natural-language question about an image with ViLT.

    Args:
        query: The question text.
        image: The image the question refers to, in a form accepted by
            ``ViltProcessor`` (the interface in this file passes a PIL image).

    Returns:
        The label from ``model.config.id2label`` with the highest logit.
    """
    # Loading the checkpoint is expensive; the cached helper ensures it is
    # done once per process instead of once per request.
    processor, model = _load_vilt()

    encoding = processor(image, query, return_tensors="pt")

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**encoding)

    idx = outputs.logits.argmax(-1).item()
    answer = model.config.id2label[idx]
    print("Predicted answer:", answer)
    return answer
|
|
|
|
|
# Web UI: a free-text question and an uploaded image (delivered to the
# callback as a PIL image) go in; the predicted answer comes back as text.
iface = gr.Interface(
    fn=getResult,
    inputs=["text", gr.Image(type="pil")],
    outputs="text",
)

# Bind to all network interfaces and request a public share link.
iface.launch(server_name="0.0.0.0", share=True)
|
|