# VQA / app.py — Hugging Face Space "c-o-s" (upload commit 14ccce5)
# Visual Question Answering demo: checks an uploaded image for a face,
# then asks a fixed set of attribute questions with ViLT.
import gradio as gr
from transformers import ViltProcessor, ViltForQuestionAnswering
from PIL import Image
import requests
# Initialize the VQA model and its processor.
# ViLT fine-tuned on VQAv2; the processor handles both image and text inputs.
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
def _ask(image, question):
    """Run a single VQA question against *image* and return the top answer string.

    Encodes the (image, question) pair with the module-level ``processor``,
    runs the module-level ``model``, and maps the highest-scoring logit back
    to its label via ``model.config.id2label``.
    """
    inputs = processor(images=image, text=question, return_tensors="pt", padding=True)
    outputs = model(**inputs)
    answer_idx = outputs.logits.argmax(-1).item()
    return model.config.id2label[answer_idx]


def answer_questions(image):
    """Answer a fixed set of person-attribute questions about *image*.

    First verifies that the image contains a face; if the model answers "no",
    returns ``{"error": ...}`` immediately. Otherwise returns a dict mapping
    each question key (hair_style, hair_color, eye_color, person_sex,
    facial_hair, glasses, age, plus face_check) to the model's answer string.

    Parameters:
        image: a PIL image, as provided by the Gradio ``Image(type="pil")`` input.
    """
    questions = {
        "face_check": "Does the image contain a face?",
        "hair_style": "What is the hair style of the person?",
        "hair_color": "What is the hair color of the person?",
        "eye_color": "What is the eye color of the person?",
        "person_sex": "What is the person's sex?",
        "facial_hair": "Describe the facial hair of the person?",
        "glasses": "Does the person wear glasses?",
        "age": "What is the age of the person?"
    }
    answers = {}
    # Gate on the face check: if no face is detected, skip all other questions.
    face_check_answer = _ask(image, questions["face_check"])
    answers["face_check"] = face_check_answer
    if face_check_answer == "no":
        return {"error": "ERROR: Can not detect any person in this image"}
    # Answer the remaining questions (face_check was already handled above).
    for key, question in questions.items():
        if key != "face_check":
            answers[key] = _ask(image, question)
    return answers
# Build the Gradio interface.
# NOTE: the original used gr.inputs.Image / gr.outputs.Textbox, which were
# deprecated in Gradio 3.x and removed in 4.x; the top-level component
# classes below are the supported equivalents.
iface = gr.Interface(
    fn=answer_questions,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(),
)
# Launch the app (blocks and serves the web UI).
iface.launch()