import gradio as gr
from transformers import ViltProcessor, ViltForQuestionAnswering
from PIL import Image
import requests

# Initialize the model and processor once at module load (the weights are
# large, so this must not happen per-request).
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")


def _ask(image, question):
    """Run a single VQA query against the model.

    Args:
        image: PIL image to query.
        question: Natural-language question string.

    Returns:
        The decoded answer label (string) with the highest logit.
    """
    inputs = processor(images=image, text=question, return_tensors="pt", padding=True)
    outputs = model(**inputs)
    answer_idx = outputs.logits.argmax(-1).item()
    return model.config.id2label[answer_idx]


def answer_questions(image):
    """Answer a fixed set of face-attribute questions about *image*.

    Args:
        image: PIL image supplied by the Gradio interface.

    Returns:
        A dict mapping attribute keys (hair_style, eye_color, ...) to the
        model's answer strings, or ``{"error": ...}`` when no face is
        detected in the image.
    """
    questions = {
        "face_check": "Does the image contain a face?",
        "hair_style": "What is the hair style of the person?",
        "hair_color": "What is the hair color of the person?",
        "eye_color": "What is the eye color of the person?",
        "person_sex": "What is the person's sex?",
        "facial_hair": "Describe the facial hair of the person?",
        "glasses": "Does the person wear glasses?",
        "age": "What is the age of the person?",
    }

    # Guard: verify the image contains a face before running the remaining
    # queries; the attribute questions are meaningless without one.
    face_check_answer = _ask(image, questions["face_check"])
    answers = {"face_check": face_check_answer}
    if face_check_answer == "no":
        return {"error": "ERROR: Can not detect any person in this image"}

    # Answer the remaining questions (the face check was already handled).
    for key, question in questions.items():
        if key != "face_check":
            answers[key] = _ask(image, question)

    return answers


# Build the Gradio interface. NOTE: the gr.inputs / gr.outputs namespaces
# were removed in Gradio 4 — components are constructed directly.
iface = gr.Interface(
    fn=answer_questions,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(),
)

# Launch the app only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()