vqa-vitgpt / app.py
from transformers import ViltProcessor, ViltForQuestionAnswering
import torch
from PIL import Image
import gradio as gr

# Load the VQA-finetuned ViLT model and its processor
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")


def answer_question(image, text):
    # Convert the uploaded NumPy array to a PIL image
    image = Image.fromarray(image.astype("uint8"), "RGB")

    # Preprocess the image and tokenize the question
    encoding = processor(images=image, text=text, return_tensors="pt", padding=True)

    # Forward pass (no gradients needed at inference time)
    with torch.no_grad():
        outputs = model(**encoding)

    # Pick the highest-scoring answer from the classification head
    logits = outputs.logits
    idx = logits.argmax(-1).item()
    predicted_answer = model.config.id2label[idx]
    return predicted_answer
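

# Illustrative usage (not part of the original app): answer_question can also be
# called directly with a NumPy image array, e.g.
#
#   import numpy as np
#   sample = np.array(Image.open("example.jpg").convert("RGB"))  # "example.jpg" is a hypothetical local file
#   print(answer_question(sample, "What color is the car?"))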
# Define Gradio inputs and outputs
image = gr.Image(type="numpy", label="Upload Image")
question = gr.Textbox(lines=2, label="Question")
answer = gr.Textbox(label="Predicted Answer")

# Create the Gradio interface and launch the app
gr.Interface(
    fn=answer_question,
    inputs=[image, question],
    outputs=answer,
    title="Image-Based Visual Question Answering",
    description=(
        "A demo of ViLT (Vision-and-Language Transformer) fine-tuned on VQAv2, "
        "served with Gradio. Upload an image, type a question about it, and press "
        "Submit to get the predicted answer."
    ),
).launch()
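
# To try the demo locally (assuming torch, transformers, Pillow and gradio are
# installed), run `python app.py`; Gradio prints the local URL it is serving on
# (typically http://127.0.0.1:7860).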