import gradio as gr
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
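
# Expected dependencies for this app (a sketch; exact version pins may differ):
#   pip install torch transformers gradio pillow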

# Initialize the model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
# Load the model weights in float16 so they match the half-precision
# inputs prepared in predict_answer below (a float32 model would raise
# a dtype-mismatch error on the fp16 pixel values)
model = BlipForQuestionAnswering.from_pretrained("ManishThota/InstructBlip-VQA").to("cuda", torch.float16)
# model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

def predict_answer(image, question):
    # Convert the PIL image to RGB if it is not already
    image = image.convert("RGB")
    # Prepare inputs: move tensors to the GPU and cast the image
    # features to float16 to match the model weights
    encoding = processor(image, question, return_tensors="pt").to("cuda", torch.float16)
    out = model.generate(**encoding)
    generated_text = processor.decode(out[0], skip_special_tokens=True)
    return generated_text

def gradio_predict(image, question):
    answer = predict_answer(image, question)
    return answer
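
# Optional smoke test for the prediction pipeline (a sketch, kept
# commented out so the Space does not run inference at startup; the
# solid-color test image means no local file is needed):
# test_image = Image.new("RGB", (224, 224), color="red")
# print(predict_answer(test_image, "What color is the image?"))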

# Define the Gradio interface
iface = gr.Interface(
    fn=gradio_predict,
    inputs=[
        gr.Image(type="pil", label="Upload or Drag an Image"),
        gr.Textbox(label="Question", placeholder="e.g. What is this?", scale=4),
    ],
    outputs=gr.TextArea(label="Answer"),
    title="Instruct Visual Question Answering",
    description="Tiny 1B parameter Vision Language Model.",
)

# Launch the app with request queuing enabled
iface.queue().launch(debug=True)