Spaces:

KasKniesmeijer
/

FAAM-demo

Running

App Files Files Community

FAAM-demo / app.py

KasKniesmeijer

updated app.py

cd7c5fe 2 months ago

raw

history blame

2.48 kB

	import torch
	from PIL import Image
	from transformers import AutoProcessor, AutoModelForVision2Seq
	from transformers.image_utils import load_image
	import numpy as np
	import gradio as gr

	# Pick the compute device: CUDA when torch reports it as available, else CPU.
	DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

	# Hub identifier shared by the processor and the model checkpoint.
	MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct"

	# Load the processor and model once, at import time, and move the model to
	# the selected device.
	try:
	    processor = AutoProcessor.from_pretrained(MODEL_ID)
	    model = AutoModelForVision2Seq.from_pretrained(
	        MODEL_ID,
	        torch_dtype=torch.bfloat16,
	        # Flash-attention is only requested on CUDA; everything else uses
	        # the eager implementation.
	        _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
	    ).to(DEVICE)
	except Exception as e:
	    # Nothing below can work without the model, so stop with exit status 1.
	    # SystemExit is raised directly because the `exit` builtin is injected by
	    # the `site` module and is not guaranteed to exist in every interpreter
	    # setup; a string argument is printed to stderr before exiting.
	    raise SystemExit(f"Error loading model or processor: {e}") from e


	# Define the function to answer questions
	def answer_question(image, question):
	    """Answer a free-text question about an image using the loaded model.

	    Args:
	        image: The uploaded picture. A NumPy array is converted to a PIL
	            image; any other non-``None`` value is passed on unchanged.
	            ``None`` means nothing was uploaded.
	        question: The question text. ``None``, empty and whitespace-only
	            values are rejected.

	    Returns:
	        The decoded model answer, or a string starting with ``"Error:"``
	        that names the step which failed. Failures while converting the
	        image, preparing the inputs or generating are reported this way
	        instead of being raised.
	    """
	    # Check if the image is provided
	    if image is None:
	        return "Error: Please upload an image."

	    # Convert NumPy array to PIL Image
	    try:
	        if isinstance(image, np.ndarray):
	            image = Image.fromarray(image)
	    except Exception as e:
	        return f"Error: Unable to process the image. {e}"

	    # Ensure question is provided; a missing (None) question is treated the
	    # same as an empty one instead of crashing on .strip().
	    if not question or not question.strip():
	        return "Error: Please provide a question."

	    # Create input message for the model
	    messages = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "image", "image": image},
	                {"type": "text", "text": question},
	            ],
	        },
	    ]

	    # Apply chat template and prepare inputs
	    try:
	        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
	        inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
	    except Exception as e:
	        return f"Error: Failed to prepare inputs. {e}"

	    # Generate answer
	    try:
	        outputs = model.generate(**inputs, max_new_tokens=400)
	        # The generated sequence starts with the prompt tokens; skip them so
	        # the reply contains only the model's answer, not the echoed question.
	        prompt_length = inputs["input_ids"].shape[-1]
	        answer = processor.decode(
	            outputs[0][prompt_length:], skip_special_tokens=True
	        )
	        return answer.strip()
	    except Exception as e:
	        return f"Error: Failed to generate answer. {e}"


	# Build the Gradio UI: an image upload and a question box feed
	# answer_question, and the returned string is shown as text.
	iface = gr.Interface(
	    fn=answer_question,
	    inputs=[
	        # type="numpy" makes Gradio pass the upload to the callback as an array.
	        gr.Image(type="numpy"),
	        gr.Textbox(lines=2, placeholder="Enter your question here..."),
	    ],
	    outputs="text",
	    # Plain "|" separators: a backslash before "|" is not a valid Python
	    # escape, so it would be displayed literally and trigger an
	    # invalid-escape warning.
	    title="FAAM-demo | Vision Language Model | SmolVLM",
	    description="Upload an image and ask a question about it.",
	)

	# Launch the app only when this file is run as a script, not on import.
	if __name__ == "__main__":
	    iface.launch()