image-captioning-cloned

Running

App Files Files Community

image-captioning-cloned / app.py

Anandhju-jayan

Duplicate from chats-bug/ai-image-captioning

1673a2d about 1 year ago

raw history blame contribute delete

No virus

3.05 kB

	import gradio as gr
	import torch
	from PIL import Image

	from model import BlipBaseModel, GitBaseCocoModel

	# Registry of the captioning models offered in the UI: display name -> model class.
	# Insertion order is significant: the interface selects its default model by
	# position (index 1, i.e. "Blip Base"), so do not reorder the entries casually.
	MODELS = {
	    "Git-Base-COCO": GitBaseCocoModel,
	    "Blip Base": BlipBaseModel,
	}

	# examples = [["Image1.png"], ["Image2.png"], ["Image3.png"]]

	def generate_captions(
	    image,
	    num_captions,
	    model_name,
	    max_length,
	    temperature,
	    top_k,
	    top_p,
	    repetition_penalty,
	    diversity_penalty,
	):
	    """
	    Generates captions for the given image.

	    -----
	    Parameters:
	    image: PIL.Image
	        The image to generate captions for. Must not be None.
	    num_captions: int
	        The number of captions to generate.
	    model_name: str
	        Key into MODELS selecting which model class to instantiate.
	    max_length, temperature, top_k, top_p, repetition_penalty, diversity_penalty:
	        Passed through to the model's generate method (the last four float
	        parameters listed below are coerced to float first).
	    -----
	    Returns:
	    str
	        The generated captions joined into a single string, one per line.
	    -----
	    Raises:
	    ValueError
	        If no image was provided.
	    KeyError
	        If model_name is not a key of MODELS.
	    """
	    # Fail early with a readable message instead of an obscure error from
	    # deep inside the model code when the form is submitted without an image.
	    if image is None:
	        raise ValueError(
	            "No image provided: please upload an image before generating captions."
	        )

	    # Gradio sliders return an int when the chosen value is a whole number and
	    # a float otherwise, so the float-valued parameters are normalised here.
	    temperature = float(temperature)
	    top_p = float(top_p)
	    repetition_penalty = float(repetition_penalty)
	    diversity_penalty = float(diversity_penalty)

	    device = "cuda" if torch.cuda.is_available() else "cpu"

	    # NOTE(review): a fresh model object is built on every call. If the model
	    # classes load weights in their constructor this is expensive -- consider
	    # caching instances per (model_name, device) after confirming they are
	    # safe to reuse.
	    model = MODELS[model_name](device)

	    captions = model.generate(
	        image=image,
	        max_length=max_length,
	        num_captions=num_captions,
	        temperature=temperature,
	        top_k=top_k,
	        top_p=top_p,
	        repetition_penalty=repetition_penalty,
	        diversity_penalty=diversity_penalty,
	    )

	    # The output Textbox shows one string, so put each caption on its own line.
	    return "\n".join(captions)

	title = "AI tool for generating captions for images"
	description = "This tool uses pretrained models to generate captions for images."

	# Model preselected in the dropdown and used by every example row.
	# Index 1 of MODELS is "Blip Base".
	_DEFAULT_MODEL = list(MODELS)[1]

	# Example images shown under the form; each is paired with the default settings.
	_EXAMPLE_IMAGES = ("Image1.png", "Image2.png", "Image3.png")

	# The order of `inputs` must match the positional parameters of generate_captions,
	# and each example row must supply one value per input in the same order.
	interface = gr.Interface(
	    fn=generate_captions,
	    inputs=[
	        gr.components.Image(type="pil", label="Image"),
	        gr.components.Slider(minimum=1, maximum=10, step=1, value=1, label="Number of Captions to Generate"),
	        # Pass a real list (not a dict_keys view) as the dropdown choices.
	        gr.components.Dropdown(list(MODELS), label="Model", value=_DEFAULT_MODEL),
	        gr.components.Slider(minimum=20, maximum=100, step=5, value=50, label="Maximum Caption Length"),
	        gr.components.Slider(minimum=0.1, maximum=10.0, step=0.1, value=1.0, label="Temperature"),
	        gr.components.Slider(minimum=1, maximum=100, step=1, value=50, label="Top K"),
	        # top_p is a cumulative probability mass, so values above 1.0 are not
	        # meaningful; the slider is capped at 1.0 (previously 5.0).
	        # NOTE(review): assumes the model wrapper forwards top_p to a
	        # Hugging Face style generate(), which rejects top_p > 1 -- confirm.
	        gr.components.Slider(minimum=0.1, maximum=1.0, step=0.1, value=1.0, label="Top P"),
	        gr.components.Slider(minimum=1.0, maximum=10.0, step=0.1, value=2.0, label="Repetition Penalty"),
	        gr.components.Slider(minimum=0.0, maximum=10.0, step=0.1, value=2.0, label="Diversity Penalty"),
	    ],
	    outputs=[
	        gr.components.Textbox(label="Caption"),
	    ],
	    # Set image examples to be displayed in the interface.
	    examples=[
	        [image_path, 1, _DEFAULT_MODEL, 50, 1.0, 50, 1.0, 2.0, 2.0]
	        for image_path in _EXAMPLE_IMAGES
	    ],
	    title=title,
	    description=description,
	    allow_flagging="never",
	)


	if __name__ == "__main__":
	    # Enable request queueing via .queue(): the `enable_queue` argument of
	    # launch() is deprecated in Gradio 3.x and no longer accepted by Gradio 4.x.
	    interface.queue()
	    # Launch the interface.
	    interface.launch(debug=True)