Spaces:

SRDdev
/

EchoSense

Runtime error

App Files Files Community

EchoSense / app.py

SRDdev

Update app.py

bf20e3a verified over 1 year ago

raw

history blame contribute delete

2.34 kB

	import torch
	import gradio as gr
	from PIL import Image
	from gtts import gTTS
	from transformers import BlipProcessor, BlipForConditionalGeneration

	# Hugging Face Hub id of the BLIP captioning checkpoint; both objects below
	# are loaded from it.
	model = "Salesforce/blip-image-captioning-large"

	# `processor` prepares images for the model and decodes generated token ids;
	# `head` is the model whose `.generate` produces the caption tokens.
	processor, head = (
	    BlipProcessor.from_pretrained(model),
	    BlipForConditionalGeneration.from_pretrained(model),
	)

	def predict(image):
	    """Caption an image and synthesize the caption as speech.

	    Args:
	        image: The image handed over by the Gradio image input.

	    Returns:
	        A ``(caption, filepath)`` tuple: the generated caption text and the
	        path of an MP3 file containing the spoken caption.

	    Raises:
	        ValueError: If ``image`` is ``None`` (nothing was uploaded).
	    """
	    import tempfile

	    if image is None:
	        raise ValueError("No image provided; please upload an image first.")

	    inputs = processor(image, return_tensors="pt")
	    output = head.generate(**inputs)
	    caption = processor.decode(output[0], skip_special_tokens=True)

	    # Reserve a unique path per call. A single shared "caption.mp3" lets
	    # overlapping requests overwrite each other's audio before it is served.
	    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
	        filepath = tmp.name

	    audio = gTTS(caption, lang="en", tld="co.in")
	    audio.save(filepath)
	    return caption, filepath

	# Input widget: the image to caption. Built from `gr.components`, the same
	# namespace the output widgets use, rather than the deprecated `gr.inputs`.
	inp = gr.components.Image(label="Upload any Image")

	# Output widgets, in the order `predict` returns its values: the caption
	# text first, then the path of the generated audio file.
	outputs = [
	    gr.components.Textbox(type="text", label="Captions"),
	    gr.components.Audio(type="filepath", label="audio"),
	]

	# HTML shown under the (empty) title of the interface. The anchor uses
	# `href`; the previous `herf` spelling left the link non-clickable.
	description = """<div style="text-align: center;">
	<h1>🔉 EchoSense <span style='color: #e6b800;'>Image to Audio</span> Playground</h1>
	<p>This spaces helps generate audio descriptions for input Images</p>
	<p><b>Please note:</b>This space is for demonstration purposes only.</p>
	<p>Visit <a href="https://shreyasdixit.tech">Shreyas Dixit's</a> personal website for more information about the creator.</p>
	</div>"""

	# Long-form blurb passed to the interface as its `article` text. Written as
	# adjacent string literals (joined at compile time) to keep lines readable.
	article = (
	    "Echo Sense is an innovative image captioning application that utilizes "
	    "cutting-edge technology, specifically the powerful Transformer Model "
	    "Architecture. "
	    "This state-of-the-art approach has revolutionized Natural Language "
	    "Processing (NLP) tasks, including image captioning, making it highly "
	    "accurate and efficient. "
	    "By leveraging pretrained models from Hugging Face and fine-tuning them on "
	    "the COCO dataset, Echo Sense achieves exceptional performance while "
	    "significantly reducing the computational cost and training time. "
	    "The result is a versatile and reliable solution that not only produces "
	    "accurate image captions but also generalizes well across various tasks. "
	    "Experience the power of Echo Sense and witness firsthand the remarkable "
	    "capabilities of the Transformer Model Architecture."
	)

	# Build the demo UI around `predict` and start serving it.
	#
	# Fonts are a theme setting, not a `gr.Interface` argument, so the font stack
	# is attached to a theme object and passed through `theme=`.
	# NOTE(review): this replaces the previous theme string "grass", which is
	# assumed not to resolve to a built-in theme in Gradio releases that provide
	# `gr.themes` — confirm against the pinned Gradio version.
	interface = gr.Interface(
	    fn=predict,
	    inputs=inp,
	    outputs=outputs,
	    title="",
	    description=description,
	    article=article,
	    theme=gr.themes.Default(
	        font=[
	            gr.themes.GoogleFont("Open Sans"),
	            "ui-sans-serif",
	            "system-ui",
	            "sans-serif",
	        ],
	    ),
	)
	interface.launch()