SRDdev committed on
Commit
eea6d3d
·
verified ·
1 Parent(s): ec0d5bd

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -0
app.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import gradio as gr
3
+ from PIL import Image
4
+ from gtts import gTTS
5
+ from transformers import BlipProcessor, BlipForConditionalGeneration
6
+
7
# Checkpoint identifier passed to both `from_pretrained` calls below, so the
# processor and the captioning model always come from the same checkpoint.
model = "Salesforce/blip-image-captioning-large"

# Loaded once at import time; `predict` reuses these module-level objects on
# every request instead of reloading them.
head = BlipForConditionalGeneration.from_pretrained(model)
processor = BlipProcessor.from_pretrained(model)
10
+
11
def predict(image):
    """Caption an image and synthesize the caption as speech.

    Args:
        image: The image handed over by the Gradio image input. ``None``
            (nothing uploaded) is rejected with a user-facing error.

    Returns:
        A ``(caption, filepath)`` tuple: the generated caption text and the
        path of the MP3 file containing the spoken caption.

    Raises:
        gr.Error: If no image was provided.
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    if image is None:
        # Without this guard the processor fails with an opaque traceback.
        raise gr.Error("Please upload an image first.")

    # `encoded` (not `inputs`) so the module-level `inputs` component is not
    # shadowed inside this function.
    encoded = processor(image, return_tensors="pt")
    output = head.generate(**encoded)
    caption = processor.decode(output[0], skip_special_tokens=True)

    # Reserve a unique file per call. A fixed 'caption.mp3' in the working
    # directory would be overwritten by concurrent requests, so one user could
    # receive another user's audio.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        filepath = tmp.name

    # NOTE(review): the temp file is intentionally not deleted here because the
    # caller still has to read it; cleanup is left to the host's temp directory.
    audio = gTTS(caption, lang="en", tld="co.in")
    audio.save(filepath)
    return caption, filepath
19
+
20
# Input component: a single uploaded image.
# Uses `gr.components.Image` rather than the legacy `gr.inputs.Image` namespace
# so the input is declared the same way as the output components below.
inputs = gr.components.Image(label="Upload any Image")

# Output components, in the same order as the tuple returned by `predict`:
# the caption text first, then the path of the generated audio file.
outputs = [
    gr.components.Textbox(type="text", label="Captions"),
    gr.components.Audio(type="filepath", label="audio"),
]
25
+
26
# HTML snippet passed to the interface as its description. It carries its own
# <h1> heading, a short usage note, and a link to the author's website.
description = """<div style="text-align: center;">
<h1>🔉 EchoSense <span style='color: #e6b800;'>Image to Audio</span> Playground</h1>
<p>This space helps generate audio descriptions for input Images</p>
<p><b>Please note:</b> This space is for demonstration purposes only.</p>
<p>Visit <a href="https://shreyasdixit.tech">Shreyas Dixit's</a> personal website for more information about the creator.</p>
</div>"""
32
+
33
# Long-form blurb passed to the interface as its `article` text. Written as
# adjacent string literals (one sentence per group) purely for readability;
# the resulting value is a single line of text.
article = (
    "Echo Sense is an innovative image captioning application that utilizes "
    "cutting-edge technology, specifically the powerful Transformer Model "
    "Architecture. "
    "This state-of-the-art approach has revolutionized Natural Language "
    "Processing (NLP) tasks, including image captioning, making it highly "
    "accurate and efficient. "
    "By leveraging pretrained models from Hugging Face and fine-tuning them on "
    "the COCO dataset, Echo Sense achieves exceptional performance while "
    "significantly reducing the computational cost and training time. "
    "The result is a versatile and reliable solution that not only produces "
    "accurate image captions but also generalizes well across various tasks. "
    "Experience the power of Echo Sense and witness firsthand the remarkable "
    "capabilities of the Transformer Model Architecture."
)
34
+
35
# Build the demo: `predict` is the callback, wired to the image input and the
# caption/audio outputs declared above.
interface = gr.Interface(
    predict,
    inputs,
    outputs,
    # Title is left blank; the HTML in `description` provides its own heading.
    title="",
    description=description,
    article=article,
    # NOTE(review): "grass" is a string theme name; confirm the installed
    # Gradio version still recognizes it.
    theme="grass",
    # NOTE(review): confirm that `gr.Interface` accepts `font` directly;
    # it may need to be set on a theme object instead.
    font=[
        gr.themes.GoogleFont("Open Sans"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
)

# Start serving the app.
interface.launch()