File size: 2,479 Bytes
2183303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
import tempfile
import subprocess
import sys
import gradio


def ensure_package_installed(package_name):
    try:
        __import__(package_name)
    except ImportError:
        print(f"{package_name} package not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        __import__(package_name)

# Check and install openai
ensure_package_installed("gradio")
ensure_package_installed("transformers")
ensure_package_installed("gtts")


# Load the image captioning model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

def generate_description(image):
    """Generates a textual description of the given image using a pre-trained BLIP model."""
    inputs = processor(image, return_tensors="pt").to(model.device)
    output = model.generate(**inputs)
    description = processor.decode(output[0], skip_special_tokens=True)
    return description

def text_to_speech(text):
    """Converts text to speech using gTTS and returns the audio file path."""
    tts = gTTS(text=text, lang='en')
    temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tts.save(temp_audio.name)
    return temp_audio.name

def process_image(image):
    """Processes the uploaded image to generate description and return audio file."""
    description = generate_description(image)
    return description

def get_audio(description):
    """Generates the audio file for the given description."""
    return text_to_speech(description)

# Build Gradio Interface
with gradio.Blocks() as demo:
    gradio.Markdown("# Image Description and Audio Transcript App")
    gradio.Markdown("Upload an image to get an AI-generated description. Click the button to hear the description.")
    
    with gradio.Row():
        image_input = gradio.Image(type="pil")
        text_output = gradio.Textbox(label="Generated Description")
    
    generate_btn = gradio.Button("Generate Description")
    audio_btn = gradio.Button("Click here for an audio transcript")
    audio_output = gradio.Audio()
    
    generate_btn.click(process_image, inputs=[image_input], outputs=[text_output])
    audio_btn.click(get_audio, inputs=[text_output], outputs=[audio_output])
    
# Launch the Gradio app
demo.launch()