|
import torch |
|
from transformers import BlipProcessor, BlipForConditionalGeneration |
|
from gtts import gTTS |
|
import tempfile |
|
import subprocess |
|
import sys |
|
import gradio |
|
|
|
|
|
def ensure_package_installed(package_name, pip_name=None):
    """Import ``package_name``, installing it with pip first if it is missing.

    Args:
        package_name: Importable module name (what follows ``import``).
        pip_name: Distribution name passed to ``pip install`` when it differs
            from the import name (for example ``PIL`` vs ``Pillow``).
            Defaults to ``package_name``.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
        ImportError: If the module still cannot be imported after installing.
    """
    # Function-scope import keeps this helper self-contained.
    import importlib

    try:
        importlib.import_module(package_name)
    except ImportError:
        print(f"{package_name} package not found. Installing...")
        # Run pip through the current interpreter so the package lands in the
        # environment that is actually executing this script.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", pip_name or package_name]
        )
        # The import system caches what it has seen on disk; without this a
        # package installed after interpreter start-up may not be noticed.
        importlib.invalidate_caches()
        importlib.import_module(package_name)
|
|
|
|
|
# Make sure each third-party package is importable, installing any that are
# missing.
# NOTE(review): the file's top-level imports of these same packages appear
# before this point, so a missing package would already have raised there;
# confirm the intended ordering.
for _required_package in ("gradio", "transformers", "gtts"):
    ensure_package_installed(_required_package)
del _required_package  # do not leave the loop variable in the module namespace
|
|
|
|
|
|
|
# Both the processor and the model are loaded from the same pretrained
# checkpoint, so the name is spelled out once.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# Loaded once at import time and shared by every captioning request.
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)
|
|
|
def generate_description(image):
    """Caption ``image`` with the module-level BLIP processor and model.

    Args:
        image: The image to describe; the UI in this file supplies a PIL
            image.

    Returns:
        The decoded caption text with special tokens removed.
    """
    # Encode the image as PyTorch tensors, then move them to the model's device.
    encoded = processor(image, return_tensors="pt")
    encoded = encoded.to(model.device)

    generated_ids = model.generate(**encoded)

    # Only the first generated sequence is decoded.
    return processor.decode(generated_ids[0], skip_special_tokens=True)
|
|
|
def text_to_speech(text):
    """Convert ``text`` to English speech with gTTS and return the file path.

    Args:
        text: The text to be spoken.

    Returns:
        Path of a newly created ``.mp3`` temporary file. The file is not
        deleted automatically; the caller owns it.

    Raises:
        Exception: Whatever gTTS raises while building or saving the speech
            is propagated unchanged.
    """
    # Function-scope import: only needed to clean up after a failed save.
    import os

    # Build the gTTS object first so invalid text fails before any file exists.
    tts = gTTS(text=text, lang='en')

    # Only a unique path is needed. Close the handle immediately so no file
    # descriptor is leaked; gTTS reopens the path itself when saving.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
        audio_path = temp_audio.name

    try:
        tts.save(audio_path)
    except Exception:
        # Do not leave an empty orphaned temp file behind when saving fails.
        os.remove(audio_path)
        raise

    return audio_path
|
|
|
def process_image(image):
    """Return the generated description for the uploaded image.

    Args:
        image: The uploaded image, or ``None`` when no image has been
            provided.

    Returns:
        The description text, or an empty string when ``image`` is ``None``.
    """
    if image is None:
        # Nothing to caption: return an empty description rather than passing
        # None on to the captioning model.
        return ""
    return generate_description(image)
|
|
|
def get_audio(description):
    """Return the path of an audio file that speaks ``description``.

    Args:
        description: The text to read aloud.

    Returns:
        Path of the generated audio file, or ``None`` when ``description`` is
        ``None``, empty, or whitespace-only (there is nothing to speak).
    """
    if not description or not description.strip():
        # Skip speech synthesis entirely when there is no text to speak.
        return None
    return text_to_speech(description)
|
|
|
|
|
# Build the user interface: an image input next to the generated description,
# then the two action buttons and the audio player.
with gradio.Blocks() as demo:
    # Page heading and short usage hint.
    gradio.Markdown(value="# Image Description and Audio Transcript App")
    gradio.Markdown(
        value="Upload an image to get an AI-generated description. Click the button to hear the description."
    )

    # Input image and its description share one row.
    with gradio.Row():
        image_input = gradio.Image(type="pil")
        text_output = gradio.Textbox(label="Generated Description")

    generate_btn = gradio.Button(value="Generate Description")
    audio_btn = gradio.Button(value="Click here for an audio transcript")
    audio_output = gradio.Audio()

    # Wire the buttons: image -> description text, description text -> audio.
    generate_btn.click(
        fn=process_image,
        inputs=[image_input],
        outputs=[text_output],
    )
    audio_btn.click(
        fn=get_audio,
        inputs=[text_output],
        outputs=[audio_output],
    )


# Start the Gradio app.
demo.launch()
|
|