# Object-Detection-fb_basic / object_detection.py
# (Hugging Face Space upload: Prince9191, "Upload 4 files", 2183303 verified)
import importlib
import os
import subprocess
import sys
import tempfile

import gradio
import torch
from gtts import gTTS
from transformers import BlipProcessor, BlipForConditionalGeneration
def ensure_package_installed(package_name):
    """Import *package_name*, pip-installing it first if it is missing.

    Args:
        package_name: Importable module name. Assumed to match the PyPI
            distribution name (true for the packages used here:
            gradio, transformers, gtts).

    Raises:
        subprocess.CalledProcessError: If the pip install command fails.
        ImportError: If the package still cannot be imported after install.
    """
    try:
        importlib.import_module(package_name)
    except ImportError:
        print(f"{package_name} package not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        # A package installed mid-process may not be visible until the
        # import-system caches are invalidated.
        importlib.invalidate_caches()
        importlib.import_module(package_name)
# Ensure the third-party packages used below are installed.
# NOTE(review): these calls run *after* the top-of-file imports, which would
# already have raised ImportError if the packages were missing — consider
# moving these calls before the imports so the auto-install can actually help.
ensure_package_installed("gradio")
ensure_package_installed("transformers")
ensure_package_installed("gtts")
# Load the pre-trained BLIP image-captioning model and its processor.
# Downloads the weights from the Hugging Face Hub on first run, then caches.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
def generate_description(image):
    """Caption *image* with the module-level BLIP model and return the text."""
    encoded = processor(image, return_tensors="pt").to(model.device)
    generated_ids = model.generate(**encoded)
    return processor.decode(generated_ids[0], skip_special_tokens=True)
def text_to_speech(text):
    """Convert *text* to spoken English and return the path of an MP3 file.

    Args:
        text: The text to synthesize (English).

    Returns:
        Filesystem path to a temporary .mp3 file. The caller is responsible
        for deleting it when no longer needed.
    """
    tts = gTTS(text=text, lang='en')
    # mkstemp + close instead of NamedTemporaryFile(delete=False): the
    # original leaked an open file handle, and on Windows gTTS cannot
    # reopen a path that another handle still holds open.
    fd, audio_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    tts.save(audio_path)
    return audio_path
def process_image(image):
    """Run captioning on the uploaded image and return the caption text."""
    return generate_description(image)
def get_audio(description):
    """Synthesize *description* to speech and return the audio file path."""
    audio_path = text_to_speech(description)
    return audio_path
# Build the Gradio interface: image in -> caption text -> on-demand audio.
with gradio.Blocks() as demo:
    gradio.Markdown("# Image Description and Audio Transcript App")
    gradio.Markdown("Upload an image to get an AI-generated description. Click the button to hear the description.")
    with gradio.Row():
        # Uploaded image arrives as a PIL.Image, matching what the BLIP
        # processor expects.
        image_input = gradio.Image(type="pil")
        text_output = gradio.Textbox(label="Generated Description")
    generate_btn = gradio.Button("Generate Description")
    audio_btn = gradio.Button("Click here for an audio transcript")
    audio_output = gradio.Audio()
    # Wire the buttons: captioning fills the textbox; the audio button reads
    # whatever text is currently in the textbox and synthesizes it.
    generate_btn.click(process_image, inputs=[image_input], outputs=[text_output])
    audio_btn.click(get_audio, inputs=[text_output], outputs=[audio_output])
# Launch the Gradio app (blocks until the server is stopped).
demo.launch()