Spaces:
Sleeping
Sleeping
| # app.py | |
| import gradio as gr | |
| from transformers import BlipProcessor, BlipForConditionalGeneration | |
| from gtts import gTTS | |
| import io | |
| from PIL import Image | |
# -------------------------------
# BLIP-base captioning checkpoint (lighter version), loaded at import time
# -------------------------------
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# The processor and the model are both built from the same checkpoint name.
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)
# -------------------------------
# Generate caption function
# -------------------------------
def generate_caption_fn(image):
    """Return a caption for ``image`` generated by the BLIP model.

    Args:
        image: A ``PIL.Image.Image``, or an array accepted by
            ``Image.fromarray`` (the UI input is declared with
            ``type="numpy"``).

    Returns:
        The decoded caption string with special tokens stripped.

    Raises:
        ValueError: If ``image`` is ``None``.
    """
    # A missing upload would otherwise surface as an obscure error from
    # Image.fromarray; fail early with a message that names the real problem.
    if image is None:
        raise ValueError("No image provided: upload an image to caption.")

    # Convert the uploaded array to PIL
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    # BLIP preprocessing
    inputs = processor(images=image, return_tensors="pt")

    # Generate the caption and decode the first (only) returned sequence
    out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)
# -------------------------------
# Convert text to speech using gTTS
# -------------------------------
def text_to_speech(caption, lang="en"):
    """Synthesize ``caption`` to MP3 audio with gTTS.

    Args:
        caption: The text to speak. Must contain at least one
            non-whitespace character.
        lang: Language code passed to gTTS. Defaults to ``"en"``, the value
            that was previously hard-coded.

    Returns:
        An ``io.BytesIO`` holding the MP3 data, rewound to position 0.

    Raises:
        ValueError: If ``caption`` is empty or only whitespace.
    """
    # gTTS has nothing to synthesize for blank text; reject it up front with
    # a clear error instead of failing inside the library.
    if not caption or not caption.strip():
        raise ValueError("Cannot synthesize speech from an empty caption.")

    tts = gTTS(text=caption, lang=lang)
    mp3_fp = io.BytesIO()
    tts.write_to_fp(mp3_fp)
    # Rewind so callers can read the audio from the start.
    mp3_fp.seek(0)
    return mp3_fp
# -------------------------------
# Gradio interface: Caption + Audio
# -------------------------------
def generate_caption_tts(image):
    """Gradio callback: caption ``image`` and synthesize the caption as speech.

    Args:
        image: The uploaded image as delivered by the Gradio image input.

    Returns:
        A ``(caption, audio_path)`` tuple: the caption text and the path of
        an MP3 file containing the spoken caption.

    Raises:
        gr.Error: If no image was supplied.
    """
    import tempfile  # local import: only this callback needs it

    if image is None:
        # gr.Error is shown to the user in the UI instead of a bare failure.
        raise gr.Error("Please upload an image first.")

    caption = generate_caption_fn(image)
    audio = text_to_speech(caption)

    # gr.Audio outputs take a file path (or a (sample_rate, array) tuple),
    # not a file-like object, so persist the in-memory MP3 and hand back its
    # path. delete=False keeps the file alive after the handle is closed so
    # Gradio can still read it.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        tmp.write(audio.getvalue())
    return caption, tmp.name
# Wire the captioning callback into a simple upload -> caption + audio UI.
interface = gr.Interface(
    fn=generate_caption_tts,
    inputs=gr.Image(type="numpy"),
    outputs=[
        gr.Textbox(label="Generated Caption"),
        # gr.Audio documents only "numpy" and "filepath" as valid types;
        # "file" is not one of them.
        gr.Audio(type="filepath", label="TTS Audio"),
    ],
    title="Blind Assistant: Image Captioning",
    description="Upload an image and get a descriptive caption + speech.",
)

interface.launch()