"""Streamlit app: turn an uploaded image into a caption, a story, and spoken audio.

Pipeline: image -> caption (BLIP) -> story (GPT-2) -> speech (gTTS MP3).
"""

import os
from io import BytesIO

import streamlit as st
import torch  # required by the transformers pipelines' backend
from gtts import gTTS
from PIL import Image
from transformers import pipeline

st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")


@st.cache_resource
def _load_captioner():
    # Cached so the captioning model is downloaded/initialized once per server
    # process instead of on every Streamlit rerun (each widget interaction
    # re-executes the whole script).
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")


@st.cache_resource
def _load_story_generator():
    # Cached for the same reason as the captioner above.
    return pipeline("text-generation", model="gpt2")


def img2text(image_path):
    """Return a caption string for the image at *image_path*.

    *image_path* may be a filesystem path or a file-like object (e.g. a
    Streamlit UploadedFile) — anything ``PIL.Image.open`` accepts.
    """
    captioner = _load_captioner()
    image = Image.open(image_path)
    return captioner(image)[0]["generated_text"]


def text2story(text):
    """Expand the caption *text* into a longer story using GPT-2.

    NOTE: for GPT-2, ``max_length`` counts prompt + generated tokens, so the
    returned story (which includes the prompt) is at most ~150 tokens.
    """
    generator = _load_story_generator()
    story = generator(text, max_length=150, num_return_sequences=1)[0]["generated_text"]
    return story


def text2audio(text):
    """Synthesize *text* to speech and return an MP3 as a rewound BytesIO."""
    tts = gTTS(text=text, lang='en')
    audio_buffer = BytesIO()
    tts.write_to_fp(audio_buffer)  # gTTS writes MP3 data
    audio_buffer.seek(0)  # rewind so st.audio reads from the start
    return audio_buffer


st.header("Turn Your Image to Audio Story")

uploaded_file = st.file_uploader("Select an Image...", type=["png", "jpg", "jpeg"])

if uploaded_file is not None:
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_container_width=True)

    # Process Image to Text
    st.text('Processing image to text...')
    scenario = img2text(uploaded_file)
    st.write("**Generated Caption:**", scenario)

    # Generate Story
    st.text('Generating a story...')
    story = text2story(scenario)
    st.write("**Generated Story:**", story)

    # Convert Story to Audio
    st.text('Generating audio...')
    audio_data = text2audio(story)

    # Play button
    if st.button("Play Audio"):
        # BUGFIX: gTTS produces MP3, not WAV — declare the correct MIME type.
        st.audio(audio_data, format="audio/mp3")