"""Streamlit demo: upload an image, caption it with BLIP, expand the caption
into a short GPT-2 story, and read the story aloud via gTTS."""

import io

import streamlit as st
from gtts import gTTS
from PIL import Image
from transformers import pipeline

st.title("🖼️ → 📖 Image-to-Story Demo")
st.write(
    "Upload an image and watch as it’s captioned, turned into a short story, "
    "and even read aloud!"
)


@st.cache_resource
def load_captioner():
    """Load the BLIP image-captioning pipeline once per server process."""
    return pipeline("image-to-text", model="unography/blip-large-long-cap")


@st.cache_resource
def load_story_gen():
    """Load the GPT-2 text-generation pipeline once per server process."""
    return pipeline("text-generation", model="gpt2", tokenizer="gpt2")


captioner = load_captioner()
story_gen = load_story_gen()

uploaded = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"], key="image")

if uploaded:
    # Bug fix: the original cached caption/story/audio in session_state with no
    # invalidation, so uploading a *different* image kept showing the first
    # image's results. Drop the stale entries whenever the uploaded file changes.
    if st.session_state.get("last_file_id") != uploaded.file_id:
        for stale_key in ("caption", "story", "audio_bytes"):
            st.session_state.pop(stale_key, None)
        st.session_state.last_file_id = uploaded.file_id

    img = Image.open(uploaded)
    st.image(img, use_column_width=True)

    # --- Caption ---
    if "caption" not in st.session_state:
        with st.spinner("Generating caption…"):
            caps = captioner(img)
            # Bug fix: image-to-text pipelines return [{"generated_text": ...}].
            # The original stored the whole dict, which rendered as a dict and
            # then broke the text-generation call below. Extract the string.
            first = caps[0] if isinstance(caps, list) else caps
            st.session_state.caption = (
                first["generated_text"] if isinstance(first, dict) else first
            )
    st.write("**Caption:**", st.session_state.caption)

    # --- Story ---
    if "story" not in st.session_state:
        with st.spinner("Spinning up a story…"):
            out = story_gen(
                st.session_state.caption,
                max_length=200,
                num_return_sequences=1,
                do_sample=True,
                top_p=0.9,
            )
            st.session_state.story = out[0]["generated_text"]
    st.write("**Story:**", st.session_state.story)

    # --- Audio (synthesized once per story) ---
    if "audio_bytes" not in st.session_state:
        with st.spinner("Generating audio…"):
            tts = gTTS(text=st.session_state.story, lang="en")
            buf = io.BytesIO()
            tts.write_to_fp(buf)
            st.session_state.audio_bytes = buf.getvalue()

    # Bug fix: st.audio accepts raw bytes directly, so the original
    # NamedTemporaryFile(delete=False) — which leaked one .mp3 on disk per
    # click — is unnecessary.
    if st.button("🔊 Play Story Audio"):
        st.audio(st.session_state.audio_bytes, format="audio/mp3")