import io                          # for creating in-memory binary streams
import wave                        # for writing WAV audio files
import re                          # for regular expression utilities
import streamlit as st             # Streamlit UI library
from transformers import pipeline  # Hugging Face inference pipelines
from PIL import Image              # Python Imaging Library for image loading
import numpy as np                 # numerical operations, especially array handling


# 1) CACHE & LOAD MODELS (CPU only)
@st.cache_resource(show_spinner=False)
def load_captioner():
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=-1  # force CPU
    )


@st.cache_resource(show_spinner=False)
def load_story_pipe():
    return pipeline(
        "text2text-generation",
        model="google/flan-t5-base",
        device=-1  # force CPU
    )


@st.cache_resource(show_spinner=False)
def load_tts_pipe():
    return pipeline(
        "text-to-speech",
        model="facebook/mms-tts-eng",
        device=-1  # force CPU
    )


# 2) HELPER FUNCTIONS
def sentence_case(text: str) -> str:
    """Capitalize the first letter of each sentence and normalize whitespace."""
    parts = re.split(r'([.!?])', text)
    out = []
    for i in range(0, len(parts) - 1, 2):
        sentence = parts[i].strip()
        delimiter = parts[i + 1]
        if sentence:
            formatted = sentence[0].upper() + sentence[1:]
            out.append(f"{formatted}{delimiter}")
    if len(parts) % 2:
        last = parts[-1].strip()
        if last:
            formatted = last[0].upper() + last[1:]
            out.append(formatted)
    return " ".join(" ".join(out).split())


def caption_image(img: Image.Image, captioner) -> str:
    """Run the image-captioning pipeline and return the generated caption."""
    if img.mode != "RGB":
        img = img.convert("RGB")
    results = captioner(img)
    return results[0].get("generated_text", "") if results else ""


def story_from_caption(caption: str, pipe) -> str:
    """Expand a caption into a short story with the text2text pipeline."""
    if not caption:
        return "Could not generate a story without a caption."
    prompt = (
        f"Write a vivid, imaginative ~100-word story about this scene: {caption}\n\n"
        "Write a creative and descriptive short story."
    )
    results = pipe(
        prompt,
        max_length=120,
        min_length=60,
        do_sample=True,
        top_k=100,
        top_p=0.9,
        temperature=0.8,
        repetition_penalty=1.1,
        no_repeat_ngram_size=4,
        early_stopping=False
    )
    raw = results[0]["generated_text"].strip()
    # Remove prompt echo if present
    raw = re.sub(re.escape(prompt), "", raw, flags=re.IGNORECASE).strip()
    # Trim to last full sentence
    idx = max(raw.rfind("."), raw.rfind("!"), raw.rfind("?"))
    if idx != -1:
        raw = raw[:idx + 1]
    elif len(raw) > 80:
        raw = raw[:raw.rfind(" ") if raw.rfind(" ") > 60 else 80] + "..."
    return sentence_case(raw)


def tts_bytes(text: str, tts_pipe) -> bytes:
    """Synthesize speech for the given text and return it as WAV bytes."""
    if not text:
        return b""
    # Strip surrounding quotes, collapse ellipses, and ensure terminal punctuation
    cleaned = re.sub(r'^["\']|["\']$', '', text).strip()
    cleaned = re.sub(r'\.{2,}', '.', cleaned).replace('…', '...')
    if cleaned and cleaned[-1] not in ".!?":
        cleaned += "."
    cleaned = " ".join(cleaned.split())

    output = tts_pipe(cleaned)
    result = output[0] if isinstance(output, list) else output
    audio_array = result.get("audio")
    rate = result.get("sampling_rate")
    if audio_array is None or rate is None:
        return b""

    # Shape the audio as (samples, channels) for WAV writing
    if audio_array.ndim == 1:
        data = audio_array[:, np.newaxis]
    else:
        data = audio_array.T

    # Convert float audio in [-1, 1] to 16-bit PCM; clip first to avoid integer wrap-around
    pcm = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)

    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(data.shape[1])
        wf.setsampwidth(2)  # 2 bytes per sample = 16-bit PCM
        wf.setframerate(rate)
        wf.writeframes(pcm.tobytes())
    buf.seek(0)
    return buf.read()


# 3) STREAMLIT USER INTERFACE
st.set_page_config(page_title="✨ Imagine & Narrate", page_icon="✨", layout="centered")

# Persist upload across reruns
if "uploaded_file" not in st.session_state:
    st.session_state.uploaded_file = None

new_upload = st.file_uploader(
    "Choose an image file",
    type=["jpg", "jpeg", "png"]
)
if new_upload is not None:
    st.session_state.uploaded_file = new_upload

if st.session_state.uploaded_file is None:
    st.title("✨ Imagine & Narrate")
    st.info("➡️ Upload an image above to start the magic!")
    st.stop()

uploaded = st.session_state.uploaded_file
try:
    img = Image.open(uploaded)
except Exception as e:
    st.error(f"Could not load the image: {e}")
    st.stop()

st.title("✨ Imagine & Narrate")
st.subheader("📸 Your Visual Input")
st.image(img, caption=uploaded.name, use_container_width=True)
st.divider()

# Step 1: Generate Caption
st.subheader("🧠 Generating Caption")
with st.spinner("Analyzing image..."):
    captioner = load_captioner()
    raw_caption = caption_image(img, captioner)
if not raw_caption:
    st.error("Failed to generate caption.")
    st.stop()
caption = sentence_case(raw_caption)
st.markdown(f"**Identified Scene:** {caption}")
st.divider()

# Step 2: Generate Story
st.subheader("📖 Crafting a Story")
with st.spinner("Writing story..."):
    story_pipe = load_story_pipe()
    story = story_from_caption(caption, story_pipe)
if not story or story.strip() in {".", "..", "..."}:
    st.error("Failed to generate story.")
    st.stop()
st.write(story)
st.divider()

# Step 3: Synthesize Audio
st.subheader("👂 Hear the Story")
with st.spinner("Synthesizing audio..."):
    tts_pipe = load_tts_pipe()
    audio_bytes = tts_bytes(story, tts_pipe)
if not audio_bytes:
    st.warning("Audio generation failed.")
else:
    st.audio(audio_bytes, format="audio/wav")
    st.balloons()
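
# Usage note: Streamlit apps are launched with `streamlit run`; the filename below
# is an assumption, not part of the original script.
#   streamlit run app.py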