import io  # for creating in-memory binary streams
import wave  # for writing WAV audio files
import re  # for regular expression utilities
import streamlit as st  # Streamlit UI library
from transformers import pipeline  # Hugging Face inference pipelines
from PIL import Image  # Python Imaging Library for image loading
import numpy as np  # numerical operations, especially array handling

# 1) CACHE & LOAD MODELS (CPU only)
@st.cache_resource
def load_captioner():
    """Load the BLIP image-captioning pipeline once and reuse it across reruns."""
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=-1  # force CPU
    )


@st.cache_resource
def load_story_pipe():
    """Load the FLAN-T5 text2text pipeline used to turn a caption into a story."""
    return pipeline(
        "text2text-generation",
        model="google/flan-t5-base",
        device=-1  # force CPU
    )


@st.cache_resource
def load_tts_pipe():
    """Load the MMS English text-to-speech pipeline."""
    return pipeline(
        "text-to-speech",
        model="facebook/mms-tts-eng",
        device=-1  # force CPU
    )
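
# Note: st.cache_resource keeps each pipeline in memory for the lifetime of the
# Streamlit server process, so model weights are downloaded and loaded only on
# the first run; later reruns and sessions reuse the same objects.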

# 2) HELPER FUNCTIONS
def sentence_case(text: str) -> str:
    """Capitalize the first letter of each sentence and normalize whitespace."""
    parts = re.split(r'([.!?])', text)
    out = []
    for i in range(0, len(parts) - 1, 2):
        sentence = parts[i].strip()
        delimiter = parts[i + 1]
        if sentence:
            formatted = sentence[0].upper() + sentence[1:]
            out.append(f"{formatted}{delimiter}")
    if len(parts) % 2:
        last = parts[-1].strip()
        if last:
            formatted = last[0].upper() + last[1:]
            out.append(formatted)
    return " ".join(" ".join(out).split())

def caption_image(img: Image.Image, captioner) -> str:
    """Run the captioning pipeline on a PIL image and return the raw caption text."""
    if img.mode != "RGB":
        img = img.convert("RGB")
    results = captioner(img)
    return results[0].get("generated_text", "") if results else ""
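
# For reference, the image-to-text pipeline returns a list of dicts, e.g.
# (illustrative value): [{"generated_text": "a dog running on the beach"}]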

def story_from_caption(caption: str, pipe) -> str:
    """Expand a one-line caption into a short (~100-word) story with FLAN-T5."""
    if not caption:
        return "Could not generate a story without a caption."
    prompt = (
        f"Write a vivid, imaginative ~100-word story about this scene: {caption}\n\n"
        "Write a creative and descriptive short story."
    )
    results = pipe(
        prompt,
        max_length=120,
        min_length=60,
        do_sample=True,
        top_k=100,
        top_p=0.9,
        temperature=0.8,
        repetition_penalty=1.1,
        no_repeat_ngram_size=4,
        early_stopping=False
    )
    raw = results[0]["generated_text"].strip()
    # Remove prompt echo if present
    raw = re.sub(re.escape(prompt), "", raw, flags=re.IGNORECASE).strip()
    # Trim to the last complete sentence
    idx = max(raw.rfind("."), raw.rfind("!"), raw.rfind("?"))
    if idx != -1:
        raw = raw[:idx + 1]
    elif len(raw) > 80:
        raw = raw[:raw.rfind(" ") if raw.rfind(" ") > 60 else 80] + "..."
    return sentence_case(raw)

def tts_bytes(text: str, tts_pipe) -> bytes:
    """Synthesize speech for the story text and return it as in-memory WAV bytes."""
    if not text:
        return b""
    cleaned = re.sub(r'^["\']|["\']$', '', text).strip()
    cleaned = re.sub(r'\.{2,}', '.', cleaned).replace('…', '...')
    cleaned = " ".join(cleaned.split())
    if not cleaned:
        return b""
    if cleaned[-1] not in ".!?":
        cleaned += "."
    output = tts_pipe(cleaned)
    result = output[0] if isinstance(output, list) else output
    audio_array = result.get("audio")
    rate = result.get("sampling_rate")
    if audio_array is None or rate is None:
        return b""
    # Shape the audio as (num_samples, num_channels) for WAV writing
    if audio_array.ndim == 1:
        data = audio_array[:, np.newaxis]
    else:
        data = audio_array.T
    # Convert float audio in [-1, 1] to 16-bit PCM
    pcm = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(data.shape[1])
        wf.setsampwidth(2)  # 16-bit samples
        wf.setframerate(rate)
        wf.writeframes(pcm.tobytes())
    buf.seek(0)
    return buf.read()
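
# For reference, the text-to-speech pipeline returns a dict shaped roughly like
# (illustrative values; MMS typically uses a 16 kHz rate):
#   {"audio": <float32 NumPy array, often shape (1, num_samples)>, "sampling_rate": 16000}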

# 3) STREAMLIT USER INTERFACE
st.set_page_config(page_title="✨ Imagine & Narrate", page_icon="✨", layout="centered")

# Persist upload across reruns
if "uploaded_file" not in st.session_state:
    st.session_state.uploaded_file = None

new_upload = st.file_uploader(
    "Choose an image file",
    type=["jpg", "jpeg", "png"]
)
if new_upload is not None:
    st.session_state.uploaded_file = new_upload

if st.session_state.uploaded_file is None:
    st.title("✨ Imagine & Narrate")
    st.info("➡️ Upload an image above to start the magic!")
    st.stop()

uploaded = st.session_state.uploaded_file
try:
    img = Image.open(uploaded)
except Exception as e:
    st.error(f"Could not load the image: {e}")
    st.stop()

st.title("✨ Imagine & Narrate")
st.subheader("📸 Your Visual Input")
st.image(img, caption=uploaded.name, use_container_width=True)
st.divider()

# Step 1: Generate Caption
st.subheader("🧠 Generating Caption")
with st.spinner("Analyzing image..."):
    captioner = load_captioner()
    raw_caption = caption_image(img, captioner)
if not raw_caption:
    st.error("Failed to generate caption.")
    st.stop()
caption = sentence_case(raw_caption)
st.markdown(f"**Identified Scene:** {caption}")
st.divider()

# Step 2: Generate Story
st.subheader("📖 Crafting a Story")
with st.spinner("Writing story..."):
    story_pipe = load_story_pipe()
    story = story_from_caption(caption, story_pipe)
if not story or story.strip() in {".", "..", "..."}:
    st.error("Failed to generate story.")
    st.stop()
st.write(story)
st.divider()

# Step 3: Synthesize Audio
st.subheader("🔊 Hear the Story")
with st.spinner("Synthesizing audio..."):
    tts_pipe = load_tts_pipe()
    audio_bytes = tts_bytes(story, tts_pipe)
if not audio_bytes:
    st.warning("Audio generation failed.")
else:
    st.audio(audio_bytes, format="audio/wav")
    st.balloons()
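
# To try this locally (assuming the file is saved as app.py, with streamlit,
# transformers, torch, pillow, and numpy installed):
#   streamlit run app.py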