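"""Streamlit app: caption an uploaded image with BLIP, expand the caption into a short story with FLAN-T5, and read the story aloud with MMS text-to-speech."""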
import io # for creating in-memory binary streams
import wave # for writing WAV audio files
import re # for regular expression utilities
import streamlit as st # Streamlit UI library
from transformers import pipeline # Hugging Face inference pipelines
from PIL import Image # Python Imaging Library for image loading
import numpy as np # numerical operations, especially array handling
# 1) CACHE & LOAD MODELS (CPU only)
@st.cache_resource(show_spinner=False)
def load_captioner():
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=-1  # force CPU
    )
@st.cache_resource(show_spinner=False)
def load_story_pipe():
    return pipeline(
        "text2text-generation",
        model="google/flan-t5-base",
        device=-1  # force CPU
    )
@st.cache_resource(show_spinner=False)
def load_tts_pipe():
    return pipeline(
        "text-to-speech",
        model="facebook/mms-tts-eng",
        device=-1  # force CPU
    )
# 2) HELPER FUNCTIONS
def sentence_case(text: str) -> str:
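    """Capitalize the first letter of each sentence and collapse repeated whitespace."""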
    parts = re.split(r'([.!?])', text)
    out = []
    for i in range(0, len(parts) - 1, 2):
        sentence = parts[i].strip()
        delimiter = parts[i + 1]
        if sentence:
            formatted = sentence[0].upper() + sentence[1:]
            out.append(f"{formatted}{delimiter}")
    if len(parts) % 2:
        last = parts[-1].strip()
        if last:
            formatted = last[0].upper() + last[1:]
            out.append(formatted)
    return " ".join(" ".join(out).split())
def caption_image(img: Image.Image, captioner) -> str:
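    """Run the captioning pipeline on an RGB-converted image and return the generated text."""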
    if img.mode != "RGB":
        img = img.convert("RGB")
    results = captioner(img)
    return (results[0].get("generated_text", "") if results else "")
def story_from_caption(caption: str, pipe) -> str:
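    """Prompt the text2text pipeline with the caption and return a short story trimmed to the last full sentence."""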
    if not caption:
        return "Could not generate a story without a caption."
    prompt = f"Write a vivid, imaginative ~100-word story about this scene: {caption}\n\nWrite a creative and descriptive short story."
    results = pipe(
        prompt,
        max_length=120,
        min_length=60,
        do_sample=True,
        top_k=100,
        top_p=0.9,
        temperature=0.8,
        repetition_penalty=1.1,
        no_repeat_ngram_size=4,
        early_stopping=False
    )
    raw = results[0]["generated_text"].strip()
    # Remove prompt echo if present
    raw = re.sub(re.escape(prompt), "", raw, flags=re.IGNORECASE).strip()
    # Trim to last full sentence
    idx = max(raw.rfind("."), raw.rfind("!"), raw.rfind("?"))
    if idx != -1:
        raw = raw[:idx + 1]
    elif len(raw) > 80:
        raw = raw[:raw.rfind(" ") if raw.rfind(" ") > 60 else 80] + "..."
    return sentence_case(raw)
def tts_bytes(text: str, tts_pipe) -> bytes:
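    """Synthesize speech for the given text and return it as WAV bytes (16-bit PCM)."""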
    if not text:
        return b""
    cleaned = re.sub(r'^["\']|["\']$', '', text).strip()
    cleaned = re.sub(r'\.{2,}', '.', cleaned).replace('…', '...')
    if not cleaned:
        return b""
    if cleaned[-1] not in ".!?":
        cleaned += "."
    cleaned = " ".join(cleaned.split())
    output = tts_pipe(cleaned)
    result = output[0] if isinstance(output, list) else output
    audio_array = result.get("audio")
    rate = result.get("sampling_rate")
    if audio_array is None or rate is None:
        return b""
    if audio_array.ndim == 1:
        data = audio_array[:, np.newaxis]  # mono: (samples,) -> (samples, 1)
    else:
        data = audio_array.T  # (channels, samples) -> (samples, channels)
    pcm = (data * 32767).astype(np.int16)  # float [-1, 1] -> 16-bit PCM
    buf = io.BytesIO()
    wf = wave.open(buf, "wb")
    wf.setnchannels(data.shape[1])
    wf.setsampwidth(2)  # 2 bytes per sample (int16)
    wf.setframerate(rate)
    wf.writeframes(pcm.tobytes())
    wf.close()
    buf.seek(0)
    return buf.read()
# 3) STREAMLIT USER INTERFACE
st.set_page_config(page_title="✨ Imagine & Narrate", page_icon="✨", layout="centered")
# Persist upload across reruns
if "uploaded_file" not in st.session_state:
    st.session_state.uploaded_file = None
new_upload = st.file_uploader(
    "Choose an image file",
    type=["jpg", "jpeg", "png"]
)
if new_upload is not None:
    st.session_state.uploaded_file = new_upload
if st.session_state.uploaded_file is None:
    st.title("✨ Imagine & Narrate")
    st.info("➡️ Upload an image above to start the magic!")
    st.stop()
uploaded = st.session_state.uploaded_file
try:
    img = Image.open(uploaded)
except Exception as e:
    st.error(f"Could not load the image: {e}")
    st.stop()
st.title("✨ Imagine & Narrate")
st.subheader("📸 Your Visual Input")
st.image(img, caption=uploaded.name, use_container_width=True)
st.divider()
# Step 1: Generate Caption
st.subheader("🧠 Generating Caption")
with st.spinner("Analyzing image..."):
    captioner = load_captioner()
    raw_caption = caption_image(img, captioner)
if not raw_caption:
    st.error("Failed to generate caption.")
    st.stop()
caption = sentence_case(raw_caption)
st.markdown(f"**Identified Scene:** {caption}")
st.divider()
# Step 2: Generate Story
st.subheader("📖 Crafting a Story")
with st.spinner("Writing story..."):
    story_pipe = load_story_pipe()
    story = story_from_caption(caption, story_pipe)
if not story or story.strip() in {".", "..", "..."}:
    st.error("Failed to generate story.")
    st.stop()
st.write(story)
st.divider()
# Step 3: Synthesize Audio
st.subheader("👂 Hear the Story")
with st.spinner("Synthesizing audio..."):
    tts_pipe = load_tts_pipe()
    audio_bytes = tts_bytes(story, tts_pipe)
if not audio_bytes:
    st.warning("Audio generation failed.")
else:
    st.audio(audio_bytes, format="audio/wav")
    st.balloons()