import io  # for creating in-memory binary streams
import wave  # for writing WAV audio files
import re  # for regular expression utilities
import streamlit as st  # Streamlit UI library
from transformers import pipeline  # Hugging Face inference pipelines
from PIL import Image  # Python Imaging Library for image loading
import numpy as np  # numerical operations, especially array handling

# 1) CACHE & LOAD MODELS (CPU only)
@st.cache_resource
def load_captioner():
    """Load the BLIP image-captioning pipeline once and reuse it across reruns."""
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=-1  # force CPU
    )


@st.cache_resource
def load_story_pipe():
    """Load the FLAN-T5 text2text pipeline used to turn a caption into a story."""
    return pipeline(
        "text2text-generation",
        model="google/flan-t5-base",
        device=-1  # force CPU
    )


@st.cache_resource
def load_tts_pipe():
    """Load the MMS English text-to-speech pipeline."""
    return pipeline(
        "text-to-speech",
        model="facebook/mms-tts-eng",
        device=-1  # force CPU
    )
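
# Note: st.cache_resource keeps each pipeline in memory for the lifetime of the
# Streamlit server process, so model weights are downloaded and loaded only on
# the first run; later reruns and sessions reuse the same objects.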

# 2) HELPER FUNCTIONS
def sentence_case(text: str) -> str:
    """Capitalize the first letter of each sentence and normalize whitespace."""
    parts = re.split(r'([.!?])', text)
    out = []
    for i in range(0, len(parts) - 1, 2):
        sentence = parts[i].strip()
        delimiter = parts[i + 1]
        if sentence:
            formatted = sentence[0].upper() + sentence[1:]
            out.append(f"{formatted}{delimiter}")
    if len(parts) % 2:
        last = parts[-1].strip()
        if last:
            formatted = last[0].upper() + last[1:]
            out.append(formatted)
    return " ".join(" ".join(out).split())

def caption_image(img: Image.Image, captioner) -> str:
    """Run the captioning pipeline on a PIL image and return the raw caption text."""
    if img.mode != "RGB":
        img = img.convert("RGB")
    results = captioner(img)
    return results[0].get("generated_text", "") if results else ""
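
# For reference, the image-to-text pipeline returns a list of dicts, e.g.
# (illustrative value): [{"generated_text": "a dog running on the beach"}]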

def story_from_caption(caption: str, pipe) -> str:
    """Expand a one-line caption into a short (~100-word) story with FLAN-T5."""
    if not caption:
        return "Could not generate a story without a caption."
    prompt = (
        f"Write a vivid, imaginative ~100-word story about this scene: {caption}\n\n"
        "Write a creative and descriptive short story."
    )
    results = pipe(
        prompt,
        max_length=120,
        min_length=60,
        do_sample=True,
        top_k=100,
        top_p=0.9,
        temperature=0.8,
        repetition_penalty=1.1,
        no_repeat_ngram_size=4,
        early_stopping=False
    )
    raw = results[0]["generated_text"].strip()
    # Remove prompt echo if present
    raw = re.sub(re.escape(prompt), "", raw, flags=re.IGNORECASE).strip()
    # Trim to the last complete sentence
    idx = max(raw.rfind("."), raw.rfind("!"), raw.rfind("?"))
    if idx != -1:
        raw = raw[:idx + 1]
    elif len(raw) > 80:
        raw = raw[:raw.rfind(" ") if raw.rfind(" ") > 60 else 80] + "..."
    return sentence_case(raw)

def tts_bytes(text: str, tts_pipe) -> bytes:
    """Synthesize speech for the story text and return it as in-memory WAV bytes."""
    if not text:
        return b""
    cleaned = re.sub(r'^["\']|["\']$', '', text).strip()
    cleaned = re.sub(r'\.{2,}', '.', cleaned).replace('…', '...')
    cleaned = " ".join(cleaned.split())
    if not cleaned:
        return b""
    if cleaned[-1] not in ".!?":
        cleaned += "."
    output = tts_pipe(cleaned)
    result = output[0] if isinstance(output, list) else output
    audio_array = result.get("audio")
    rate = result.get("sampling_rate")
    if audio_array is None or rate is None:
        return b""
    # Shape the audio as (num_samples, num_channels) for WAV writing
    if audio_array.ndim == 1:
        data = audio_array[:, np.newaxis]
    else:
        data = audio_array.T
    # Convert float audio in [-1, 1] to 16-bit PCM
    pcm = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(data.shape[1])
        wf.setsampwidth(2)  # 16-bit samples
        wf.setframerate(rate)
        wf.writeframes(pcm.tobytes())
    buf.seek(0)
    return buf.read()
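
# For reference, the text-to-speech pipeline returns a dict shaped roughly like
# (illustrative values; MMS typically uses a 16 kHz rate):
#   {"audio": <float32 NumPy array, often shape (1, num_samples)>, "sampling_rate": 16000}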

# 3) STREAMLIT USER INTERFACE
st.set_page_config(page_title="✨ Imagine & Narrate", page_icon="✨", layout="centered")

# Persist upload across reruns
if "uploaded_file" not in st.session_state:
    st.session_state.uploaded_file = None

new_upload = st.file_uploader(
    "Choose an image file",
    type=["jpg", "jpeg", "png"]
)
if new_upload is not None:
    st.session_state.uploaded_file = new_upload

if st.session_state.uploaded_file is None:
    st.title("✨ Imagine & Narrate")
    st.info("➡️ Upload an image above to start the magic!")
    st.stop()

uploaded = st.session_state.uploaded_file
try:
    img = Image.open(uploaded)
except Exception as e:
    st.error(f"Could not load the image: {e}")
    st.stop()

st.title("✨ Imagine & Narrate")
st.subheader("📸 Your Visual Input")
st.image(img, caption=uploaded.name, use_container_width=True)
st.divider()

# Step 1: Generate Caption
st.subheader("🧠 Generating Caption")
with st.spinner("Analyzing image..."):
    captioner = load_captioner()
    raw_caption = caption_image(img, captioner)
if not raw_caption:
    st.error("Failed to generate caption.")
    st.stop()
caption = sentence_case(raw_caption)
st.markdown(f"**Identified Scene:** {caption}")
st.divider()

# Step 2: Generate Story
st.subheader("📖 Crafting a Story")
with st.spinner("Writing story..."):
    story_pipe = load_story_pipe()
    story = story_from_caption(caption, story_pipe)
if not story or story.strip() in {".", "..", "..."}:
    st.error("Failed to generate story.")
    st.stop()
st.write(story)
st.divider()

# Step 3: Synthesize Audio
st.subheader("🔊 Hear the Story")
with st.spinner("Synthesizing audio..."):
    tts_pipe = load_tts_pipe()
    audio_bytes = tts_bytes(story, tts_pipe)
if not audio_bytes:
    st.warning("Audio generation failed.")
else:
    st.audio(audio_bytes, format="audio/wav")
    st.balloons()
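
# To try this locally (assuming the file is saved as app.py, with streamlit,
# transformers, torch, pillow, and numpy installed):
#   streamlit run app.py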