import io                   # for creating in-memory binary streams
import wave                 # for writing WAV audio files
import re                   # for regular expression utilities
import streamlit as st      # Streamlit UI library
from transformers import pipeline  # Hugging Face inference pipelines
from PIL import Image       # Python Imaging Library for image loading
import numpy as np          # numerical operations, especially array handling

# 1) CACHE & LOAD MODELS (CPU only)
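# st.cache_resource keeps each pipeline object alive across Streamlit reruns,
# so the models are downloaded and initialized only once per process.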
@st.cache_resource(show_spinner=False)
def load_captioner():
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=-1  # force CPU
    )

@st.cache_resource(show_spinner=False)
def load_story_pipe():
    return pipeline(
        "text2text-generation",
        model="google/flan-t5-base",
        device=-1  # force CPU
    )

@st.cache_resource(show_spinner=False)
def load_tts_pipe():
    return pipeline(
        "text-to-speech",
        model="facebook/mms-tts-eng",
        device=-1  # force CPU
    )

# 2) HELPER FUNCTIONS
def sentence_case(text: str) -> str:
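    # Split on sentence-ending punctuation; the capturing group keeps each
    # delimiter so sentences can be re-joined with their own terminators.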
    parts = re.split(r'([.!?])', text)
    out = []
    for i in range(0, len(parts) - 1, 2):
        sentence = parts[i].strip()
        delimiter = parts[i + 1]
        if sentence:
            formatted = sentence[0].upper() + sentence[1:]
            out.append(f"{formatted}{delimiter}")
    if len(parts) % 2:
        last = parts[-1].strip()
        if last:
            formatted = last[0].upper() + last[1:]
            out.append(formatted)
    return " ".join(" ".join(out).split())

def caption_image(img: Image.Image, captioner) -> str:
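    # Convert palette/grayscale/alpha images to RGB before running the
    # captioning pipeline.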
    if img.mode != "RGB":
        img = img.convert("RGB")
    results = captioner(img)
    return (results[0].get("generated_text", "") if results else "")

def story_from_caption(caption: str, pipe) -> str:
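    # Wrap the caption in a prompt and sample a short passage, then strip any
    # echoed prompt text and trim the result to the last complete sentence.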
    if not caption:
        return "Could not generate a story without a caption."
    prompt = f"Write a vivid, imaginative ~100-word story about this scene: {caption}\n\nWrite a creative and descriptive short story."
    results = pipe(
        prompt,
        max_length=120,
        min_length=60,
        do_sample=True,
        top_k=100,
        top_p=0.9,
        temperature=0.8,
        repetition_penalty=1.1,
        no_repeat_ngram_size=4,
        early_stopping=False
    )
    raw = results[0]["generated_text"].strip()
    # Remove prompt echo if present
    raw = re.sub(re.escape(prompt), "", raw, flags=re.IGNORECASE).strip()
    # Trim to last full sentence
    idx = max(raw.rfind("."), raw.rfind("!"), raw.rfind("?"))
    if idx != -1:
        raw = raw[:idx+1]
    elif len(raw) > 80:
        raw = raw[:raw.rfind(" ") if raw.rfind(" ") > 60 else 80] + "..."
    return sentence_case(raw)

def tts_bytes(text: str, tts_pipe) -> bytes:
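    # Tidy the text for speech, run the TTS pipeline, and pack the returned
    # float waveform into an in-memory 16-bit PCM WAV file.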
    if not text:
        return b""
    cleaned = re.sub(r'^["\']|["\']$', '', text).strip()
    cleaned = re.sub(r'\.{2,}', '.', cleaned).replace('…', '...')
    if not cleaned:
        return b""
    if cleaned[-1] not in ".!?":
        cleaned += "."
    cleaned = " ".join(cleaned.split())
    output = tts_pipe(cleaned)
    result = output[0] if isinstance(output, list) else output
    audio_array = result.get("audio")
    rate = result.get("sampling_rate")
    if audio_array is None or rate is None:
        return b""
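    # Arrange samples as (frames, channels) for the WAV writer: 1-D output is
    # treated as mono, anything else as (channels, frames) and transposed.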
    if audio_array.ndim == 1:
        data = audio_array[:, np.newaxis]
    else:
        data = audio_array.T
    # Scale to 16-bit PCM; clip first so out-of-range samples do not wrap around.
    pcm = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(data.shape[1])
        wf.setsampwidth(2)  # 2 bytes per sample = 16-bit audio
        wf.setframerate(rate)
        wf.writeframes(pcm.tobytes())
    buf.seek(0)
    return buf.read()

# 3) STREAMLIT USER INTERFACE
st.set_page_config(page_title="✨ Imagine & Narrate", page_icon="✨", layout="centered")

# Persist upload across reruns
if "uploaded_file" not in st.session_state:
    st.session_state.uploaded_file = None

new_upload = st.file_uploader(
    "Choose an image file",
    type=["jpg", "jpeg", "png"]
)
if new_upload is not None:
    st.session_state.uploaded_file = new_upload

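# Until an image has been provided, show only the intro and halt the script;
# st.stop() keeps the caption/story/audio steps below from running.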
if st.session_state.uploaded_file is None:
    st.title("✨ Imagine & Narrate")
    st.info("➑️ Upload an image above to start the magic!")
    st.stop()

uploaded = st.session_state.uploaded_file
try:
    img = Image.open(uploaded)
except Exception as e:
    st.error(f"Could not load the image: {e}")
    st.stop()

st.title("✨ Imagine & Narrate")
st.subheader("πŸ“Έ Your Visual Input")
st.image(img, caption=uploaded.name, use_container_width=True)
st.divider()

# Step 1: Generate Caption
st.subheader("🧠 Generating Caption")
with st.spinner("Analyzing image..."):
    captioner = load_captioner()
    raw_caption = caption_image(img, captioner)
    if not raw_caption:
        st.error("Failed to generate caption.")
        st.stop()
    caption = sentence_case(raw_caption)
st.markdown(f"**Identified Scene:** {caption}")
st.divider()

# Step 2: Generate Story
st.subheader("πŸ“– Crafting a Story")
with st.spinner("Writing story..."):
    story_pipe = load_story_pipe()
    story = story_from_caption(caption, story_pipe)
    if not story or story.strip() in {".", "..", "..."}:
        st.error("Failed to generate story.")
        st.stop()
st.write(story)
st.divider()

# Step 3: Synthesize Audio
st.subheader("πŸ‘‚ Hear the Story")
with st.spinner("Synthesizing audio..."):
    tts_pipe = load_tts_pipe()
    audio_bytes = tts_bytes(story, tts_pipe)
    if not audio_bytes:
        st.warning("Audio generation failed.")
    else:
        st.audio(audio_bytes, format="audio/wav")
st.balloons()