import io                          # for creating in-memory binary streams
import wave                        # for writing WAV audio files
import re                          # for regular expression utilities
import streamlit as st             # Streamlit UI library
from transformers import pipeline  # Hugging Face inference pipelines
from PIL import Image              # Python Imaging Library for image loading
import numpy as np                 # numerical operations, especially array handling


# 1) CACHE & LOAD MODELS (CPU only)
@st.cache_resource(show_spinner=False)
def load_captioner():
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=-1  # force CPU
    )


@st.cache_resource(show_spinner=False)
def load_story_pipe():
    return pipeline(
        "text2text-generation",
        model="google/flan-t5-base",
        device=-1  # force CPU
    )


@st.cache_resource(show_spinner=False)
def load_tts_pipe():
    return pipeline(
        "text-to-speech",
        model="facebook/mms-tts-eng",
        device=-1  # force CPU
    )


# 2) HELPER FUNCTIONS
def sentence_case(text: str) -> str:
    """Capitalize the first letter of each sentence and normalize whitespace."""
    parts = re.split(r'([.!?])', text)
    out = []
    for i in range(0, len(parts) - 1, 2):
        sentence = parts[i].strip()
        delimiter = parts[i + 1]
        if sentence:
            formatted = sentence[0].upper() + sentence[1:]
            out.append(f"{formatted}{delimiter}")
    if len(parts) % 2:
        last = parts[-1].strip()
        if last:
            formatted = last[0].upper() + last[1:]
            out.append(formatted)
    return " ".join(" ".join(out).split())


def caption_image(img: Image.Image, captioner) -> str:
    """Run the image-captioning pipeline and return the generated caption."""
    if img.mode != "RGB":
        img = img.convert("RGB")
    results = captioner(img)
    return results[0].get("generated_text", "") if results else ""


def story_from_caption(caption: str, pipe) -> str:
    """Expand a caption into a short story with the text2text pipeline."""
    if not caption:
        return "Could not generate a story without a caption."
    prompt = (
        f"Write a vivid, imaginative ~100-word story about this scene: {caption}\n\n"
        "Write a creative and descriptive short story."
    )
    results = pipe(
        prompt,
        max_length=120,
        min_length=60,
        do_sample=True,
        top_k=100,
        top_p=0.9,
        temperature=0.8,
        repetition_penalty=1.1,
        no_repeat_ngram_size=4,
        early_stopping=False
    )
    raw = results[0]["generated_text"].strip()
    # Remove prompt echo if present
    raw = re.sub(re.escape(prompt), "", raw, flags=re.IGNORECASE).strip()
    # Trim to last full sentence
    idx = max(raw.rfind("."), raw.rfind("!"), raw.rfind("?"))
    if idx != -1:
        raw = raw[:idx + 1]
    elif len(raw) > 80:
        raw = raw[:raw.rfind(" ") if raw.rfind(" ") > 60 else 80] + "..."
    return sentence_case(raw)


def tts_bytes(text: str, tts_pipe) -> bytes:
    """Synthesize speech for the given text and return it as WAV bytes."""
    if not text:
        return b""
    # Strip surrounding quotes, collapse ellipses, and ensure terminal punctuation
    cleaned = re.sub(r'^["\']|["\']$', '', text).strip()
    cleaned = re.sub(r'\.{2,}', '.', cleaned).replace('…', '...')
    if cleaned and cleaned[-1] not in ".!?":
        cleaned += "."
    cleaned = " ".join(cleaned.split())

    output = tts_pipe(cleaned)
    result = output[0] if isinstance(output, list) else output
    audio_array = result.get("audio")
    rate = result.get("sampling_rate")
    if audio_array is None or rate is None:
        return b""

    # Shape the audio as (samples, channels) for WAV writing
    if audio_array.ndim == 1:
        data = audio_array[:, np.newaxis]
    else:
        data = audio_array.T

    # Convert float audio in [-1, 1] to 16-bit PCM; clip first to avoid integer wrap-around
    pcm = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)

    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(data.shape[1])
        wf.setsampwidth(2)  # 2 bytes per sample = 16-bit PCM
        wf.setframerate(rate)
        wf.writeframes(pcm.tobytes())
    buf.seek(0)
    return buf.read()


# 3) STREAMLIT USER INTERFACE
st.set_page_config(page_title="✨ Imagine & Narrate", page_icon="✨", layout="centered")

# Persist upload across reruns
if "uploaded_file" not in st.session_state:
    st.session_state.uploaded_file = None

new_upload = st.file_uploader(
    "Choose an image file",
    type=["jpg", "jpeg", "png"]
)
if new_upload is not None:
    st.session_state.uploaded_file = new_upload

if st.session_state.uploaded_file is None:
    st.title("✨ Imagine & Narrate")
    st.info("➡️ Upload an image above to start the magic!")
    st.stop()

uploaded = st.session_state.uploaded_file
try:
    img = Image.open(uploaded)
except Exception as e:
    st.error(f"Could not load the image: {e}")
    st.stop()

st.title("✨ Imagine & Narrate")
st.subheader("📸 Your Visual Input")
st.image(img, caption=uploaded.name, use_container_width=True)
st.divider()

# Step 1: Generate Caption
st.subheader("🧠 Generating Caption")
with st.spinner("Analyzing image..."):
    captioner = load_captioner()
    raw_caption = caption_image(img, captioner)
if not raw_caption:
    st.error("Failed to generate caption.")
    st.stop()
caption = sentence_case(raw_caption)
st.markdown(f"**Identified Scene:** {caption}")
st.divider()

# Step 2: Generate Story
st.subheader("📖 Crafting a Story")
with st.spinner("Writing story..."):
    story_pipe = load_story_pipe()
    story = story_from_caption(caption, story_pipe)
if not story or story.strip() in {".", "..", "..."}:
    st.error("Failed to generate story.")
    st.stop()
st.write(story)
st.divider()

# Step 3: Synthesize Audio
st.subheader("👂 Hear the Story")
with st.spinner("Synthesizing audio..."):
    tts_pipe = load_tts_pipe()
    audio_bytes = tts_bytes(story, tts_pipe)
if not audio_bytes:
    st.warning("Audio generation failed.")
else:
    st.audio(audio_bytes, format="audio/wav")
    st.balloons()
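
# Usage note: Streamlit apps are launched with `streamlit run`; the filename below
# is an assumption, not part of the original script.
#   streamlit run app.py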