Spaces:
Sleeping
Sleeping
File size: 9,075 Bytes
0e3c4ab 4f563d7 0e3c4ab 4f563d7 8140f15 768e131 976f7e9 4f563d7 b8eee6f 976f7e9 768e131 976f7e9 4f563d7 b8eee6f 976f7e9 768e131 976f7e9 4f563d7 f1b936e 976f7e9 4f563d7 768e131 4f563d7 aa50810 0e3c4ab 768e131 4f563d7 e9bd2c8 4f563d7 aa50810 1058cc3 aa50810 b8eee6f f956ff5 aa50810 0e3c4ab 4f563d7 d35bab6 768e131 7f850fd aa50810 7f850fd 768e131 98664f0 4f563d7 aa50810 0e3c4ab 4f563d7 768e131 4f563d7 0e3c4ab 4f563d7 aa50810 4f563d7 aa50810 0e3c4ab aa50810 0e3c4ab aa50810 0e3c4ab 4f563d7 aa50810 768e131 aa50810 0e3c4ab 4f563d7 aa50810 4f563d7 768e131 aa50810 4f563d7 aa50810 4f563d7 aa50810 4f563d7 aa50810 4f563d7 aa50810 4f563d7 aa50810 4f563d7 aa50810 4f563d7 aa50810 4f563d7 aa50810 4f563d7 aa50810 4f563d7 aa50810 4f563d7 aa50810 4f563d7 aa50810 4f563d7 aa50810 d753fd9 aa50810 4f563d7 aa50810 0e3c4ab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 |
# ======================
# Import Section
# ======================
# Core Libraries
import io # Input/output operations for byte streams
# AI/ML Frameworks
from transformers import pipeline # Hugging Face transformers pipeline
import torch # PyTorch tensor operations
# Audio Processing
import soundfile as sf # Audio file I/O operations
# Image Processing
from PIL import Image # Image manipulation library
# Data Handling
from datasets import load_dataset # Hugging Face datasets loader
# Web Interface
import streamlit as st # Web app framework
# ======================
# Model Loading Functions
# ======================
@st.cache_resource
def load_caption_pipeline():
    """Create the image-captioning pipeline (cached via ``st.cache_resource``).

    Returns:
        Pipeline: ``image-to-text`` pipeline built from the
            ``Salesforce/blip-image-captioning-large`` checkpoint.
    """
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-large",
        use_fast=True,
    )
    return captioner
@st.cache_resource
def load_story_pipeline():
    """Create the story-writing pipeline (cached via ``st.cache_resource``).

    Returns:
        Pipeline: ``text-generation`` pipeline built from the
            ``wy2001/storygenratorllama3.21b`` checkpoint.
    """
    storyteller = pipeline(
        "text-generation",
        model="wy2001/storygenratorllama3.21b",
        use_fast=True,
    )
    return storyteller
@st.cache_resource
def load_tts_pipeline():
    """Create the speech-synthesis pipeline (cached via ``st.cache_resource``).

    Returns:
        Pipeline: ``text-to-speech`` pipeline built from the
            ``microsoft/speecht5_tts`` checkpoint.
    """
    synthesizer = pipeline(
        "text-to-speech",
        model="microsoft/speecht5_tts",
        use_fast=True,
    )
    return synthesizer
# ======================
# Core Processing Functions
# ======================
@st.cache_data(show_spinner=False, max_entries=3)
def generate_image_caption(image: Image.Image) -> str:
    """Describe an image in words using the cached captioning pipeline.

    Results are memoised by ``st.cache_data`` (at most 3 entries, no spinner).

    Args:
        image (PIL.Image): Image handed to the captioning pipeline.

    Returns:
        str: The ``generated_text`` field of the first pipeline result.

    Note:
        Any ``Exception`` raised while captioning is reported to the page
        with ``st.error`` and followed by ``st.stop()``; it is not re-raised.
    """
    try:
        captioner = load_caption_pipeline()
        predictions = captioner(image)
        # Only the first candidate caption is used.
        best_prediction = predictions[0]
        return best_prediction['generated_text']
    except Exception as e:
        st.error(f"π The caption fairy is confused about the picture! says: {str(e)}")
        st.stop()
@st.cache_data(show_spinner=False, max_entries=3)
def generate_story(caption: str) -> str:
    """Turn an image caption into a short story for children.

    Results are memoised by ``st.cache_data`` (at most 3 entries, no spinner).

    Args:
        caption (str): Image description embedded into the prompt.

    Returns:
        str: The ``content`` of the second message in the first generated
            conversation (the reply that follows the single user message).

    Note:
        Any ``Exception`` raised while generating is reported to the page
        with ``st.error`` and followed by ``st.stop()``; it is not re-raised.
    """
    try:
        prompt = f"Creating a story for 3-10 years old kids about {caption} between 60 to 80 words with friendly words and happy ending. present the story itself only."
        conversation = [{"role": "user", "content": prompt}]
        storyteller = load_story_pipeline()
        results = storyteller(
            conversation,
            max_new_tokens=200,
            num_return_sequences=1,
        )
        # generated_text holds the chat history: index 0 is the user
        # prompt sent above, index 1 is the model's reply.
        chat_history = results[0]['generated_text']
        reply = chat_history[1]
        return reply['content']
    except Exception as e:
        st.error(f"π§ The writing fairy is sleeping! says: {str(e)}")
        st.stop()
@st.cache_resource
def load_speaker_embeddings():
    """Load the speaker x-vector passed to the text-to-speech pipeline.

    Reads the ``validation`` split of ``Matthijs/cmu-arctic-xvectors`` and
    uses the ``xvector`` field of entry 7306.

    Returns:
        torch.Tensor: The x-vector with an extra leading dimension added
            by ``unsqueeze(0)``.
    """
    xvector_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    xvector = xvector_dataset[7306]["xvector"]
    voice = torch.tensor(xvector)
    return voice.unsqueeze(0)
@st.cache_data(show_spinner=False, max_entries=3)
def read_story(story):
    """Synthesize speech for a story and return it as in-memory WAV data.

    Results are memoised by ``st.cache_data`` (at most 3 entries, no spinner).

    Args:
        story (str): Text handed to the text-to-speech pipeline.

    Returns:
        io.BytesIO: WAV-formatted audio, rewound to position 0.

    Note:
        Any ``Exception`` raised while synthesizing is reported to the page
        with ``st.error`` and followed by ``st.stop()``; it is not re-raised.
    """
    try:
        synthesizer = load_tts_pipeline()
        voice = load_speaker_embeddings()
        result = synthesizer(story, forward_params={"speaker_embeddings": voice})
        wav_buffer = io.BytesIO()
        sf.write(
            wav_buffer,
            result["audio"],
            samplerate=result["sampling_rate"],
            format='WAV',
        )
        # Rewind so consumers read from the start of the audio.
        wav_buffer.seek(0)
        return wav_buffer
    except Exception as e:
        st.error(f"π The reading fairy is sneezing! says: {str(e)}")
        st.stop()
# ======================
# Main Application
# ======================
def main():
    """Render the page and run the image -> caption -> story -> audio flow.

    Steps:
        1. Configure the page and inject the custom CSS.
        2. Show the sidebar uploader (JPG/JPEG/PNG).
        3. With an upload present: reject files over 1 MiB or with a MIME
           type other than ``image/jpeg`` / ``image/png``, then caption the
           image, write the story, synthesize audio, and show the story
           with audio playback and a download button.
        4. Without an upload: show the usage instructions.

    The story is model-generated text, so it is HTML-escaped before being
    placed inside the ``story-box`` div that is rendered with
    ``unsafe_allow_html=True``.
    """
    # Function-scope import: html is stdlib and only needed here.
    import html

    # Configure page settings
    st.set_page_config(
        page_title="Magic Story Time",
        page_icon="π§",
        layout="centered",
        initial_sidebar_state="expanded"
    )
    # Custom CSS styling
    st.markdown("""
<style>
.story-box {
background: linear-gradient(145deg, #fff1eb 0%, #ace0f9 100%);
border-radius: 15px;
padding: 25px;
font-size: 1.1em;
line-height: 1.8;
color: #2c3e50;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
margin: 20px 0;
}
.upload-section {
border: 2px dashed #4CAF50;
border-radius: 10px;
padding: 20px;
background: rgba(76, 175, 80, 0.05);
}
</style>
""", unsafe_allow_html=True)
    # Sidebar - Image Upload
    with st.sidebar:
        st.header("πΌοΈ Upload Your Magic Drawing Paper")
        uploaded_image = st.file_uploader(
            label="Upload an image",
            type=["jpg", "jpeg", "png"],
            help="Format in JPEG/JPG/PNG, max 1MB",
            key="image_uploader",
            accept_multiple_files=False
        )
        if uploaded_image:
            st.success(f"π The caption fairy received your image: {uploaded_image.name}")
    # Main Content Area
    # App title
    st.title("π§ Magic Story Camp")
    st.markdown("---")
    # input validation
    if uploaded_image:
        # Validate file specifications (size limit is 1 MiB)
        if uploaded_image.size > 1024 * 1024:
            st.error("π The caption fairy says the image is too big! please give me image under 1MB")
            st.stop()
        if uploaded_image.type not in ["image/jpeg", "image/png"]:
            st.error("π The caption fairy says only JPG/PNG allowed!")
            st.stop()
        # Processing pipeline
        with st.spinner("π§ The fairies are casting magic spells, it may take some timeβ³..."):
            try:
                # Convert to RGB format for model compatibility
                image = Image.open(uploaded_image).convert("RGB")
                # Display processing UI elements
                status_display = st.empty()
                progress_bar = st.progress(0)
                # Image preview expander
                with st.expander("view the image", expanded=True):
                    st.image(image, use_container_width=True)
                # Processing stages
                # Stage 1: Image Captioning
                status_display.markdown("π **The caption fairy is viewing the image...**")
                progress_bar.progress(25)
                caption = generate_image_caption(image)
                # Stage 2: Story Generation
                status_display.markdown("π§ **The writing fairy is writing the story...**")
                progress_bar.progress(50)
                story = generate_story(caption)
                # Stage 3: Audio Synthesis
                status_display.markdown("π **The reading fairy is preparing audio magic...**")
                progress_bar.progress(75)
                speech = read_story(story)
                # Finish
                progress_bar.progress(100)
                status_display.markdown("π§ **The Story is ready!**")
                # Display formatted story.  The text comes from a language
                # model, so escape it before embedding it in raw HTML;
                # otherwise '<', '>' or '&' could break or inject markup.
                safe_story = html.escape(story)
                st.markdown("### π Your Magic story")
                st.markdown(f'<div class="story-box">{safe_story}</div>', unsafe_allow_html=True)
                # Audio playback and download
                st.audio(speech, format="audio/wav")
                st.download_button(
                    "π΅ Download Story",
                    data=speech,
                    file_name="magic_story.wav",
                    mime="audio/wav",
                    help="click to download your story"
                )
            except Exception as e:
                st.error(f"π₯ The magic spell broke! Please try again. {str(e)}")
                st.stop()
    else:
        # Page instructions
        st.markdown("""
<div class="upload-section">
<h3 style="color:#4CAF50; text-align:center;">β guidance</h3>
1. πΌοΈ Upload Your Picture in the sidebar<br>
2. Wait for the magic sparkles may take 10 min β¨οΌ<br>
3. Read/listen to your story and download with π΅ button!<br>
<br>
Note: First-time model loading may take longer.<br>
Please have a glass of juice and be patient for a few moments<br>
</div>
""", unsafe_allow_html=True)
# Run the app only when this file is executed as the entry script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|