File size: 9,075 Bytes
0e3c4ab
4f563d7
 
 
0e3c4ab
4f563d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8140f15
768e131
976f7e9
4f563d7
 
 
 
 
b8eee6f
976f7e9
768e131
976f7e9
4f563d7
 
 
 
 
b8eee6f
976f7e9
768e131
976f7e9
4f563d7
 
 
 
 
f1b936e
976f7e9
 
4f563d7
 
 
 
768e131
4f563d7
 
 
 
 
 
 
 
 
 
 
 
aa50810
 
 
 
 
 
 
 
0e3c4ab
768e131
4f563d7
e9bd2c8
4f563d7
 
 
 
 
 
 
 
 
 
aa50810
1058cc3
aa50810
b8eee6f
f956ff5
aa50810
 
 
 
0e3c4ab
4f563d7
d35bab6
768e131
7f850fd
aa50810
 
 
 
7f850fd
768e131
98664f0
4f563d7
 
 
 
 
 
 
 
 
 
 
aa50810
 
 
 
 
 
 
 
 
 
0e3c4ab
4f563d7
 
768e131
4f563d7
 
 
 
0e3c4ab
4f563d7
 
 
aa50810
 
 
 
 
 
4f563d7
 
aa50810
 
 
0e3c4ab
 
 
 
 
 
 
 
aa50810
 
0e3c4ab
 
 
 
aa50810
 
 
0e3c4ab
4f563d7
 
aa50810
 
 
 
 
768e131
aa50810
 
 
 
 
 
0e3c4ab
4f563d7
aa50810
 
 
 
 
 
 
4f563d7
 
768e131
 
aa50810
 
 
 
4f563d7
 
 
aa50810
4f563d7
 
 
 
aa50810
 
4f563d7
 
aa50810
 
 
4f563d7
 
 
aa50810
 
 
4f563d7
aa50810
4f563d7
aa50810
 
 
4f563d7
aa50810
4f563d7
aa50810
 
 
4f563d7
aa50810
4f563d7
aa50810
 
4f563d7
 
aa50810
 
 
4f563d7
aa50810
 
 
 
 
 
 
 
 
 
 
 
4f563d7
aa50810
 
 
 
d753fd9
aa50810
 
4f563d7
aa50810
 
 
0e3c4ab
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282

# ======================
# Import Section
# ======================

# Core Libraries
import io # Input/output operations for byte streams

# AI/ML Frameworks
from transformers import pipeline  # Hugging Face transformers pipeline
import torch  # PyTorch tensor operations

# Audio Processing
import soundfile as sf  # Audio file I/O operations

# Image Processing
from PIL import Image  # Image manipulation library

# Data Handling
from datasets import load_dataset  # Hugging Face datasets loader

# Web Interface
import streamlit as st  # Web app framework


# ======================
# Model Loading Functions
# ======================

@st.cache_resource
def load_caption_pipeline():
    """Build the image-captioning pipeline and keep it cached.

    The ``st.cache_resource`` decorator stores the returned object so the
    model is not rebuilt every time this function is called.

    Returns:
        Pipeline: An "image-to-text" pipeline backed by the
            ``Salesforce/blip-image-captioning-large`` checkpoint.
    """
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-large",
        use_fast=True,
    )
    return captioner

@st.cache_resource
def load_story_pipeline():
    """Build the story-writing pipeline and keep it cached.

    The ``st.cache_resource`` decorator stores the returned object so the
    model is not rebuilt every time this function is called.

    Returns:
        Pipeline: A "text-generation" pipeline backed by the
            ``wy2001/storygenratorllama3.21b`` checkpoint (presumably a
            LLaMA-based story model, judging by its name).
    """
    storyteller = pipeline(
        "text-generation",
        model="wy2001/storygenratorllama3.21b",
        use_fast=True,
    )
    return storyteller

@st.cache_resource
def load_tts_pipeline():
    """Build the text-to-speech pipeline and keep it cached.

    The ``st.cache_resource`` decorator stores the returned object so the
    model is not rebuilt every time this function is called.

    Returns:
        Pipeline: A "text-to-speech" pipeline backed by the
            ``microsoft/speecht5_tts`` checkpoint.
    """
    synthesizer = pipeline(
        "text-to-speech",
        model="microsoft/speecht5_tts",
        use_fast=True,
    )
    return synthesizer


# ======================
# Core Processing Functions
# ======================

@st.cache_data(show_spinner=False, max_entries=3)
def generate_image_caption(image: Image.Image) -> str:
    """Describe an image with a single caption.

    Results are cached by ``st.cache_data`` (at most 3 entries).

    Args:
        image (PIL.Image): Picture handed to the captioning pipeline.

    Returns:
        str: The ``generated_text`` of the first result produced by the
            captioning pipeline.

    Note:
        Failures are not re-raised as ordinary exceptions. Any
        ``Exception`` is shown to the user through ``st.error`` and the
        current script run is then halted with ``st.stop()``.
    """
    try:
        # Load (or fetch the cached) model, then run it on the picture.
        results = load_caption_pipeline()(image)
        first_result = results[0]
        return first_result['generated_text']
    except Exception as err:
        reason = str(err)
        st.error(f"πŸ” The caption fairy is confused about the picture!  says: {reason}")
        st.stop()

@st.cache_data(show_spinner=False, max_entries=3)
def generate_story(caption: str) -> str:
    """Turn an image caption into a short story for children.

    Results are cached by ``st.cache_data`` (at most 3 entries).

    Args:
        caption (str): Image description to build the story around.

    Returns:
        str: The ``content`` of the message at index 1 of the first
            generated conversation, i.e. the entry that follows the single
            user prompt. The prompt asks for 60 to 80 words; the only hard
            limit applied here is ``max_new_tokens=200``.

    Note:
        Failures are not re-raised as ordinary exceptions. Any
        ``Exception`` is shown to the user through ``st.error`` and the
        current script run is then halted with ``st.stop()``.
    """
    try:
        prompt = f"Creating a story for 3-10 years old kids about {caption} between 60 to 80 words with friendly words and happy ending. present the story itself only."
        conversation = [{"role": "user", "content": prompt}]
        storyteller = load_story_pipeline()
        generations = storyteller(
            conversation,
            max_new_tokens=200,
            num_return_sequences=1,
        )
        # The pipeline returns the chat history; entry 1 follows the prompt.
        reply = generations[0]['generated_text'][1]
        return reply['content']
    except Exception as err:
        reason = str(err)
        st.error(f"🧚 The writing fairy is sleeping! says: {reason}")
        st.stop()



@st.cache_resource
def load_speaker_embeddings():
    """Load the speaker embedding passed to the text-to-speech pipeline.

    Reads the "validation" split of ``Matthijs/cmu-arctic-xvectors``, takes
    the ``xvector`` field of row 7306 and wraps it in a tensor. The result
    is cached by ``st.cache_resource``.

    Returns:
        torch.Tensor: The selected x-vector with an extra leading
            dimension of size 1 added by ``unsqueeze(0)``.
    """
    xvector_rows = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    chosen_xvector = xvector_rows[7306]["xvector"]
    return torch.tensor(chosen_xvector).unsqueeze(0)

@st.cache_data(show_spinner=False, max_entries=3)
def read_story(story):
    """Synthesize speech for a story and return it as in-memory WAV data.

    Results are cached by ``st.cache_data`` (at most 3 entries).

    Args:
        story (str): Text to be spoken.

    Returns:
        io.BytesIO: WAV-formatted audio, rewound to position 0 so it can
            be read from the start.

    Note:
        Failures are not re-raised as ordinary exceptions. Any
        ``Exception`` is shown to the user through ``st.error`` and the
        current script run is then halted with ``st.stop()``.
    """
    try:
        synthesizer = load_tts_pipeline()
        voice = load_speaker_embeddings()
        result = synthesizer(story, forward_params={"speaker_embeddings": voice})

        # Encode the waveform as WAV into memory instead of a file on disk.
        wav_stream = io.BytesIO()
        sf.write(
            wav_stream,
            result["audio"],
            samplerate=result["sampling_rate"],
            format='WAV',
        )
        wav_stream.seek(0)
        return wav_stream
    except Exception as err:
        reason = str(err)
        st.error(f"πŸ”Š The reading fairy is sneezing! says: {reason}")
        st.stop()




# ======================
# Main Application
# ======================

def main():
    """Render the Streamlit page and run the image -> story -> audio flow.

    Steps performed on each script run:

    1. Configure the page and inject the custom CSS classes.
    2. Show a file uploader in the sidebar (JPG/JPEG/PNG).
    3. If a file was uploaded, reject it when it is larger than 1 MiB or
       when its MIME type is not ``image/jpeg`` / ``image/png``.
    4. Otherwise caption the image, generate a story from the caption,
       synthesize audio for the story, and show the story together with an
       audio player and a download button.
    5. If nothing was uploaded, show usage guidance instead.

    Any ``Exception`` raised during step 4 is reported with ``st.error`` and
    the script run is halted with ``st.stop()``.
    """
    # Function-local import: only needed to escape the generated story below.
    import html

    # Configure page settings
    st.set_page_config(
        page_title="Magic Story Time",
        page_icon="🧚",
        layout="centered",
        initial_sidebar_state="expanded",
    )

    # Custom CSS styling
    st.markdown("""
    <style>
    .story-box {
    background: linear-gradient(145deg, #fff1eb 0%, #ace0f9 100%);
    border-radius: 15px;
    padding: 25px;
    font-size: 1.1em;
    line-height: 1.8;
    color: #2c3e50;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    margin: 20px 0;
    }
    .upload-section {
    border: 2px dashed #4CAF50;
    border-radius: 10px;
    padding: 20px;
    background: rgba(76, 175, 80, 0.05);
    }
    </style>
    """, unsafe_allow_html=True)

    # Sidebar - Image Upload
    with st.sidebar:
        st.header("πŸ–ΌοΈ Upload Your Magic Drawing Paper")
        uploaded_image = st.file_uploader(
            label="Upload an image",
            type=["jpg", "jpeg", "png"],
            help="Format in JPEG/JPG/PNG, max 1MB",
            key="image_uploader",
            accept_multiple_files=False
        )

        if uploaded_image:
            st.success(f"πŸ” The caption fairy received your image: {uploaded_image.name}")

    # Main Content Area
    # App title
    st.title("🧚 Magic Story Camp")
    st.markdown("---")

    # input validation
    if uploaded_image:

        # Validate file specifications (size limit is 1 MiB)
        if uploaded_image.size > 1024 * 1024:
            st.error("πŸ” The caption fairy says the image is too big! please give me image under 1MB")
            st.stop()
        if uploaded_image.type not in ["image/jpeg", "image/png"]:
            st.error("πŸ” The caption fairy says only JPG/PNG allowed!")
            st.stop()

        # Processing pipeline
        with st.spinner("πŸ§™ The fairies are casting magic spells, it may take some time⏳..."):
            try:
                # Convert to RGB format for model compatibility
                image = Image.open(uploaded_image).convert("RGB")

                # Display processing UI elements
                status_display = st.empty()
                progress_bar = st.progress(0)

                # Image preview expander
                with st.expander("view the image", expanded=True):
                    st.image(image, use_container_width=True)

                # Processing stages
                # Stage 1: Image Captioning
                status_display.markdown("πŸ” **The caption fairy is viewing the image...**")
                progress_bar.progress(25)
                caption = generate_image_caption(image)

                # Stage 2: Story Generation
                status_display.markdown("🧚 **The writing fairy is writing the story...**")
                progress_bar.progress(50)
                story = generate_story(caption)

                # Stage 3: Audio Synthesis
                status_display.markdown("πŸ”Š **The reading fairy is preparing audio magic...**")
                progress_bar.progress(75)
                speech = read_story(story)

                # Finish
                progress_bar.progress(100)
                status_display.markdown("🧚 **The Story is ready!**")

                # Display formatted story.
                # The story is model output and is rendered with
                # unsafe_allow_html=True, so escape &, < and > first; otherwise
                # any markup the model emits would be injected into the page.
                safe_story = html.escape(story, quote=False)
                st.markdown("### πŸ“– Your Magic story")
                st.markdown(f'<div class="story-box">{safe_story}</div>', unsafe_allow_html=True)

                # Audio playback and download
                st.audio(speech, format="audio/wav")
                st.download_button(
                    "🎡 Download Story",
                    data=speech,
                    file_name="magic_story.wav",
                    mime="audio/wav",
                    help="click to download your story"
                )
            except Exception as e:
                st.error(f"πŸ’₯ The magic spell broke! Please try again. {str(e)}")
                st.stop()
    else:
        # Page instructions
        st.markdown("""
        <div class="upload-section">
          <h3 style="color:#4CAF50; text-align:center;">❓ guidance</h3>
          1. πŸ–ΌοΈ Upload Your Picture in the sidebar<br>
          2. Wait for the magic sparkles may take 10 min βœ¨οΌ‰<br>
          3. Read/listen to your story and download with 🎡 button!<br>
          <br>
          Note: First-time model loading may take longer.<br> 
          Please have a glass of juice and be patient for a few moments<br>
        </div>
        """, unsafe_allow_html=True)

# Script entry point: build the UI only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()