# ======================================
# Package Import
# ======================================

import streamlit as st
from PIL import Image
from transformers import pipeline

# ======================================
# Basic Initialization
# ======================================

# Initialize image captioning pipeline with pretrained model
_image_caption_pipeline = pipeline(
    task="image-to-text",
    model="cnmoro/tiny-image-captioning"
)
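# Note: the image-to-text pipeline returns a list of dictionaries, e.g.
# [{"generated_text": "a dog running through a field"}]; generate_image_caption()
# below extracts the caption string from the first entry.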

# Initialize text generation pipeline; max_new_tokens caps the length of each generated story
_text_generation_pipeline = pipeline(
    task="text-generation",
    model="Qwen/Qwen3-0.6B",
    max_new_tokens=100
)

# Initialize TTS components once to avoid reloading
_SPEECH_PIPELINE = pipeline("text-to-speech", model="facebook/mms-tts-eng")
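# Note: the text-to-speech pipeline returns a dict holding the raw waveform and its
# sample rate, e.g. {"audio": np.ndarray, "sampling_rate": 16000}; main() passes both
# straight to st.audio() rather than writing an audio file to disk.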

# ======================================
# Function Definitions
# ======================================

def generate_image_caption(input_image):
    """
    Generate a textual description for an input image using a pretrained model.
    
    Args:
        input_image (Union[PIL.Image.Image, str]): Image to process. Can be either:
            - A PIL Image object
            - A string containing a filesystem path to an image file
    
    Returns:
        str: Generated caption text in natural language
        
    Example:
        >>> from PIL import Image
        >>> img = Image.open("photo.jpg")
        >>> caption = generate_image_caption(img)
        >>> print(f"Caption: {caption}")
    """
    # Process image through the captioning pipeline
    inference_results = _image_caption_pipeline(input_image)
    
    # Extract text from the first (and only) result dictionary
    caption_text = inference_results[0]['generated_text']
    
    return caption_text

def generate_story_content(system_prompt: str, user_prompt: str) -> str:
    """
    Generate a short story based on the provided system and user prompts.
    
    Args:
        system_prompt: Defines the assistant's role and writing constraints
        user_prompt: Describes the story scenario and specific elements to include
        
    Returns:
        Generated story text without any thinking process metadata
        
    Raises:
        RuntimeError: If text generation fails at any stage
    
    Example:
        >>> story = generate_story_content(
        ...     "You are a fantasy writer. Create a 100-word adventure story about ",
        ...     "a dog running through a sunny meadow"
        ... )
    """
    try:
        # Prepare the chat message structure; "/no_think" disables Qwen3's
        # thinking mode so the reply contains only the story text
        conversation_history = [
            {"role": "user", "content": system_prompt + user_prompt + "/no_think"},
        ]

        # Generate the story
        story = _text_generation_pipeline(conversation_history)

        # Extract the assistant reply (the last message in the returned chat)
        story_result = story[0]["generated_text"][-1]["content"]

        # Strip the empty <think></think> block Qwen3 still emits in /no_think mode
        return story_result.split("</think>")[-1].strip()
        
    except Exception as error:
        raise RuntimeError(f"Story generation failed: {str(error)}") from error

def generate_audio_from_story(story_text: str) -> dict:
    """
    Convert story text to speech audio using text-to-speech synthesis.
    
    Args:
        story_text: Input story text to synthesize
        
    Returns:
        Pipeline output dict containing the synthesized waveform ("audio")
        and its sample rate ("sampling_rate")
        
    Raises:
        ValueError: For empty/invalid input text
        RuntimeError: If audio generation fails
        
    Example:
        >>> speech = generate_audio_from_story("Children playing in the park")
        >>> sorted(speech.keys())
        ['audio', 'sampling_rate']
    """
    # Validate input text
    if not isinstance(story_text, str) or not story_text.strip():
        raise ValueError("Input story text must be a non-empty string")
    
    try:
        # Generate speech waveform from the input text
        speech_output = _SPEECH_PIPELINE(story_text)

        return speech_output
        
    except Exception as error:
        raise RuntimeError(f"Audio synthesis failed: {str(error)}") from error

# ======================================
# Main Application Interface
# ======================================
def main():
    """Main application interface for Streamlit"""
    # Page configuration
    st.set_page_config(
        page_title="Fantasy Adventure Generator",
        layout="wide",
        initial_sidebar_state="collapsed"
    )

    
    # Title and description
    st.title("🧙‍♂️ Fantasy Adventure Story Generator")
    st.markdown("""
    Upload an image and get:
    - Automatic scene description
    - AI-generated adventure story
    - Audio version of the story
    """)

    # Help section
    st.markdown("---")
    st.subheader("🌟 How to Use:")
    st.info("""
    1. Upload any picture (animals, nature, or people work best!)
    2. Click the Generate Story & Audio button
    3. Wait for image analysis to complete
    4. Enjoy your story and audio!
    """)    
    
    # File uploader
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
    
    if uploaded_file is not None:
        # Process image
        image = Image.open(uploaded_file).convert("RGB")
        
        # Layout columns
        col1, col2 = st.columns(2)
        
        with col1:
            st.image(image, caption="Uploaded Image", use_container_width=True)
        
        # Generation button
        if st.button("✨ Generate Story & Audio"):
            with st.spinner("Processing your request..."):
                # Generate outputs and display results
                with col2:
                    st.subheader("🔍 Scene Description")
                    with st.spinner("Generating scene description..."):
                        caption = generate_image_caption(image)
                        st.write(caption)
                    
                    st.subheader("📖 Generated Story")
                    with st.spinner("Preparing story..."):
                        sys_prompt = "You are a fantasy writer. Create a 100-word adventure story about "
                        story = generate_story_content(sys_prompt, caption)
                        st.write(story)
                    
                    st.subheader("🔊 Audio Playback")
                    with st.spinner("Preparing speech..."):
                        speech = generate_audio_from_story(story)
                        st.audio(speech["audio"], sample_rate=speech["sampling_rate"], format='audio/wav')
            
if __name__ == "__main__":
    main()