# ======================================
# Package Import
# ======================================

import streamlit as st
from PIL import Image
from transformers import pipeline

# ======================================
# Basic Initialization
# ======================================

# Initialize image captioning pipeline with pretrained model
_image_caption_pipeline = pipeline(
    task="image-to-text",
    model="cnmoro/tiny-image-captioning"
)
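# Note: the image-to-text pipeline returns a list of dictionaries, e.g.
# [{"generated_text": "a dog running through a field"}]; generate_image_caption()
# below extracts the caption string from the first entry.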

# Initialize text generation pipeline; max_new_tokens caps the length of each generated story
_text_generation_pipeline = pipeline(
    task="text-generation",
    model="Qwen/Qwen3-0.6B",
    max_new_tokens=100
)

# Initialize TTS components once to avoid reloading
_SPEECH_PIPELINE = pipeline("text-to-speech", model="facebook/mms-tts-eng")
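# Note: the text-to-speech pipeline returns a dict holding the raw waveform and its
# sample rate, e.g. {"audio": np.ndarray, "sampling_rate": 16000}; main() passes both
# straight to st.audio() rather than writing an audio file to disk.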

# ======================================
# Function Definitions
# ======================================

def generate_image_caption(input_image):
    """
    Generate a textual description for an input image using a pretrained model.
    
    Args:
        input_image (Union[PIL.Image.Image, str]): Image to process. Can be either:
            - A PIL Image object
            - A string containing a filesystem path to an image file
    
    Returns:
        str: Generated caption text in natural language
        
    Example:
        >>> from PIL import Image
        >>> img = Image.open("photo.jpg")
        >>> caption = generate_image_caption(img)
        >>> print(f"Caption: {caption}")
    """
    # Process image through the captioning pipeline
    inference_results = _image_caption_pipeline(input_image)
    
    # Extract text from the first (and only) result dictionary
    caption_text = inference_results[0]['generated_text']
    
    return caption_text

def generate_story_content(system_prompt: str, user_prompt: str) -> str:
    """
    Generate a short story based on the provided system and user prompts.
    
    Args:
        system_prompt: Defines the assistant's role and writing constraints
        user_prompt: Describes the story scenario and specific elements to include
        
    Returns:
        Generated story text without any thinking process metadata
        
    Raises:
        RuntimeError: If text generation fails at any stage
    
    Example:
        >>> story = generate_story_content(
        ...     "You are a fantasy writer. Create a 100-word adventure story about ",
        ...     "a dog running through a sunny meadow"
        ... )
    """
    try:
        # Prepare the chat message structure; "/no_think" disables Qwen3's
        # thinking mode so the reply contains only the story text
        conversation_history = [
            {"role": "user", "content": system_prompt + user_prompt + "/no_think"},
        ]

        # Generate the story
        story = _text_generation_pipeline(conversation_history)

        # Extract the assistant reply (the last message in the returned chat)
        story_result = story[0]["generated_text"][-1]["content"]

        # Strip the empty <think></think> block Qwen3 still emits in /no_think mode
        return story_result.split("</think>")[-1].strip()
        
    except Exception as error:
        raise RuntimeError(f"Story generation failed: {str(error)}") from error

def generate_audio_from_story(story_text: str) -> dict:
    """
    Convert story text to speech audio using text-to-speech synthesis.
    
    Args:
        story_text: Input story text to synthesize
        
    Returns:
        Pipeline output dict containing the synthesized waveform ("audio")
        and its sample rate ("sampling_rate")
        
    Raises:
        ValueError: For empty/invalid input text
        RuntimeError: If audio generation fails
        
    Example:
        >>> speech = generate_audio_from_story("Children playing in the park")
        >>> sorted(speech.keys())
        ['audio', 'sampling_rate']
    """
    # Validate input text
    if not isinstance(story_text, str) or not story_text.strip():
        raise ValueError("Input story text must be a non-empty string")
    
    try:
        # Generate speech waveform from the input text
        speech_output = _SPEECH_PIPELINE(story_text)

        return speech_output
        
    except Exception as error:
        raise RuntimeError(f"Audio synthesis failed: {str(error)}") from error

# ======================================
# Main Application Interface
# ======================================
def main():
    """Main application interface for Streamlit"""
    # Page configuration
    st.set_page_config(
        page_title="Fantasy Adventure Generator",
        layout="wide",
        initial_sidebar_state="collapsed"
    )

    
    # Title and description
    st.title("🧙‍♂️ Fantasy Adventure Story Generator")
    st.markdown("""
    Upload an image and get:
    - Automatic scene description
    - AI-generated adventure story
    - Audio version of the story
    """)

    # Help section
    st.markdown("---")
    st.subheader("🌟 How to Use:")
    st.info("""
    1. Upload any picture (animals, nature, or people work best!)
    2. Click the Generate Story & Audio button
    3. Wait for image analysis to complete
    4. Enjoy your story and audio!
    """)    
    
    # File uploader
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
    
    if uploaded_file is not None:
        # Process image
        image = Image.open(uploaded_file).convert("RGB")
        
        # Layout columns
        col1, col2 = st.columns(2)
        
        with col1:
            st.image(image, caption="Uploaded Image", use_container_width=True)
        
        # Generation button
        if st.button("✨ Generate Story & Audio"):
            with st.spinner("Processing your request..."):
                # Generate outputs and display results
                with col2:
                    st.subheader("🔍 Scene Description")
                    with st.spinner("Generating scene description..."):
                        caption = generate_image_caption(image)
                        st.write(caption)
                    
                    st.subheader("📖 Generated Story")
                    with st.spinner("Preparing story..."):
                        sys_prompt = "You are a fantasy writer. Create a 100-word adventure story about "
                        story = generate_story_content(sys_prompt, caption)
                        st.write(story)
                    
                    st.subheader("🔊 Audio Playback")
                    with st.spinner("Preparing speech..."):
                        speech = generate_audio_from_story(story)
                        st.audio(speech["audio"], sample_rate=speech["sampling_rate"], format='audio/wav')
            
if __name__ == "__main__":
    main()