# Source page: Spaces: IrisDeng / UST_Deep_learning_asm1 (status: Running)
# File size: 2,813 Bytes
# Revision hashes that appeared alongside the code on that page:
#   a4a0cba, 8948605, 6150d17, f1d429a, 16f5ed7, 2e3e400, 8498664, 75961c1

import tempfile
from pathlib import Path

import streamlit as st
from gtts import gTTS
from PIL import Image
from transformers import pipeline

st.set_page_config(page_title="Image to Audio Story", page_icon="🦜")


# Streamlit re-executes this script from the top on every user interaction.
# Building the pipelines through a cached factory means each model is
# constructed once per server process instead of once per rerun.
@st.cache_resource(show_spinner=False)
def _load_pipeline(task, model_name):
    """Build (or fetch from Streamlit's resource cache) a transformers pipeline.

    Args:
        task: Pipeline task name passed to ``transformers.pipeline``.
        model_name: Model identifier passed as the ``model`` argument.

    Returns:
        The pipeline object created by ``transformers.pipeline``.
    """
    return pipeline(task, model=model_name)


caption_pipeline = _load_pipeline("image-to-text", "Salesforce/blip-image-captioning-base")
story_pipeline = _load_pipeline("text-generation", "Qwen/Qwen2.5-0.5B-Instruct")


def extract_image_caption(image_data):
    """Caption an image with the module-level ``caption_pipeline``.

    Args:
        image_data: Anything ``PIL.Image.open`` accepts (a path or a
            file-like object).

    Returns:
        The ``generated_text`` field of the first result the pipeline returns.
    """
    first_result = caption_pipeline(Image.open(image_data))[0]
    return first_result['generated_text']


def compose_story_from_caption(caption_detail):
    """Generate a short children's story from an image caption.

    The caption is embedded in a fixed storytelling prompt and passed to the
    module-level ``story_pipeline``. Only the text generated after the prompt
    is returned.

    Args:
        caption_detail: Caption text describing the image.

    Returns:
        The generated continuation with surrounding whitespace stripped.
        Generation is capped at 150 new tokens, so the text may stop
        mid-sentence, and it may be empty if the model produces nothing.
    """
    prompt_text = (
        "You are a talented and imaginative storyteller for children aged 3 to 10. "
        "Using the details derived from the image below, craft a complete and captivating tale that includes three main characters, "
        "an adventurous journey, and delightful surprises. "
        "Your story should have a clear beginning, middle, and end, and be between 80 and 100 words in length.\n\n"
        f"Image Details: {caption_detail}\n\nStory:"
    )

    # Ask for the continuation only. Cutting the echoed prompt off by searching
    # for "Story:" breaks when the caption itself contains that marker, because
    # the split then happens inside the prompt rather than at its end.
    story_results = story_pipeline(
        prompt_text,
        num_return_sequences=1,
        max_new_tokens=150,
        return_full_text=False,
    )
    return story_results[0]['generated_text'].strip()
    
def convert_text_to_audio(text_content, audio_path="output.mp3"):
    """Convert text to an audio file.

    Args:
        text_content: Text to be spoken; synthesized with gTTS using
            ``lang="en"``.
        audio_path: Destination the audio is saved to.

    Returns:
        ``audio_path``, unchanged, so the caller can hand it to a player.
    """
    gTTS(text=text_content, lang="en").save(audio_path)
    return audio_path


def run_app():
    """Render the page and run the image -> caption -> story -> audio flow.

    Shows the title and an uploader. Once an image is uploaded it is
    displayed, captioned with ``extract_image_caption``, turned into a story
    with ``compose_story_from_caption``, and the story is converted to speech
    with ``convert_text_to_audio`` and offered for playback.
    """
    st.markdown("<h1 style='text-align: center;'>Your Image to Audio Story 🦜</h1>", unsafe_allow_html=True)
    st.write("Upload an image below and we will generate an engaging story from the picture, then convert the story into an audio playback!")

    uploaded_image = st.file_uploader("Select an Image", type=["png", "jpg", "jpeg"])
    if uploaded_image is None:
        # Nothing to process until the user picks a file.
        return

    image_display = Image.open(uploaded_image)
    st.image(image_display, caption="Uploaded Image", use_container_width=True)

    with st.spinner("Generating caption for the image..."):
        caption_text = extract_image_caption(uploaded_image)
    st.write("**Generated Caption:**", caption_text)

    with st.spinner("Composing story..."):
        story_text = compose_story_from_caption(caption_text)
    st.write("**Story:**")
    st.write(story_text)

    # gTTS refuses empty input, so stop with a readable message instead of
    # letting the speech step fail with a traceback.
    if not story_text.strip():
        st.warning("No story text was generated, so there is nothing to convert to audio.")
        return

    with st.spinner("Converting text to audio..."):
        # Write to a per-run scratch directory rather than a fixed file name in
        # the working directory, which every session of the app would share.
        with tempfile.TemporaryDirectory() as scratch_dir:
            audio_file = convert_text_to_audio(
                story_text, audio_path=str(Path(scratch_dir) / "story.mp3")
            )
            # Read the bytes before the directory is removed on exit.
            audio_bytes = Path(audio_file).read_bytes()
    st.audio(audio_bytes, format="audio/mp3")


if __name__ == "__main__":
    run_app()