Spaces:

IrisDeng
/

UST_Deep_learning_asm1

Running

File size: 2,945 Bytes

a4a0cba
 
 
 
 
 
 
f1d429a
a4a0cba
16f5ed7
 
 
f1d429a
16f5ed7
 
 
 
f1d429a
16f5ed7
 
 
a4a0cba
 
16f5ed7
 
 
 
 
 
 
f1d429a
 
 
 
 
 
 
 
 
16f5ed7
 
 
 
 
 
 
 
a4a0cba
 
16f5ed7
 
 
f1d429a
 
 
a4a0cba
16f5ed7
a4a0cba
f1d429a
 
16f5ed7
f1d429a
16f5ed7
f1d429a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16f5ed7
75961c1

import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS

# Set the page title and page icon for this Streamlit app.
st.set_page_config(page_title="Image to Audio Story", page_icon="🦜")


def _load_caption_pipeline():
    """
    Build the BLIP image-captioning pipeline (expensive: loads model weights).
    """
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
    )


def extract_image_caption(image_data):
    """
    Generate a descriptive caption for an image using a pretrained model.

    Args:
        image_data: A path or binary file-like object accepted by
            ``PIL.Image.open`` (for example a Streamlit uploaded file).

    Returns:
        The ``generated_text`` of the first result returned by the
        image-to-text pipeline.
    """
    # Streamlit re-executes the script on every user interaction, so building
    # the pipeline inline would reload the model weights each time.
    # st.cache_resource keeps a single shared instance alive across reruns.
    # The caller already shows its own spinner, so the built-in one is off.
    cached_loader = st.cache_resource(show_spinner=False)(_load_caption_pipeline)
    caption_pipeline = cached_loader()

    img_obj = Image.open(image_data)
    caption_results = caption_pipeline(img_obj)
    return caption_results[0]['generated_text']


def _load_story_pipeline():
    """
    Build the Qwen2 text-generation pipeline (expensive: loads model weights).
    """
    return pipeline(
        "text-generation",
        model="Qwen/Qwen2-1.5B",
    )


def compose_story_from_caption(caption_detail, max_new_tokens=400):
    """
    Write a creative children's story based on an image caption.

    Args:
        caption_detail: Caption text describing the image; it is embedded
            in the prompt sent to the language model.
        max_new_tokens: Upper bound on the number of tokens generated for
            the story. The default leaves room for roughly the 300 words
            the prompt asks for (approximate; token/word ratio varies).

    Returns:
        The generated story text with surrounding whitespace removed.
    """
    # Streamlit re-executes the script on every user interaction, so building
    # the pipeline inline would reload the model weights each time.
    # st.cache_resource keeps a single shared instance alive across reruns.
    cached_loader = st.cache_resource(show_spinner=False)(_load_story_pipeline)
    story_pipeline = cached_loader()

    prompt_text = (
        "You are a talented and imaginative storyteller for children aged 3 to 10. "
        "Using the details derived from the image below, craft a captivating tale that goes beyond merely describing the scene. "
        "Let your creativity shine by introducing engaging characters, adventurous journeys, and delightful surprises. "
        "Your story should be vivid, original, and between 100 and 300 words in length.\n\n"
        f"Image Details: {caption_detail}\n\nStory:"
    )
    story_results = story_pipeline(
        prompt_text,
        num_return_sequences=1,
        # Bound the output explicitly instead of relying on whatever default
        # length the library/model configuration happens to use.
        max_new_tokens=max_new_tokens,
        # Ask the pipeline for the continuation only. Splitting the full text
        # on "Story:" would cut at the wrong place if the caption itself
        # contained that marker.
        return_full_text=False,
    )
    return story_results[0]['generated_text'].strip()


def convert_text_to_audio(text_content, audio_path="output.mp3"):
    """
    Convert text to English speech and save it as an audio file.

    Args:
        text_content: The text to be spoken.
        audio_path: Destination path of the audio file that is written.

    Returns:
        ``audio_path``, unchanged, once the file has been saved.

    Raises:
        ValueError: If ``text_content`` is ``None``, empty, or contains only
            whitespace.
    """
    # Reject blank input up front with a clear error; otherwise the failure
    # surfaces inside the TTS library as an opaque assertion.
    if not text_content or not text_content.strip():
        raise ValueError("text_content must contain text to convert to audio")

    tts_engine = gTTS(text=text_content, lang="en")
    tts_engine.save(audio_path)
    return audio_path


def run_app():
    """
    Render the page: upload an image, caption it, turn the caption into a
    story, and play the story back as audio.

    Does nothing beyond drawing the header and uploader until a file has
    been selected. If the generated story is empty, a warning is shown and
    the audio step is skipped.
    """
    st.markdown("<h1 style='text-align: center;'>Your Image to Audio Story 🦜</h1>", unsafe_allow_html=True)
    st.write("Upload an image below and we will generate an engaging story from the picture, then convert the story into an audio playback!")

    uploaded_image = st.file_uploader("Select an Image", type=["png", "jpg", "jpeg"])

    # Guard clause: nothing more to render until the user picks a file.
    if uploaded_image is None:
        return

    image_display = Image.open(uploaded_image)
    st.image(image_display, caption="Uploaded Image", use_container_width=True)

    with st.spinner("Generating caption for the image..."):
        caption_text = extract_image_caption(uploaded_image)
    st.write("**Generated Caption:**", caption_text)

    with st.spinner("Composing story..."):
        story_text = compose_story_from_caption(caption_text)

    # The text generator may return an empty continuation. Passing blank text
    # to the text-to-speech step would fail, so stop with a visible message.
    if not story_text or not story_text.strip():
        st.warning("No story text was generated for this image, so there is nothing to read aloud.")
        return

    st.write("**Story:**")
    st.write(story_text)

    with st.spinner("Converting text to audio..."):
        audio_file = convert_text_to_audio(story_text)
    st.audio(audio_file, format="audio/mp3")


# Start the app only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_app()