# ======================================
# Package Import
# ======================================
import streamlit as st
from PIL import Image
import time
from transformers import pipeline
# ======================================
# Basic Initialization
# ======================================
# Initialize image captioning pipeline with pretrained model
_image_caption_pipeline = pipeline(
    task="image-to-text",
    model="cnmoro/tiny-image-captioning"
)
# Initialize text generation pipeline for story writing
_text_generation_pipeline = pipeline(
    "text-generation",
    model="Qwen/Qwen3-0.6B",
    max_new_tokens=100
)
# Initialize TTS components once to avoid reloading
_SPEECH_PIPELINE = pipeline("text-to-speech", model="facebook/mms-tts-eng")
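
# Note: Streamlit re-runs this script on every user interaction, so the
# module-level pipelines above may be reloaded more often than intended.
# A minimal sketch of caching them instead (assumes a Streamlit version that
# provides st.cache_resource; "load_caption_pipeline" is an illustrative name,
# not part of the original app):
#
#     @st.cache_resource
#     def load_caption_pipeline():
#         return pipeline(task="image-to-text", model="cnmoro/tiny-image-captioning")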
# ======================================
# Function settings
# ======================================
def generate_image_caption(input_image):
"""
Generate a textual description for an input image using a pretrained model.
Args:
input_image (Union[PIL.Image.Image, str]): Image to process. Can be either:
- A PIL Image object
- A string containing a filesystem path to an image file
Returns:
str: Generated caption text in natural language
Example:
>>> from PIL import Image
>>> img = Image.open("photo.jpg")
>>> caption = generate_image_caption(img)
>>> print(f"Caption: {caption}")
"""
# Process image through the captioning pipeline
inference_results = _image_caption_pipeline(input_image)
# Extract text from the first (and only) result dictionary
caption_text = inference_results[0]['generated_text']
return caption_text
def generate_story_content(system_prompt: str, user_prompt: str) -> str:
"""
Generates a children's story based on provided system and user prompts.
Args:
system_prompt: Defines the assistant's role and writing constraints
user_prompt: Describes the story scenario and specific elements to include
Returns:
Generated story text without any thinking process metadata
Raises:
RuntimeError: If text generation fails at any stage
Example:
>>> story = generate_story_content(
... "You are a helpful children's author...",
... "Kids playing with dogs in a sunny meadow..."
... )
"""
try:
# Prepare chat message structure
conversation_history = [
{"role": "user", "content": system_prompt+user_prompt+"/no_think"},
]
# Generate the story
story=_text_generation_pipeline(conversation_history)
# Extract the stroy result
stroy_result=story[0]["generated_text"][1]["content"][19:]
# Process and clean output
return stroy_result
except Exception as error:
raise RuntimeError(f"Story generation failed: {str(error)}") from error
def generate_audio_from_story(story_text: str) -> dict:
    """
    Convert a text story to speech audio using text-to-speech synthesis.

    Args:
        story_text: Input story text to synthesize

    Returns:
        The TTS pipeline output: a dict holding the raw waveform under "audio"
        and its sample rate under "sampling_rate"

    Raises:
        ValueError: For empty/invalid input text
        RuntimeError: If audio generation fails

    Example:
        >>> speech = generate_audio_from_story("Children playing in the park")
        >>> sorted(speech.keys())
        ['audio', 'sampling_rate']
    """
    # Validate input text
    if not isinstance(story_text, str) or not story_text.strip():
        raise ValueError("Input story text must be a non-empty string")

    try:
        # Generate the speech waveform
        speech_output = _SPEECH_PIPELINE(story_text)
        return speech_output
    except Exception as error:
        raise RuntimeError(f"Audio synthesis failed: {str(error)}") from error
# ======================================
# Main Application Interface
# ======================================
def main():
"""Main application interface for Streamlit"""
# Page configuration
st.set_page_config(
page_title="Fantasy Adventure Generator",
layout="wide",
initial_sidebar_state="collapsed"
)
# Title and description
st.title("π§ββοΈ Fantasy Adventure Story Generator")
st.markdown("""
Upload an image and get:
- Automatic scene description
- AI-generated adventure story
- Audio version of the story
""")
# Help section
st.markdown("---")
st.subheader("π How to Use:")
st.info("""
1. Upload any picture (animals, nature, or people work best!)
2. Click the generating button
3. Wait for image analysis to complete
4. Enjoy your story and audio!
""")
    # File uploader
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

    if uploaded_file is not None:
        # Process image
        image = Image.open(uploaded_file).convert("RGB")

        # Layout columns
        col1, col2 = st.columns(2)
        with col1:
            st.image(image, caption="Uploaded Image", use_container_width=True)

        # Generation button
        if st.button("✨ Generate Story & Audio"):
            with st.spinner("Processing your request..."):
                # Generate outputs and display results
                with col2:
                    st.subheader("📝 Scene Description")
                    with st.spinner("Preparing story caption..."):
                        caption = generate_image_caption(image)
                        st.write(caption)

                    st.subheader("📖 Generated Story")
                    with st.spinner("Preparing story..."):
                        sys_prompt = "You are a fantasy writer. Create a 100-word adventure story about "
                        story = generate_story_content(sys_prompt, caption)
                        st.write(story)

                    st.subheader("🔊 Audio Playback")
                    with st.spinner("Preparing speech..."):
                        speech = generate_audio_from_story(story)
                        st.audio(speech["audio"], sample_rate=speech["sampling_rate"], format='audio/wav')
if __name__ == "__main__":
    main()