import streamlit as st
from PIL import Image
import io
import soundfile as sf
import numpy as np
import torch
from transformers import pipeline
from diffusers import StableAudioPipeline
# --- Configuration ---
# Determine the optimal device for model inference
# Prioritize CUDA (NVIDIA GPUs), then MPS (Apple Silicon), fallback to CPU
DEVICE = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
# Use float16 for reduced memory and faster inference on compatible hardware (GPU/MPS)
# Fallback to float32 for CPU for better stability
TORCH_DTYPE = torch.float16 if DEVICE in ["cuda", "mps"] else torch.float32
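# A minimal sketch of the same device/dtype probe in a plain Python session,
# useful for verification outside Streamlit (assumes only that torch is installed):
#
#   import torch
#   device = "cuda" if torch.cuda.is_available() else (
#       "mps" if torch.backends.mps.is_available() else "cpu")
#   dtype = torch.float16 if device in ("cuda", "mps") else torch.float32
#   print(device, dtype)  # e.g. "cuda torch.float16" on an NVIDIA GPU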
# --- Cached Model Loading Functions ---
@st.cache_resource(show_spinner="Loading Image Captioning Model (BLIP)...")
def load_blip_model():
"""
Loads the BLIP image captioning model using Hugging Face transformers pipeline.
The model is cached to prevent reloading on every Streamlit rerun.
"""
try:
captioner = pipeline(
"image-to-text",
model="Salesforce/blip-image-captioning-base",
torch_dtype=TORCH_DTYPE,
device=DEVICE
)
return captioner
except Exception as e:
st.error(f"Failed to load BLIP model: {e}")
return None
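# Usage sketch (outside the Streamlit flow): the captioning pipeline takes a
# PIL image and returns a list of dicts with a "generated_text" key.
# "landscape.jpg" is a placeholder path, not a file shipped with this Space:
#
#   from PIL import Image
#   captioner = load_blip_model()
#   result = captioner(Image.open("landscape.jpg").convert("RGB"))
#   print(result[0]["generated_text"])  # e.g. "a lake surrounded by mountains"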
@st.cache_resource(show_spinner="Loading Audio Generation Model (Stable Audio Open Small)...")
def load_stable_audio_model():
"""
Loads the Stable Audio Open Small pipeline using Hugging Face diffusers.
The pipeline is cached to prevent reloading on every Streamlit rerun.
"""
    try:
        # Load the stable-audio-open-1.0 checkpoint, the Stable Audio Open
        # release that diffusers' StableAudioPipeline supports.
        audio_pipeline = StableAudioPipeline.from_pretrained(
            "stabilityai/stable-audio-open-1.0",
            torch_dtype=TORCH_DTYPE
        ).to(DEVICE)
return audio_pipeline
except Exception as e:
st.error(f"Failed to load Stable Audio model: {e}")
return None
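# Usage sketch: standalone generation with the cached pipeline. The prompt and
# parameter values below are illustrative, not tuned:
#
#   pipe = load_stable_audio_model()
#   out = pipe(prompt="gentle rain on a forest canopy",
#              num_inference_steps=10, audio_end_in_s=5.0)
#   wav = out.audios[0].T.float().cpu().numpy()  # (frames, channels)
#   sf.write("rain.wav", wav, pipe.vae.config.sampling_rate)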
# --- Audio Conversion Utility ---
def convert_numpy_to_wav_bytes(audio_array: np.ndarray, sample_rate: int) -> bytes:
"""
Converts a NumPy audio array to an in-memory WAV byte stream.
This avoids writing temporary files to disk, which is efficient and
suitable for ephemeral environments like Hugging Face Spaces.
"""
byte_io = io.BytesIO()
    # Stable Audio Open's diffusers output is (channels, frames), while
    # soundfile expects (frames, channels) for multi-channel audio, so
    # transpose 2-D (stereo) arrays to match soundfile's expectation.
    if audio_array.ndim == 2 and audio_array.shape[0] == 2:  # channels-first stereo
        audio_array = audio_array.T  # transpose to (frames, channels)
    # Write the NumPy array to the in-memory BytesIO object as a WAV file
    sf.write(byte_io, audio_array, sample_rate, format='WAV', subtype='FLOAT')
    # IMPORTANT: reset the stream position to the beginning before reading
    byte_io.seek(0)
return byte_io.read()
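# Usage sketch: round-trip a one-second stereo sine burst through the
# converter; 440 Hz and 44.1 kHz are arbitrary demo values:
#
#   t = np.linspace(0, 1, 44100, endpoint=False)
#   stereo = np.stack([np.sin(2 * np.pi * 440.0 * t)] * 2)  # (2, frames)
#   wav_bytes = convert_numpy_to_wav_bytes(stereo.astype(np.float32), 44100)
#   assert wav_bytes[:4] == b"RIFF"  # every WAV file starts with a RIFF header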
# --- Streamlit App Layout ---
st.set_page_config(layout="centered", page_title="Image-to-Soundscape Generator")
st.title("🏞️ Image-to-Soundscape Generator 🎶")
st.markdown("Upload a landscape image, and let AI transform it into a unique soundscape!")
# Initialize session state so results persist across Streamlit reruns
if "audio_bytes" not in st.session_state:
st.session_state.audio_bytes = None
if "image_uploaded" not in st.session_state:
st.session_state.image_uploaded = False
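# st.session_state behaves like a dict that survives reruns; a minimal
# self-contained sketch of the pattern used above:
#
#   if "clicks" not in st.session_state:
#       st.session_state.clicks = 0       # runs only on the first render
#   if st.button("+1"):
#       st.session_state.clicks += 1      # survives the rerun the click triggers
#   st.write(st.session_state.clicks)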
# --- UI Components ---
uploaded_file = st.file_uploader("Choose a landscape image...", type=["jpg", "jpeg", "png"])
if uploaded_file is not None:
st.session_state.image_uploaded = True
image = Image.open(uploaded_file).convert("RGB") # Ensure image is in RGB format
    st.image(image, caption="Uploaded Image", use_container_width=True)  # use_container_width replaces the deprecated use_column_width
# Button to trigger the generation pipeline
if st.button("Generate Soundscape"):
st.session_state.audio_bytes = None # Clear previous audio
        with st.spinner("Generating soundscape... This may take a moment."):
try:
# 1. Load BLIP model and generate caption (hidden from user)
captioner = load_blip_model()
                if captioner is None:
                    # Raise instead of calling st.stop() here: st.stop() raises a
                    # StopException (an Exception subclass) that the broad except
                    # below would swallow before it could halt the script.
                    raise RuntimeError("Image captioning model could not be loaded. Please try again.")
# Generate caption
# The BLIP pipeline expects a PIL Image object directly
caption_results = captioner(image)
# Extract the generated text from the pipeline's output
generated_caption = caption_results[0]['generated_text']
# Optional: Enhance prompt for soundscape generation
# This helps guide the audio model towards environmental sounds
soundscape_prompt = f"A soundscape of {generated_caption}"
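                # e.g. the caption "a lake surrounded by mountains" becomes the
                # prompt "A soundscape of a lake surrounded by mountains"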
# 2. Load Stable Audio model and generate audio
audio_pipeline = load_stable_audio_model()
                if audio_pipeline is None:
                    # As above, raise rather than calling st.stop() inside this try block.
                    raise RuntimeError("Audio generation model could not be loaded. Please try again.")
                # Generate audio with parameters tuned for speed over quality:
                # - num_inference_steps: lower is faster; raise it for better quality
                # - audio_end_in_s: shorter clips generate faster
                #   (stable-audio-open-1.0 supports clips up to about 47 s)
                # - negative_prompt: steers generation away from audible artifacts
                audio_output = audio_pipeline(
                    prompt=soundscape_prompt,
                    num_inference_steps=10,  # tuned for faster generation
                    audio_end_in_s=5.0,  # 5-second clip to keep latency low
                    negative_prompt="low quality, average quality, distorted"
                )
                # The pipeline output's .audios is a torch tensor of shape
                # (batch, channels, frames); take the first waveform and convert
                # it to a float32 NumPy array on the CPU for soundfile.
                audio_numpy_array = audio_output.audios[0].float().cpu().numpy()
                sample_rate = audio_pipeline.vae.config.sampling_rate
# 3. Convert NumPy array to WAV bytes and store in session state
st.session_state.audio_bytes = convert_numpy_to_wav_bytes(audio_numpy_array, sample_rate)
st.success("Soundscape generated successfully!")
except Exception as e:
st.error(f"An error occurred during generation: {e}") #
st.session_state.audio_bytes = None # Clear any partial audio
st.session_state.image_uploaded = False # Reset to allow re-upload
st.exception(e) # Display full traceback for debugging
# Display generated soundscape if available in session state
if st.session_state.audio_bytes:
st.subheader("Generated Soundscape:")
    st.audio(st.session_state.audio_bytes, format='audio/wav')
st.markdown("You can download the audio using the controls above.")
# Reset button for new image upload
if st.session_state.image_uploaded and st.button("Upload New Image"):
st.session_state.audio_bytes = None
st.session_state.image_uploaded = False
    st.rerun()  # Rerun the app to clear the file uploader