import streamlit as st
from PIL import Image
import io
import soundfile as sf
import numpy as np
import torch
from transformers import pipeline
from diffusers import StableAudioPipeline

# --- Configuration ---
# Determine the optimal device for model inference.
# Prioritize CUDA (NVIDIA GPUs), then MPS (Apple Silicon), and fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")

# Use float16 for reduced memory and faster inference on compatible hardware (GPU/MPS).
# Fall back to float32 on CPU for better stability.
TORCH_DTYPE = torch.float16 if DEVICE in ["cuda", "mps"] else torch.float32
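# Note (assumption, not from the original file): float16 on MPS can occasionally
# produce NaNs with some models; if generation fails on Apple Silicon, forcing
# torch.float32 here is a reasonable fallback.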

# --- Cached Model Loading Functions ---
@st.cache_resource
def load_blip_model():
    """
    Loads the BLIP image captioning model via the Hugging Face transformers pipeline.
    st.cache_resource caches the model so it is not reloaded on every Streamlit rerun.
    """
    try:
        captioner = pipeline(
            "image-to-text",
            model="Salesforce/blip-image-captioning-base",
            torch_dtype=TORCH_DTYPE,
            device=DEVICE
        )
        return captioner
    except Exception as e:
        st.error(f"Failed to load BLIP model: {e}")
        return None

@st.cache_resource
def load_stable_audio_model():
    """
    Loads the Stable Audio Open 1.0 pipeline via Hugging Face diffusers.
    st.cache_resource caches the pipeline so it is not reloaded on every Streamlit rerun.
    """
    try:
        audio_pipeline = StableAudioPipeline.from_pretrained(
            "stabilityai/stable-audio-open-1.0",
            torch_dtype=TORCH_DTYPE
        ).to(DEVICE)
        return audio_pipeline
    except Exception as e:
        st.error(f"Failed to load Stable Audio model: {e}")
        return None
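# Note: st.cache_resource keeps a single instance alive across reruns and
# sessions, which suits large, read-only models like the two loaded above.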

# --- Audio Conversion Utility ---
def convert_numpy_to_wav_bytes(audio_array: np.ndarray, sample_rate: int) -> bytes:
    """
    Converts a NumPy audio array to an in-memory WAV byte stream.
    This avoids writing temporary files to disk, which is efficient and
    suitable for ephemeral environments like Hugging Face Spaces.
    """
    byte_io = io.BytesIO()
    # Stable Audio Open's diffusers output is (channels, frames), while
    # soundfile expects (frames, channels) for multi-channel audio.
    # Transpose if the array is channel-first stereo.
    if audio_array.ndim == 2 and audio_array.shape[0] == 2:
        audio_array = audio_array.T  # (channels, frames) -> (frames, channels)
    # Write the NumPy array to the in-memory BytesIO object as a WAV file.
    sf.write(byte_io, audio_array, sample_rate, format='WAV', subtype='FLOAT')
    # IMPORTANT: Reset the stream position to the beginning before reading.
    byte_io.seek(0)
    return byte_io.read()
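# Illustrative round-trip (assumption: not part of the app flow, shown for shapes only):
#   tone = np.stack([np.sin(2 * np.pi * 440 * np.arange(44100) / 44100)] * 2)  # (2, 44100)
#   wav_bytes = convert_numpy_to_wav_bytes(tone.astype(np.float32), 44100)
#   st.audio(wav_bytes, format="audio/wav")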

# --- Streamlit App Layout ---
st.set_page_config(layout="centered", page_title="Image-to-Soundscape Generator")
st.title("🏞️ Image-to-Soundscape Generator 🎶")
st.markdown("Upload a landscape image, and let AI transform it into a unique soundscape!")

# Initialize session state for persistence across reruns
if "audio_bytes" not in st.session_state:
    st.session_state.audio_bytes = None
if "image_uploaded" not in st.session_state:
    st.session_state.image_uploaded = False

# --- UI Components ---
uploaded_file = st.file_uploader("Choose a landscape image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    st.session_state.image_uploaded = True
    image = Image.open(uploaded_file).convert("RGB")  # Ensure the image is in RGB format
    st.image(image, caption="Uploaded Image", use_container_width=True)  # use_container_width replaces the deprecated use_column_width
    # Button to trigger the generation pipeline
    if st.button("Generate Soundscape"):
        st.session_state.audio_bytes = None  # Clear previous audio
        with st.spinner("Generating soundscape... This may take a moment."):
            try:
                # 1. Load the BLIP model and generate a caption (hidden from the user)
                captioner = load_blip_model()
                if captioner is None:
                    st.error("Image captioning model could not be loaded. Please try again.")
                    st.session_state.image_uploaded = False  # Reset to allow re-upload
                    st.stop()

                # Generate the caption; the BLIP pipeline accepts a PIL Image directly.
                caption_results = captioner(image)
                # Extract the generated text from the pipeline's output.
                generated_caption = caption_results[0]['generated_text']

                # Enhance the prompt to guide the audio model toward environmental sounds.
                soundscape_prompt = f"A soundscape of {generated_caption}"

                # 2. Load the Stable Audio model and generate audio
                audio_pipeline = load_stable_audio_model()
                if audio_pipeline is None:
                    st.error("Audio generation model could not be loaded. Please try again.")
                    st.session_state.image_uploaded = False  # Reset to allow re-upload
                    st.stop()

                # Generate audio with parameters tuned for speed:
                # num_inference_steps: lower is faster, higher improves quality.
                # audio_end_in_s: shorter clips generate faster.
                # negative_prompt: helps improve perceived quality.
                audio_output = audio_pipeline(
                    prompt=soundscape_prompt,
                    num_inference_steps=10,  # Tuned for faster generation
                    audio_end_in_s=5,  # 5-second clip for faster generation
                    negative_prompt="low quality, average quality, distorted"
                )
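                # Optional (assumption, not in the original file): for reproducible
                # output across reruns, pass a seeded generator to the call above, e.g.
                # generator=torch.Generator(DEVICE).manual_seed(0)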
                # Extract the waveform and sample rate. The pipeline returns audio as
                # a torch tensor of shape (batch, channels, frames); take the first
                # waveform and move it to the CPU as float32 for soundfile.
                audio_numpy_array = audio_output.audios[0].float().cpu().numpy()
                sample_rate = audio_pipeline.vae.config.sampling_rate

                # 3. Convert the NumPy array to WAV bytes and store it in session state
                st.session_state.audio_bytes = convert_numpy_to_wav_bytes(audio_numpy_array, sample_rate)
                st.success("Soundscape generated successfully!")
            except Exception as e:
                st.error(f"An error occurred during generation: {e}")
                st.session_state.audio_bytes = None  # Clear any partial audio
                st.session_state.image_uploaded = False  # Reset to allow re-upload
                st.exception(e)  # Display the full traceback for debugging

# Display the generated soundscape if it is available in session state
if st.session_state.audio_bytes:
    st.subheader("Generated Soundscape:")
    st.audio(st.session_state.audio_bytes, format='audio/wav')
    st.markdown("You can download the audio using the controls above.")

# Reset button for a new image upload
if st.session_state.image_uploaded and st.button("Upload New Image"):
    st.session_state.audio_bytes = None
    st.session_state.image_uploaded = False
    st.rerun()  # Rerun the app to clear the file uploader
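
# --- Suggested dependencies (assumption: inferred from the imports above, not
# part of the original file; pin versions as needed for the Space) ---
# requirements.txt:
#   streamlit
#   torch
#   transformers
#   diffusers
#   soundfile
#   numpy
#   Pillow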