"""Gradio app that generates audio from a text prompt with Stable Audio Open."""

import functools
import json
import os
import tempfile

import gradio as gr
import torch
import torchaudio

# Hugging Face repository that hosts the model config and checkpoint.
MODEL_REPO_ID = "stabilityai/stable-audio-open-1.0"
# Sample rate passed to the sampler and used when writing the WAV file.
SAMPLE_RATE = 44100


@functools.lru_cache(maxsize=1)
def _load_model():
    """Download (if needed), build and return ``(model, device)``.

    The result is cached so the checkpoint is only loaded once per process
    instead of on every request. ``lru_cache`` does not cache raised
    exceptions, so a failed load is retried on the next call.
    """
    # Imported lazily so a missing dependency is reported through the UI by
    # the caller's error handling rather than crashing at module import time.
    from huggingface_hub import hf_hub_download
    from stable_audio_tools.models.factory import create_model_from_config
    from stable_audio_tools.models.utils import load_ckpt_state_dict

    # Optional access token; ``None`` when the variable is not set.
    token = os.getenv("HF_TOKEN")

    model_config_path = hf_hub_download(
        repo_id=MODEL_REPO_ID,
        filename="model_config.json",
        token=token,
    )
    model_ckpt_path = hf_hub_download(
        repo_id=MODEL_REPO_ID,
        filename="model.safetensors",
        token=token,
    )

    with open(model_config_path, encoding="utf-8") as config_file:
        model_config = json.load(config_file)

    model = create_model_from_config(model_config)
    model.load_state_dict(load_ckpt_state_dict(model_ckpt_path))

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    model.eval()
    return model, device


def generate_audio(prompt: str, duration: float = 10) -> str:
    """Generate audio for *prompt* and return the path of a WAV file.

    Args:
        prompt: Text description of the sound to generate.
        duration: Requested clip length in seconds.

    Returns:
        Path to a temporary ``.wav`` file holding the generated audio. The
        file is created with ``delete=False`` and is not removed here.

    Raises:
        gr.Error: If loading the model, generating or saving fails. The
            output component is ``gr.Audio``, so returning an error string
            would be treated as an audio file path; raising ``gr.Error``
            lets Gradio show the message to the user instead.
    """
    try:
        from stable_audio_tools.inference.generation import (
            generate_diffusion_cond,
        )

        model, device = _load_model()

        conditioning = [
            {
                "prompt": prompt,
                "seconds_start": 0,
                "seconds_total": duration,
            }
        ]

        output = generate_diffusion_cond(
            model,
            steps=100,
            cfg_scale=7,
            conditioning=conditioning,
            sample_rate=SAMPLE_RATE,
            sigma_min=0.3,
            sigma_max=500,
            sampler_type="dpmpp-3m-sde",
            device=device,
        )

        # First item of the batch; torchaudio can save the tensor directly,
        # so no NumPy round trip is needed.
        waveform = output[0].to(torch.float32).cpu()

        # NOTE(review): the sampler appears to render a fixed-size window
        # regardless of ``seconds_total`` - confirm. Cutting to the requested
        # length is a no-op when the output is already that short.
        max_samples = int(duration * SAMPLE_RATE)
        if max_samples > 0:
            waveform = waveform[..., :max_samples]

        # Reserve a unique path, then close the handle before torchaudio
        # reopens it; this avoids a leaked descriptor and lets the write
        # succeed on platforms that forbid opening a file twice.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            wav_path = tmp.name
        torchaudio.save(wav_path, waveform, SAMPLE_RATE)

        return wav_path
    except Exception as exc:
        raise gr.Error(f"Error: {exc}") from exc


# Create interface with AUDIO output
demo = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(
            label="🎵 Audio Prompt",
            placeholder="heavy boots thudding on wet sand",
            value="heavy boots thudding on wet sand",
        ),
        gr.Slider(5, 47, 10, step=1, label="⏱️ Duration (seconds)"),
    ],
    outputs=gr.Audio(label="🔊 Generated Audio"),  # This will play audio!
    title="🎵 Stable Audio Generator - WORKING!",
    description="Generate real audio from text descriptions",
)

demo.launch()