import gradio as gr
import numpy as np
import torch
from einops import rearrange
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond

# ---------- Load model ----------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_REPO = "stabilityai/stable-audio-open-small"  # accept the license once on the model page

# Download + load (cached after the first run)
model, model_config = get_pretrained_model(MODEL_REPO)
SAMPLE_RATE = int(model_config["sample_rate"])  # 44100
SAMPLE_SIZE = int(model_config["sample_size"])  # internal size; we pass seconds via conditioning

model = model.to(DEVICE)
model.eval()
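
# Optional speed-up (assumption, not part of the original app): half precision
# can roughly halve GPU memory use and latency. Left commented out because it
# is an untested sketch here, and CPU inference needs float32.
# if DEVICE == "cuda":
#     model = model.half()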

def clamp_seconds(seconds: float) -> int:
    # Clamp to 1–11 s (model cap) and round down to whole seconds
    seconds = max(1.0, min(float(seconds), 11.0))
    return int(seconds)

def generate_sfx(prompt, seconds, steps, cfg_scale, sampler):
    if not prompt or not prompt.strip():
        return None, "Enter a descriptive prompt (e.g., 'footsteps on gravel, outdoors, distant')."

    seconds = clamp_seconds(seconds)

    # Conditioning per the stable-audio-tools API
    conditioning = [{
        "prompt": prompt.strip(),
        "seconds_total": seconds,
    }]
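    # Note (assumption about related checkpoints): the larger
    # stable-audio-open-1.0 model additionally conditions on "seconds_start";
    # this small model's card shows only "prompt" and "seconds_total".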
    # Fast, CPU-friendly defaults:
    # steps=8–12 is a good range; the pingpong sampler is efficient on CPU
    output = generate_diffusion_cond(
        model=model,
        steps=int(steps),
        cfg_scale=float(cfg_scale),
        conditioning=conditioning,
        sample_size=SAMPLE_SIZE,
        sampler_type=sampler,
        device=DEVICE,
    )
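    # Reproducibility note (assumption about the stable_audio_tools signature):
    # generate_diffusion_cond also accepts a seed argument; when left at its
    # default a random seed is drawn, so repeated prompts produce different takes.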
    # output shape: (B, C, N) with B=1 -> flatten to (C, N)
    audio = rearrange(output, "b d n -> d (b n)")

    # Peak-normalize to [-1, 1] float32
    audio = audio.to(torch.float32)
    peak = torch.max(torch.abs(audio))
    if peak > 0:
        audio = (audio / peak).clamp(-1, 1)
    audio_np = audio.cpu().numpy()

    # Gradio expects (sr, np.ndarray [N] or [N, C]); provide stereo [N, 2]
    audio_np = audio_np.T  # (N, C)
    return (SAMPLE_RATE, audio_np), "Done."
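
# Optional sketch (assumes the soundfile package is installed): keep a take on
# disk instead of only streaming it back through Gradio.
# import soundfile as sf
# result, status = generate_sfx("rain on window", 6, 8, 1.0, "pingpong")
# if result is not None:
#     sr, data = result
#     sf.write("sfx.wav", data, sr)  # data is (N, 2) float32 in [-1, 1]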

EXAMPLES = [
    "Footsteps on gravel, outdoors, medium pace, natural ambience",
    "Heavy metal door slam with long metallic reverb, industrial",
    "Rain on window, occasional distant thunder, calm night",
    "Camera shutter click, mechanical, clean studio",
    "Sci-fi laser blast, short, bright, synthetic fizz",
]

with gr.Blocks(title="Professor Treviño — Text→SFX (Free)") as demo:
    gr.Markdown("### Text-to-Sound Effects — Free, no login\nEnter a descriptive prompt and generate up to ~11s stereo @ 44.1 kHz.")
    with gr.Row():
        prompt = gr.Textbox(label="Prompt", placeholder="e.g., footsteps on gravel, outdoors, distant")
    with gr.Row():
        seconds = gr.Slider(3, 11, value=6, step=1, label="Duration (seconds)")
        steps = gr.Slider(6, 16, value=8, step=1, label="Diffusion steps (higher = better/slower)")
    with gr.Row():
        cfg_scale = gr.Slider(0.5, 4.0, value=1.0, step=0.1, label="Guidance (CFG scale)")
        # Valid sampler names vary across stable_audio_tools versions;
        # "pingpong" is the one the stable-audio-open-small model card recommends.
        sampler = gr.Dropdown(choices=["pingpong", "heun", "dpmpp-2m"], value="pingpong", label="Sampler")
    btn = gr.Button("Generate")
    audio_out = gr.Audio(label="Output", type="numpy")
    status = gr.Markdown()
    btn.click(fn=generate_sfx, inputs=[prompt, seconds, steps, cfg_scale, sampler], outputs=[audio_out, status])
    gr.Examples(EXAMPLES, inputs=[prompt], cache_examples=False)

# Gradio 4.x removed queue(concurrency_count=...); default_concurrency_limit
# is the current name for per-event concurrency.
demo.queue(default_concurrency_limit=1, max_size=8).launch()
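# Deployment note (assumption about the target environment): on Hugging Face
# Spaces, launch() needs no host/port arguments; in a bare container you would
# pass server_name="0.0.0.0" so the app is reachable from outside.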