import gradio as gr
import numpy as np
import torch
from einops import rearrange
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond

# ---------- Load model ----------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_REPO = "stabilityai/stable-audio-open-small"  # accept the license once on the model page

# Download + load (cached after the first run)
model, model_config = get_pretrained_model(MODEL_REPO)
SAMPLE_RATE = int(model_config["sample_rate"])  # 44100
SAMPLE_SIZE = int(model_config["sample_size"])  # internal size; we pass seconds via conditioning

model = model.to(DEVICE)
model.eval()
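
# Optional speed-up (assumption, not part of the original app): half precision
# can roughly halve GPU memory use and latency. Left commented out because it
# is an untested sketch here, and CPU inference needs float32.
# if DEVICE == "cuda":
#     model = model.half()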

def clamp_seconds(seconds: float) -> int:
    # Clamp to 1–11 s (model cap) and round down to whole seconds
    seconds = max(1.0, min(float(seconds), 11.0))
    return int(seconds)

def generate_sfx(prompt, seconds, steps, cfg_scale, sampler):
    if not prompt or not prompt.strip():
        return None, "Enter a descriptive prompt (e.g., 'footsteps on gravel, outdoors, distant')."

    seconds = clamp_seconds(seconds)

    # Conditioning per the stable-audio-tools API
    conditioning = [{
        "prompt": prompt.strip(),
        "seconds_total": seconds,
    }]
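    # Note (assumption about related checkpoints): the larger
    # stable-audio-open-1.0 model additionally conditions on "seconds_start";
    # this small model's card shows only "prompt" and "seconds_total".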
    # Fast, CPU-friendly defaults:
    # steps=8–12 is a good range; the pingpong sampler is efficient on CPU
    output = generate_diffusion_cond(
        model=model,
        steps=int(steps),
        cfg_scale=float(cfg_scale),
        conditioning=conditioning,
        sample_size=SAMPLE_SIZE,
        sampler_type=sampler,
        device=DEVICE,
    )
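    # Reproducibility note (assumption about the stable_audio_tools signature):
    # generate_diffusion_cond also accepts a seed argument; when left at its
    # default a random seed is drawn, so repeated prompts produce different takes.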
    # output shape: (B, C, N) with B=1 -> flatten to (C, N)
    audio = rearrange(output, "b d n -> d (b n)")

    # Peak-normalize to [-1, 1] float32
    audio = audio.to(torch.float32)
    peak = torch.max(torch.abs(audio))
    if peak > 0:
        audio = (audio / peak).clamp(-1, 1)
    audio_np = audio.cpu().numpy()

    # Gradio expects (sr, np.ndarray [N] or [N, C]); provide stereo [N, 2]
    audio_np = audio_np.T  # (N, C)
    return (SAMPLE_RATE, audio_np), "Done."
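
# Optional sketch (assumes the soundfile package is installed): keep a take on
# disk instead of only streaming it back through Gradio.
# import soundfile as sf
# result, status = generate_sfx("rain on window", 6, 8, 1.0, "pingpong")
# if result is not None:
#     sr, data = result
#     sf.write("sfx.wav", data, sr)  # data is (N, 2) float32 in [-1, 1]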

EXAMPLES = [
    "Footsteps on gravel, outdoors, medium pace, natural ambience",
    "Heavy metal door slam with long metallic reverb, industrial",
    "Rain on window, occasional distant thunder, calm night",
    "Camera shutter click, mechanical, clean studio",
    "Sci-fi laser blast, short, bright, synthetic fizz",
]

with gr.Blocks(title="Professor Treviño — Text→SFX (Free)") as demo:
    gr.Markdown("### Text-to-Sound Effects — Free, no login\nEnter a descriptive prompt and generate up to ~11s stereo @ 44.1 kHz.")
    with gr.Row():
        prompt = gr.Textbox(label="Prompt", placeholder="e.g., footsteps on gravel, outdoors, distant")
    with gr.Row():
        seconds = gr.Slider(3, 11, value=6, step=1, label="Duration (seconds)")
        steps = gr.Slider(6, 16, value=8, step=1, label="Diffusion steps (higher = better/slower)")
    with gr.Row():
        cfg_scale = gr.Slider(0.5, 4.0, value=1.0, step=0.1, label="Guidance (CFG scale)")
        # Valid sampler names vary across stable_audio_tools versions;
        # "pingpong" is the one the stable-audio-open-small model card recommends.
        sampler = gr.Dropdown(choices=["pingpong", "heun", "dpmpp-2m"], value="pingpong", label="Sampler")
    btn = gr.Button("Generate")
    audio_out = gr.Audio(label="Output", type="numpy")
    status = gr.Markdown()
    btn.click(fn=generate_sfx, inputs=[prompt, seconds, steps, cfg_scale, sampler], outputs=[audio_out, status])
    gr.Examples(EXAMPLES, inputs=[prompt], cache_examples=False)

# Gradio 4.x removed queue(concurrency_count=...); default_concurrency_limit
# is the current name for per-event concurrency.
demo.queue(default_concurrency_limit=1, max_size=8).launch()
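# Deployment note (assumption about the target environment): on Hugging Face
# Spaces, launch() needs no host/port arguments; in a bare container you would
# pass server_name="0.0.0.0" so the app is reachable from outside.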