Pixio-Audio

Runtime error

File size: 2,549 Bytes

456ed62
 
 
 
 
a17cbc4
 
456ed62
 
 
 
 
 
 
 
 
a17cbc4
 
 
456ed62
00aa3d6
9b4a54c
456ed62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00aa3d6
a17cbc4
 
456ed62
a17cbc4
456ed62
 
a17cbc4
456ed62

import torch
import torchaudio
from einops import rearrange
import gradio as gr
import spaces
import os
import uuid

# Importing the model-related functions
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond

# Function to set up, generate, and process the audio
@spaces.GPU(duration=120)  # Allocate GPU only when this function is called
def generate_audio(
    prompt: str,
    seconds_total: float = 30,
    steps: int = 100,
    cfg_scale: float = 7,
) -> str:
    """Generate audio from a text prompt and write it to a WAV file.

    Loads the "stabilityai/stable-audio-open-1.0" checkpoint, runs
    conditional diffusion sampling, peak-normalizes the result to 16-bit
    PCM and saves it under a unique file name in the current working
    directory.

    Args:
        prompt: Text used as the "prompt" conditioning value.
        seconds_total: Value used as the "seconds_total" conditioning.
        steps: Number of diffusion sampling steps.
        cfg_scale: Guidance scale forwarded to the sampler as `cfg_scale`.

    Returns:
        Path (relative to the working directory) of the WAV file written.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # NOTE(review): no token is passed explicitly; get_pretrained_model is
    # called with the model name only. Presumably the hub client picks up
    # HF_TOKEN from the environment on its own -- confirm for gated models.
    # NOTE(review): the checkpoint is loaded on every call.
    model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
    sample_rate = model_config["sample_rate"]
    sample_size = model_config["sample_size"]

    model = model.to(device)

    # Set up text and timing conditioning
    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": seconds_total,
    }]

    # Generate the audio batch. The step count is cast to int because UI
    # sliders may deliver it as a float.
    output = generate_diffusion_cond(
        model,
        steps=int(steps),
        cfg_scale=cfg_scale,
        conditioning=conditioning,
        sample_size=sample_size,
        sigma_min=0.3,
        sigma_max=500,
        sampler_type="dpmpp-3m-sde",
        device=device,
    )

    # Rearrange audio batch to a single sequence
    output = rearrange(output, "b d n -> d (b n)")

    # Peak normalize, clip, convert to int16.
    audio = output.to(torch.float32)
    # Floor the divisor: an all-zero (silent) result would otherwise produce
    # 0/0 = NaN, and casting NaN to int16 yields garbage samples.
    peak = audio.abs().max().clamp(min=1e-8)
    pcm = audio.div(peak).clamp(-1, 1).mul(32767).to(torch.int16).cpu()

    # Unique name so that concurrent requests never overwrite each other.
    unique_filename = f"output_{uuid.uuid4().hex}.wav"
    torchaudio.save(unique_filename, pcm, sample_rate)

    # Return the path to the generated audio file
    return unique_filename

# Setting up the Gradio Interface
# Input widgets, created in the same order as generate_audio's parameters
# (prompt, seconds_total, steps, cfg_scale).
_prompt_box = gr.Textbox(label="Prompt", placeholder="Enter your text prompt here")
_duration_slider = gr.Slider(
    minimum=0, maximum=47, value=30, label="Duration in Seconds"
)
_steps_slider = gr.Slider(
    minimum=10, maximum=300, value=100, step=10, label="Number of Diffusion Steps"
)
_cfg_slider = gr.Slider(
    minimum=1, maximum=15, value=7, step=0.1, label="CFG Scale"
)

# Output widget: receives the file path returned by generate_audio.
_audio_player = gr.Audio(type="filepath", label="Generated Audio")

interface = gr.Interface(
    fn=generate_audio,
    inputs=[_prompt_box, _duration_slider, _steps_slider, _cfg_slider],
    outputs=_audio_player,
    title="Stable Audio Generator",
    description="Generate variable-length stereo audio at 44.1kHz from text prompts using Stable Audio Open 1.0.",
)

# Start serving the UI.
interface.launch()