from typing import Iterable, Tuple

import numpy as np
import torch
from librosa.beat import beat_track
from PIL import Image
from tqdm.auto import tqdm

# from diffusers import AudioDiffusionPipeline
from .pipeline_audio_diffusion import AudioDiffusionPipeline
from .image_encoder import ImageEncoder

VERSION = "1.5.6"

class AudioDiffusion:
    def __init__(
        self,
        model_id: str = "teticio/audio-diffusion-256",
        cuda: bool = torch.cuda.is_available(),
        progress_bar: Iterable = tqdm,
    ):
        """Class for generating audio using De-noising Diffusion Probabilistic Models.

        Args:
            model_id (str): name of model (local directory or Hugging Face Hub)
            cuda (bool): use CUDA?
            progress_bar (iterable): iterable callback for progress updates or None
        """
        self.model_id = model_id
        self.pipe = AudioDiffusionPipeline.from_pretrained(self.model_id)
        if cuda:
            self.pipe.to("cuda")
        # Fall back to an identity callback so callers can pass progress_bar=None
        self.progress_bar = progress_bar or (lambda _: _)
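
    # Instantiation sketch (illustrative, not part of the documented API):
    # model_id may be a local directory or a Hugging Face Hub id, and the
    # progress bar can be silenced by passing progress_bar=None.
    #
    #     audio_diffusion = AudioDiffusion(
    #         model_id="teticio/audio-diffusion-256", progress_bar=None
    #     )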

    def generate_spectrogram_and_audio(
        self,
        steps: int = None,
        generator: torch.Generator = None,
        step_generator: torch.Generator = None,
        eta: float = 0,
        noise: torch.Tensor = None,
        encoding: torch.Tensor = None,
    ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
        """Generate a random mel spectrogram and convert it to audio.

        Args:
            steps (int): number of de-noising steps (defaults to 50 for DDIM, 1000 for DDPM)
            generator (torch.Generator): random number generator or None
            step_generator (torch.Generator): random number generator used to de-noise or None
            eta (float): parameter between 0 and 1 used with DDIM scheduler
            noise (torch.Tensor): noisy image or None
            encoding (torch.Tensor): for UNet2DConditionModel, shape (batch_size, seq_length, cross_attention_dim)

        Returns:
            PIL Image: mel spectrogram
            (int, np.ndarray): sample rate and raw audio
        """
        images, (sample_rate, audios) = self.pipe(
            batch_size=1,
            steps=steps,
            generator=generator,
            step_generator=step_generator,
            eta=eta,
            noise=noise,
            encoding=encoding,
            return_dict=False,
        )
        return images[0], (sample_rate, audios[0])
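
    # A minimal usage sketch (seed value is illustrative): unconditional
    # generation with a fixed seed for reproducibility. A DDIM-scheduled model
    # uses 50 steps by default, a DDPM-scheduled one 1000.
    #
    #     image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(
    #         generator=torch.Generator().manual_seed(42)
    #     )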

    def generate_spectrogram_and_audio_from_audio(
        self,
        audio_file: str = None,
        raw_audio: np.ndarray = None,
        slice: int = 0,
        start_step: int = 0,
        steps: int = None,
        generator: torch.Generator = None,
        mask_start_secs: float = 0,
        mask_end_secs: float = 0,
        step_generator: torch.Generator = None,
        eta: float = 0,
        encoding: torch.Tensor = None,
        noise: torch.Tensor = None,
    ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
        """Generate a mel spectrogram from an audio input and convert it to audio.

        Args:
            audio_file (str): must be a file on disk due to Librosa limitation, or
            raw_audio (np.ndarray): audio as numpy array
            slice (int): slice number of audio to convert
            start_step (int): step to start de-noising from
            steps (int): number of de-noising steps (defaults to 50 for DDIM, 1000 for DDPM)
            generator (torch.Generator): random number generator or None
            mask_start_secs (float): number of seconds of audio to mask (not generate) at start
            mask_end_secs (float): number of seconds of audio to mask (not generate) at end
            step_generator (torch.Generator): random number generator used to de-noise or None
            eta (float): parameter between 0 and 1 used with DDIM scheduler
            encoding (torch.Tensor): for UNet2DConditionModel, shape (batch_size, seq_length, cross_attention_dim)
            noise (torch.Tensor): noisy image or None

        Returns:
            PIL Image: mel spectrogram
            (int, np.ndarray): sample rate and raw audio
        """
        images, (sample_rate, audios) = self.pipe(
            batch_size=1,
            audio_file=audio_file,
            raw_audio=raw_audio,
            slice=slice,
            start_step=start_step,
            steps=steps,
            generator=generator,
            mask_start_secs=mask_start_secs,
            mask_end_secs=mask_end_secs,
            step_generator=step_generator,
            eta=eta,
            noise=noise,
            encoding=encoding,
            return_dict=False,
        )
        return images[0], (sample_rate, audios[0])
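
    # A hedged sketch of audio-to-audio generation, assuming an "input.wav"
    # exists on disk. Starting part-way through the schedule (start_step > 0)
    # adds less noise to the input, so the result stays closer to the original;
    # the mask arguments keep the first/last seconds of the input untouched.
    #
    #     image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio_from_audio(
    #         audio_file="input.wav", slice=0, start_step=500, mask_start_secs=1
    #     )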

    @staticmethod
    def loop_it(audio: np.ndarray, sample_rate: int, loops: int = 12) -> np.ndarray:
        """Loop audio on the beat.

        Args:
            audio (np.ndarray): audio as numpy array
            sample_rate (int): sample rate of audio
            loops (int): number of times to loop

        Returns:
            np.ndarray: looped raw audio, or None if no whole bars were detected
        """
        _, beats = beat_track(y=audio, sr=sample_rate, units="samples")
        # Round the number of detected beat intervals down to whole bars (4 beats per bar)
        beats_in_bar = (len(beats) - 1) // 4 * 4
        if beats_in_bar > 0:
            return np.tile(audio[beats[0] : beats[beats_in_bar]], loops)
        return None
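

if __name__ == "__main__":
    # Minimal end-to-end demonstration of the API above; a sketch, not a
    # documented entry point. It assumes the default checkpoint can be fetched
    # from the Hugging Face Hub. Because of the relative imports at the top of
    # this file, run it as a module (python -m <package>), not as a plain script.
    audio_diffusion = AudioDiffusion()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(
        generator=torch.Generator(device=device).manual_seed(42)
    )
    image.save("spectrogram.png")  # PIL image of the generated mel spectrogram
    print(f"Generated {len(audio) / sample_rate:.2f}s of audio at {sample_rate} Hz")
    # Beat detection can fail on arrhythmic output, in which case loop_it returns None
    looped = AudioDiffusion.loop_it(audio, sample_rate, loops=4)
    if looped is not None:
        print(f"Looped to {len(looped) / sample_rate:.2f}s")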