from typing import Iterable, Tuple

import numpy as np
import torch
from librosa.beat import beat_track
from PIL import Image
from tqdm.auto import tqdm

# from diffusers import AudioDiffusionPipeline
from .pipeline_audio_diffusion import AudioDiffusionPipeline

VERSION = "1.5.3"


class AudioDiffusion:
    def __init__(
        self,
        model_id: str = "teticio/audio-diffusion-256",
        cuda: bool = torch.cuda.is_available(),
        progress_bar: Iterable = tqdm,
    ):
        """Class for generating audio using De-noising Diffusion Probabilistic Models.

        Args:
            model_id (str): name of model (local directory or Hugging Face Hub)
            cuda (bool): use CUDA?
            progress_bar (iterable): iterable callback for progress updates or None
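
        Example (a minimal sketch; the checkpoint name is the class default):
            audio_diffusion = AudioDiffusion(model_id="teticio/audio-diffusion-256")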
        """
        self.model_id = model_id
        self.pipe = AudioDiffusionPipeline.from_pretrained(self.model_id)
        if cuda:
            self.pipe.to("cuda")
        self.progress_bar = progress_bar or (lambda _: _)

    def generate_spectrogram_and_audio(
        self,
        steps: int = None,
        generator: torch.Generator = None,
        step_generator: torch.Generator = None,
        eta: float = 0,
        noise: torch.Tensor = None,
        encoding: torch.Tensor = None,
    ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
        """Generate random mel spectrogram and convert to audio.

        Args:
            steps (int): number of de-noising steps (defaults to 50 for DDIM, 1000 for DDPM)
            generator (torch.Generator): random number generator or None
            step_generator (torch.Generator): random number generator used to de-noise or None
            eta (float): parameter between 0 and 1 used with DDIM scheduler
            noise (torch.Tensor): noisy image or None
            encoding (torch.Tensor): conditioning tensor for UNet2DConditionModel of shape (batch_size, seq_length, cross_attention_dim)

        Returns:
            PIL Image: mel spectrogram
            (int, np.ndarray): sample rate and raw audio
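
        Example (a minimal sketch; the output file name is illustrative):
            image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio()
            image.save("spectrogram.png")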
        """
        images, (sample_rate, audios) = self.pipe(
            batch_size=1,
            steps=steps,
            generator=generator,
            step_generator=step_generator,
            eta=eta,
            noise=noise,
            encoding=encoding,
            return_dict=False,
        )
        return images[0], (sample_rate, audios[0])

    def generate_spectrogram_and_audio_from_audio(
        self,
        audio_file: str = None,
        raw_audio: np.ndarray = None,
        slice: int = 0,
        start_step: int = 0,
        steps: int = None,
        generator: torch.Generator = None,
        mask_start_secs: float = 0,
        mask_end_secs: float = 0,
        step_generator: torch.Generator = None,
        eta: float = 0,
        encoding: torch.Tensor = None,
        noise: torch.Tensor = None,
    ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
        """Generate random mel spectrogram from audio input and convert to audio.

        Args:
            audio_file (str): path to an audio file on disk (must be a file due to a librosa limitation), or
            raw_audio (np.ndarray): audio as numpy array
            slice (int): slice number of audio to convert
            start_step (int): step to start from
            steps (int): number of de-noising steps (defaults to 50 for DDIM, 1000 for DDPM)
            generator (torch.Generator): random number generator or None
            mask_start_secs (float): number of seconds of audio to mask (not generate) at start
            mask_end_secs (float): number of seconds of audio to mask (not generate) at end
            step_generator (torch.Generator): random number generator used to de-noise or None
            eta (float): parameter between 0 and 1 used with DDIM scheduler
            encoding (torch.Tensor): conditioning tensor for UNet2DConditionModel of shape (batch_size, seq_length, cross_attention_dim)
            noise (torch.Tensor): noisy image or None

        Returns:
            PIL Image: mel spectrogram
            (int, np.ndarray): sample rate and raw audio
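
        Example (a minimal sketch; the input file path is illustrative):
            image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio_from_audio(
                audio_file="input.wav", slice=0)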
        """

        images, (sample_rate, audios) = self.pipe(
            batch_size=1,
            audio_file=audio_file,
            raw_audio=raw_audio,
            slice=slice,
            start_step=start_step,
            steps=steps,
            generator=generator,
            mask_start_secs=mask_start_secs,
            mask_end_secs=mask_end_secs,
            step_generator=step_generator,
            eta=eta,
            noise=noise,
            encoding=encoding,
            return_dict=False,
        )
        return images[0], (sample_rate, audios[0])

    @staticmethod
    def loop_it(audio: np.ndarray, sample_rate: int, loops: int = 12) -> np.ndarray:
        """Loop audio

        Args:
            audio (np.ndarray): audio as numpy array
            sample_rate (int): sample rate of audio
            loops (int): number of times to loop

        Returns:
            (np.ndarray): looped audio, or None if a full bar of beats could not be detected
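
        Example (a minimal sketch; falls back to the original audio if no bar is found):
            loop = AudioDiffusion.loop_it(audio, sample_rate, loops=12)
            if loop is None:
                loop = audio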
        """
        _, beats = beat_track(y=audio, sr=sample_rate, units="samples")
        # keep only whole bars (4 beats each, assuming 4/4 time) so the loop point
        # falls on a bar boundary
        beats_in_bar = (len(beats) - 1) // 4 * 4
        if beats_in_bar > 0:
            return np.tile(audio[beats[0] : beats[beats_in_bar]], loops)
        return None
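

if __name__ == "__main__":
    # Minimal end-to-end sketch (not part of the library API): generate a sample,
    # loop it and write it to disk. The output file names are illustrative, and
    # soundfile is assumed to be installed (it normally comes in alongside librosa).
    import soundfile

    audio_diffusion = AudioDiffusion()
    image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio()
    image.save("spectrogram.png")
    loop = AudioDiffusion.loop_it(audio, sample_rate, loops=12)
    soundfile.write("sample.wav", loop if loop is not None else audio, sample_rate)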