teticio committed
Commit e66133f
1 Parent(s): 600e950

add ability to generate audio from another audio
audio_to_images.py CHANGED
@@ -80,4 +80,8 @@ if __name__ == "__main__":
     parser.add_argument("--hop_length", type=int, default=512)
     parser.add_argument("--push_to_hub", type=str, default=None)
     args = parser.parse_args()
+    if args.input_dir is None:
+        raise ValueError(
+            "You must specify an input directory for the audio files."
+        )
     main(args)
audiodiffusion/__init__.py CHANGED
@@ -1,61 +1,133 @@
+from typing import Iterable, Tuple
+
+import torch
 import numpy as np
 from PIL import Image
-from torch import cuda
+from tqdm.auto import tqdm
 from diffusers import DDPMPipeline
 from librosa.beat import beat_track
 
 from .mel import Mel
 
-VERSION = "1.0.1"
+VERSION = "1.1.1"
 
 
 class AudioDiffusion:
 
     def __init__(self,
-                 model_id="teticio/audio-diffusion-256",
-                 resolution=256,
-                 cuda=cuda.is_available()):
+                 model_id: str = "teticio/audio-diffusion-256",
+                 resolution: int = 256,
+                 cuda: bool = torch.cuda.is_available(),
+                 progress_bar: Iterable = tqdm):
         """Class for generating audio using Denoising Diffusion Probabilistic Models.
 
         Args:
            model_id (String): name of model (local directory or Hugging Face Hub)
            resolution (int): size of square mel spectrogram in pixels
            cuda (bool): use CUDA?
+           progress_bar (iterable): iterable callback for progress updates or None
         """
         self.mel = Mel(x_res=resolution, y_res=resolution)
         self.model_id = model_id
         self.ddpm = DDPMPipeline.from_pretrained(self.model_id)
         if cuda:
             self.ddpm.to("cuda")
+        self.progress_bar = progress_bar or (lambda _: _)
 
-    def generate_spectrogram_and_audio(self):
+    def generate_spectrogram_and_audio(
+            self,
+            generator: torch.Generator = None
+    ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
         """Generate random mel spectrogram and convert to audio.
 
+        Args:
+            generator (torch.Generator): random number generator or None
+
         Returns:
             PIL Image: mel spectrogram
-            (float, array): sample rate and raw audio
+            (int, np.ndarray): sample rate and raw audio
         """
-        images = self.ddpm(output_type="numpy")["sample"]
+        images = self.ddpm(output_type="numpy", generator=generator)["sample"]
+        images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
+        image = Image.fromarray(images[0][0])
+        audio = self.mel.image_to_audio(image)
+        return image, (self.mel.get_sample_rate(), audio)
+
+    @torch.no_grad()
+    def generate_spectrogram_and_audio_from_audio(
+            self,
+            audio_file: str = None,
+            raw_audio: np.ndarray = None,
+            slice: int = 0,
+            start_step: int = 0,
+            steps: int = 1000,
+            generator: torch.Generator = None
+    ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
+        """Generate mel spectrogram from audio input and convert to audio.
+
+        Args:
+            audio_file (str): must be a file on disk due to Librosa limitation or
+            raw_audio (np.ndarray): audio as numpy array
+            slice (int): slice number of audio to convert
+            start_step (int): step to start from
+            steps (int): number of de-noising steps to perform
+            generator (torch.Generator): random number generator or None
+
+        Returns:
+            PIL Image: mel spectrogram
+            (int, np.ndarray): sample rate and raw audio
+        """
+
+        # It would be better to derive a class from DDPMPipeline,
+        # but currently the return type ImagePipelineOutput cannot be imported.
+        images = torch.randn(
+            (1, self.ddpm.unet.in_channels, self.ddpm.unet.sample_size,
+             self.ddpm.unet.sample_size),
+            generator=generator,
+        )
+        if audio_file is not None or raw_audio is not None:
+            self.mel.load_audio(audio_file, raw_audio)
+            input_image = self.mel.audio_slice_to_image(slice)
+            input_image = np.frombuffer(input_image.tobytes(),
+                                        dtype="uint8").reshape(
+                                            (input_image.width,
+                                             input_image.height))
+            input_image = ((input_image / 255) * 2 - 1)
+            if start_step > 0:
+                images[0][0] = self.ddpm.scheduler.add_noise(
+                    torch.tensor(input_image[np.newaxis, np.newaxis, :]),
+                    images, steps - start_step)
+
+        images = images.to(self.ddpm.device)
+        self.ddpm.scheduler.set_timesteps(steps)
+        for t in self.progress_bar(self.ddpm.scheduler.timesteps[start_step:]):
+            model_output = self.ddpm.unet(images, t)['sample']
+            images = self.ddpm.scheduler.step(
+                model_output, t, images, generator=generator)['prev_sample']
+        images = (images / 2 + 0.5).clamp(0, 1)
+        images = images.cpu().permute(0, 2, 3, 1).numpy()
+
         images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
         image = Image.fromarray(images[0][0])
         audio = self.mel.image_to_audio(image)
         return image, (self.mel.get_sample_rate(), audio)
 
     @staticmethod
-    def loop_it(audio, sample_rate, loops=12):
+    def loop_it(audio: np.ndarray,
+                sample_rate: int,
+                loops: int = 12) -> np.ndarray:
         """Loop audio
 
         Args:
-            audio (array): audio as numpy array
+            audio (np.ndarray): audio as numpy array
             sample_rate (int): sample rate of audio
             loops (int): number of times to loop
 
         Returns:
-            (float, array): sample rate and raw audio or None
+            np.ndarray: looped raw audio or None
         """
-        tempo, beats = beat_track(y=audio, sr=sample_rate, units='samples')
-        if len(beats) > 8:
-            return np.tile(audio[beats[0]:beats[8]], loops)
-        if len(beats) > 4:
-            return np.tile(audio[beats[0]:beats[4]], loops)
+        _, beats = beat_track(y=audio, sr=sample_rate, units='samples')
+        for beats_in_bar in [16, 12, 8, 4]:
+            if len(beats) > beats_in_bar:
+                return np.tile(audio[beats[0]:beats[beats_in_bar]], loops)
         return None
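The new generate_spectrogram_and_audio_from_audio method noises the mel spectrogram of an input slice according to how many de-noising steps remain, so a larger start_step keeps the result closer to the original audio. A minimal usage sketch, assuming the package is installed; "my_track.wav" and the step values are illustrative, not part of this commit:

import torch

from audiodiffusion import AudioDiffusion

audio_diffusion = AudioDiffusion()  # defaults to teticio/audio-diffusion-256

# Fix the seed for reproducible generations (optional).
generator = torch.Generator().manual_seed(42)

# Re-generate audio from an existing track: start_step=500 (of 1000) is
# an arbitrary trade-off between fidelity to the input and novelty.
image, (sample_rate, audio) = \
    audio_diffusion.generate_spectrogram_and_audio_from_audio(
        audio_file="my_track.wav",
        slice=0,
        start_step=500,
        steps=1000,
        generator=generator)

# The result can be looped into a longer clip if enough beats are detected.
loop = AudioDiffusion.loop_it(audio, sample_rate)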
audiodiffusion/mel.py CHANGED
@@ -11,12 +11,12 @@ class Mel:
 
     def __init__(
         self,
-        x_res=256,
-        y_res=256,
-        sample_rate=22050,
-        n_fft=2048,
-        hop_length=512,
-        top_db=80,
+        x_res: int = 256,
+        y_res: int = 256,
+        sample_rate: int = 22050,
+        n_fft: int = 2048,
+        hop_length: int = 512,
+        top_db: int = 80,
     ):
         """Class to convert audio to mel spectrograms and vice versa.
 
@@ -39,15 +39,18 @@ class Mel:
         self.top_db = top_db
         self.y = None
 
-    def load_audio(self, audio_file):
+    def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
         """Load audio.
 
         Args:
-            file (str): must be a file on disk due to Librosa limitation
+            audio_file (str): must be a file on disk due to Librosa limitation or
+            raw_audio (np.ndarray): audio as numpy array
         """
-        self.y, _ = librosa.load(audio_file, mono=True)
+        self.y, _ = librosa.load(
+            audio_file,
+            mono=True) if audio_file is not None else (raw_audio, None)
 
-    def get_number_of_slices(self):
+    def get_number_of_slices(self) -> int:
         """Get number of slices in audio.
 
         Returns:
@@ -55,7 +58,18 @@ class Mel:
         """
         return len(self.y) // self.slice_size
 
-    def get_sample_rate(self):
+    def get_audio_slice(self, slice: int = 0) -> np.ndarray:
+        """Get slice of audio.
+
+        Args:
+            slice (int): slice number of audio (out of get_number_of_slices())
+
+        Returns:
+            np.ndarray: audio as numpy array
+        """
+        return self.y[self.slice_size * slice:self.slice_size * (slice + 1)]
+
+    def get_sample_rate(self) -> int:
         """Get sample rate:
 
         Returns:
@@ -63,7 +77,7 @@ class Mel:
         """
         return self.sr
 
-    def audio_slice_to_image(self, slice):
+    def audio_slice_to_image(self, slice: int) -> Image.Image:
         """Convert slice of audio to spectrogram.
 
         Args:
@@ -73,7 +87,7 @@ class Mel:
             PIL Image: grayscale image of x_res x y_res
         """
         S = librosa.feature.melspectrogram(
-            y=self.y[self.slice_size * slice:self.slice_size * (slice + 1)],
+            y=self.get_audio_slice(slice),
             sr=self.sr,
             n_fft=self.n_fft,
             hop_length=self.hop_length,
@@ -86,14 +100,14 @@ class Mel:
         image = Image.frombytes("L", log_S.shape, bytedata.tobytes())
         return image
 
-    def image_to_audio(self, image):
+    def image_to_audio(self, image: Image.Image) -> np.ndarray:
         """Converts spectrogram to audio.
 
         Args:
             image (PIL Image): x_res x y_res grayscale image
 
         Returns:
-            audio (array): raw audio
+            audio (np.ndarray): raw audio
         """
         bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape(
             (image.width, image.height))
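The Mel changes support the new entry point: load_audio now accepts either a path or an in-memory array, and get_audio_slice factors out the slicing used by audio_slice_to_image. A small round-trip sketch; "my_track.wav" is hypothetical and the silent array is just a stand-in:

import numpy as np

from audiodiffusion.mel import Mel

mel = Mel(x_res=256, y_res=256)

# Load from disk...
mel.load_audio(audio_file="my_track.wav")

# ...or pass raw audio directly, e.g. ten seconds of silence at the
# configured sample rate.
mel.load_audio(raw_audio=np.zeros(mel.get_sample_rate() * 10,
                                  dtype=np.float32))

for slice in range(mel.get_number_of_slices()):
    image = mel.audio_slice_to_image(slice)  # grayscale x_res x y_res image
    audio = mel.image_to_audio(image)        # approximate reconstruction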
notebooks/test_model.ipynb CHANGED
The diff for this file is too large to render.
tmp_model ADDED
@@ -0,0 +1 @@
+Subproject commit 3750ad3934edb6562655a80b1572c975203ff92b
train_unconditional.py CHANGED
@@ -315,5 +315,8 @@ if __name__ == "__main__":
         raise ValueError(
             "You must specify either a dataset name from the hub or a train data directory."
         )
+    if args.dataset_name is not None and args.dataset_name == args.hub_model_id:
+        raise ValueError(
+            "The local dataset name must be different from the hub model id.")
 
     main(args)