teticio committed
Commit e66133f
1 Parent(s): 600e950

add ability to generate audio from another audio
audio_to_images.py CHANGED
@@ -80,4 +80,8 @@ if __name__ == "__main__":
     parser.add_argument("--hop_length", type=int, default=512)
     parser.add_argument("--push_to_hub", type=str, default=None)
     args = parser.parse_args()
+    if args.input_dir is None:
+        raise ValueError(
+            "You must specify an input directory for the audio files."
+        )
     main(args)
audiodiffusion/__init__.py CHANGED
@@ -1,61 +1,133 @@
+from typing import Iterable, Tuple
+
+import torch
 import numpy as np
 from PIL import Image
-from torch import cuda
+from tqdm.auto import tqdm
 from diffusers import DDPMPipeline
 from librosa.beat import beat_track
 
 from .mel import Mel
 
-VERSION = "1.0.1"
+VERSION = "1.1.1"
 
 
 class AudioDiffusion:
 
     def __init__(self,
-                 model_id="teticio/audio-diffusion-256",
-                 resolution=256,
-                 cuda=cuda.is_available()):
+                 model_id: str = "teticio/audio-diffusion-256",
+                 resolution: int = 256,
+                 cuda: bool = torch.cuda.is_available(),
+                 progress_bar: Iterable = tqdm):
         """Class for generating audio using Denoising Diffusion Probabilistic Models.
 
         Args:
            model_id (String): name of model (local directory or Hugging Face Hub)
            resolution (int): size of square mel spectrogram in pixels
            cuda (bool): use CUDA?
+           progress_bar (iterable): iterable callback for progress updates or None
         """
         self.mel = Mel(x_res=resolution, y_res=resolution)
         self.model_id = model_id
         self.ddpm = DDPMPipeline.from_pretrained(self.model_id)
         if cuda:
             self.ddpm.to("cuda")
+        self.progress_bar = progress_bar or (lambda _: _)
 
-    def generate_spectrogram_and_audio(self):
+    def generate_spectrogram_and_audio(
+            self,
+            generator: torch.Generator = None
+    ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
         """Generate random mel spectrogram and convert to audio.
 
+        Args:
+            generator (torch.Generator): random number generator or None
+
         Returns:
             PIL Image: mel spectrogram
-            (float, array): sample rate and raw audio
+            (int, np.ndarray): sample rate and raw audio
         """
-        images = self.ddpm(output_type="numpy")["sample"]
+        images = self.ddpm(output_type="numpy", generator=generator)["sample"]
+        images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
+        image = Image.fromarray(images[0][0])
+        audio = self.mel.image_to_audio(image)
+        return image, (self.mel.get_sample_rate(), audio)
+
+    @torch.no_grad()
+    def generate_spectrogram_and_audio_from_audio(
+            self,
+            audio_file: str = None,
+            raw_audio: np.ndarray = None,
+            slice: int = 0,
+            start_step: int = 0,
+            steps: int = 1000,
+            generator: torch.Generator = None
+    ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
+        """Generate mel spectrogram from audio input and convert to audio.
+
+        Args:
+            audio_file (str): must be a file on disk due to Librosa limitation or
+            raw_audio (np.ndarray): audio as numpy array
+            slice (int): slice number of audio to convert
+            start_step (int): step to start from
+            steps (int): number of de-noising steps to perform
+            generator (torch.Generator): random number generator or None
+
+        Returns:
+            PIL Image: mel spectrogram
+            (int, np.ndarray): sample rate and raw audio
+        """
+
+        # It would be better to derive a class from DDPMPipeline,
+        # but currently the return type ImagePipelineOutput cannot be imported.
+        images = torch.randn(
+            (1, self.ddpm.unet.in_channels, self.ddpm.unet.sample_size,
+             self.ddpm.unet.sample_size),
+            generator=generator,
+        )
+        if audio_file is not None or raw_audio is not None:
+            self.mel.load_audio(audio_file, raw_audio)
+            input_image = self.mel.audio_slice_to_image(slice)
+            input_image = np.frombuffer(input_image.tobytes(),
+                                        dtype="uint8").reshape(
+                                            (input_image.width,
+                                             input_image.height))
+            input_image = ((input_image / 255) * 2 - 1)
+            if start_step > 0:
+                images[0][0] = self.ddpm.scheduler.add_noise(
+                    torch.tensor(input_image[np.newaxis, np.newaxis, :]),
+                    images, steps - start_step)
+
+        images = images.to(self.ddpm.device)
+        self.ddpm.scheduler.set_timesteps(steps)
+        for t in self.progress_bar(self.ddpm.scheduler.timesteps[start_step:]):
+            model_output = self.ddpm.unet(images, t)['sample']
+            images = self.ddpm.scheduler.step(
+                model_output, t, images, generator=generator)['prev_sample']
+        images = (images / 2 + 0.5).clamp(0, 1)
+        images = images.cpu().permute(0, 2, 3, 1).numpy()
+
         images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
         image = Image.fromarray(images[0][0])
         audio = self.mel.image_to_audio(image)
         return image, (self.mel.get_sample_rate(), audio)
 
     @staticmethod
-    def loop_it(audio, sample_rate, loops=12):
+    def loop_it(audio: np.ndarray,
+                sample_rate: int,
+                loops: int = 12) -> np.ndarray:
         """Loop audio
 
         Args:
-            audio (array): audio as numpy array
+            audio (np.ndarray): audio as numpy array
             sample_rate (int): sample rate of audio
             loops (int): number of times to loop
 
         Returns:
-            (float, array): sample rate and raw audio or None
+            np.ndarray: looped raw audio or None
         """
-        tempo, beats = beat_track(y=audio, sr=sample_rate, units='samples')
-        if len(beats) > 8:
-            return np.tile(audio[beats[0]:beats[8]], loops)
-        if len(beats) > 4:
-            return np.tile(audio[beats[0]:beats[4]], loops)
+        _, beats = beat_track(y=audio, sr=sample_rate, units='samples')
+        for beats_in_bar in [16, 12, 8, 4]:
+            if len(beats) > beats_in_bar:
+                return np.tile(audio[beats[0]:beats[beats_in_bar]], loops)
         return None
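The new generate_spectrogram_and_audio_from_audio method noises the mel spectrogram of an input slice according to how many de-noising steps remain, so a larger start_step keeps the result closer to the original audio. A minimal usage sketch, assuming the package is installed; "my_track.wav" and the step values are illustrative, not part of this commit:

import torch

from audiodiffusion import AudioDiffusion

audio_diffusion = AudioDiffusion()  # defaults to teticio/audio-diffusion-256

# Fix the seed for reproducible generations (optional).
generator = torch.Generator().manual_seed(42)

# Re-generate audio from an existing track: start_step=500 (of 1000) is
# an arbitrary trade-off between fidelity to the input and novelty.
image, (sample_rate, audio) = \
    audio_diffusion.generate_spectrogram_and_audio_from_audio(
        audio_file="my_track.wav",
        slice=0,
        start_step=500,
        steps=1000,
        generator=generator)

# The result can be looped into a longer clip if enough beats are detected.
loop = AudioDiffusion.loop_it(audio, sample_rate)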
audiodiffusion/mel.py CHANGED
@@ -11,12 +11,12 @@ class Mel:
 
     def __init__(
         self,
-        x_res=256,
-        y_res=256,
-        sample_rate=22050,
-        n_fft=2048,
-        hop_length=512,
-        top_db=80,
+        x_res: int = 256,
+        y_res: int = 256,
+        sample_rate: int = 22050,
+        n_fft: int = 2048,
+        hop_length: int = 512,
+        top_db: int = 80,
     ):
         """Class to convert audio to mel spectrograms and vice versa.
 
@@ -39,15 +39,18 @@ class Mel:
         self.top_db = top_db
         self.y = None
 
-    def load_audio(self, audio_file):
+    def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
         """Load audio.
 
         Args:
-            file (str): must be a file on disk due to Librosa limitation
+            audio_file (str): must be a file on disk due to Librosa limitation or
+            raw_audio (np.ndarray): audio as numpy array
         """
-        self.y, _ = librosa.load(audio_file, mono=True)
+        self.y, _ = librosa.load(
+            audio_file,
+            mono=True) if audio_file is not None else (raw_audio, None)
 
-    def get_number_of_slices(self):
+    def get_number_of_slices(self) -> int:
         """Get number of slices in audio.
 
         Returns:
@@ -55,7 +58,18 @@ class Mel:
         """
         return len(self.y) // self.slice_size
 
-    def get_sample_rate(self):
+    def get_audio_slice(self, slice: int = 0) -> np.ndarray:
+        """Get slice of audio.
+
+        Args:
+            slice (int): slice number of audio (out of get_number_of_slices())
+
+        Returns:
+            np.ndarray: audio as numpy array
+        """
+        return self.y[self.slice_size * slice:self.slice_size * (slice + 1)]
+
+    def get_sample_rate(self) -> int:
         """Get sample rate:
 
         Returns:
@@ -63,7 +77,7 @@ class Mel:
         """
         return self.sr
 
-    def audio_slice_to_image(self, slice):
+    def audio_slice_to_image(self, slice: int) -> Image.Image:
         """Convert slice of audio to spectrogram.
 
         Args:
@@ -73,7 +87,7 @@ class Mel:
             PIL Image: grayscale image of x_res x y_res
         """
         S = librosa.feature.melspectrogram(
-            y=self.y[self.slice_size * slice:self.slice_size * (slice + 1)],
+            y=self.get_audio_slice(slice),
             sr=self.sr,
             n_fft=self.n_fft,
             hop_length=self.hop_length,
@@ -86,14 +100,14 @@ class Mel:
         image = Image.frombytes("L", log_S.shape, bytedata.tobytes())
         return image
 
-    def image_to_audio(self, image):
+    def image_to_audio(self, image: Image.Image) -> np.ndarray:
         """Converts spectrogram to audio.
 
         Args:
             image (PIL Image): x_res x y_res grayscale image
 
         Returns:
-            audio (array): raw audio
+            audio (np.ndarray): raw audio
         """
         bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape(
             (image.width, image.height))
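The Mel changes support the new entry point: load_audio now accepts either a path or an in-memory array, and get_audio_slice factors out the slicing used by audio_slice_to_image. A small round-trip sketch; "my_track.wav" is hypothetical and the silent array is just a stand-in:

import numpy as np

from audiodiffusion.mel import Mel

mel = Mel(x_res=256, y_res=256)

# Load from disk...
mel.load_audio(audio_file="my_track.wav")

# ...or pass raw audio directly, e.g. ten seconds of silence at the
# configured sample rate.
mel.load_audio(raw_audio=np.zeros(mel.get_sample_rate() * 10,
                                  dtype=np.float32))

for slice in range(mel.get_number_of_slices()):
    image = mel.audio_slice_to_image(slice)  # grayscale x_res x y_res image
    audio = mel.image_to_audio(image)        # approximate reconstruction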
notebooks/test_model.ipynb CHANGED
The diff for this file is too large to render.
tmp_model ADDED
@@ -0,0 +1 @@
+Subproject commit 3750ad3934edb6562655a80b1572c975203ff92b
train_unconditional.py CHANGED
@@ -315,5 +315,8 @@ if __name__ == "__main__":
         raise ValueError(
             "You must specify either a dataset name from the hub or a train data directory."
         )
+    if args.dataset_name is not None and args.dataset_name == args.hub_model_id:
+        raise ValueError(
+            "The local dataset name must be different from the hub model id.")
 
     main(args)