Woleek committed on
Commit
c4e7950
1 Parent(s): 41e2411
README.md CHANGED
@@ -1,9 +1,10 @@
 ---
-title: Image Based Soundtrack Generation
-emoji: 🦀
-colorFrom: indigo
-colorTo: green
+title: Image-based soundtrack generation
+emoji: 🎶
+colorFrom: purple
+colorTo: blue
 sdk: gradio
+python_version: 3.10.8
 sdk_version: 4.5.0
 app_file: app.py
 pinned: false
app.py ADDED
@@ -0,0 +1,74 @@
+import torch
+import gradio as gr
+from transformers import ViTImageProcessor, ViTModel
+from audiodiffusion import AudioDiffusionPipeline, ImageEncoder
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+generator1 = torch.Generator(device)
+generator2 = torch.Generator(device)
+
+pipe = AudioDiffusionPipeline.from_pretrained('Woleek/clMusDiff').to(device)
+
+processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
+extractor = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
+image_encoder = ImageEncoder(processor, extractor)
+
+def _encode_image(image):
+    return torch.unsqueeze(image_encoder.encode(image), axis=1).to(device)
+
+def _generate_spectrogram(condition, steps, eta):
+    images, (sample_rate, audios) = pipe(
+        batch_size=1,
+        steps=steps,
+        generator=generator1,
+        step_generator=generator2,
+        encoding=condition,
+        eta=eta,
+        return_dict=False,
+    )
+    return images[0], (sample_rate, audios[0])
+
+def run_generation(image, steps, eta):
+    condition = _encode_image(image)
+    spectrogram, (sr, audio) = _generate_spectrogram(condition, steps, eta)
+    return spectrogram, (sr, audio)
+
+with gr.Blocks(title="Image-based soundtrack generation") as demo:
+    gr.Markdown('''
+    # Image-based soundtrack generation
+    ''')
+    with gr.Row():
+        with gr.Column():
+            image = gr.Image(
+                type="pil",
+                label="Conditioning image"
+            )
+            steps = gr.Slider(
+                minimum=1,
+                maximum=1000,
+                step=1,
+                value=50,
+                label="Denoising steps"
+            )
+            eta = gr.Slider(
+                minimum=0.1,
+                maximum=1.0,
+                step=0.1,
+                value=0.9,
+                label="η"
+            )
+            gr.Markdown('''
+            Eta (η) is a variable that controls the level of interpolation between a deterministic DDIM (η=0.0) and a stochastic DDPM (η=1.0).
+            ''')
+            btn = gr.Button("Generate")
+            clear = gr.ClearButton(image)
+        with gr.Column():
+            spectrogram = gr.Image(
+                label="Generated Mel spectrogram"
+            )
+            audio = gr.Audio(
+                label="Resulting audio"
+            )
+    btn.click(run_generation, inputs=[image, steps, eta], outputs=[spectrogram, audio])
+
+demo.launch()
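For context: app.py encodes the uploaded picture into a single ViT [CLS] embedding, adds a sequence axis so it matches the (batch_size, seq_length, cross_attention_dim) shape the conditional UNet expects, and passes it to the pipeline as `encoding`. A minimal sketch of the same call outside Gradio, assuming the `pipe`, `image_encoder`, and `device` objects defined above and a hypothetical example.jpg:

from PIL import Image

img = Image.open("example.jpg")                           # hypothetical input image
cond = image_encoder.encode(img).unsqueeze(1).to(device)  # shape (1, 1, hidden_size)
images, (sr, audios) = pipe(
    batch_size=1,
    steps=50,            # DDIM default, matches the slider's initial value
    encoding=cond,
    eta=0.9,
    return_dict=False,
)
images[0].save("spectrogram.png")  # mel spectrogram as a PIL image
# audios[0] is the raw waveform (np.ndarray) at sample rate sr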
audiodiffusion/__init__.py ADDED
@@ -0,0 +1,141 @@
+from typing import Iterable, Tuple
+
+import numpy as np
+import torch
+from librosa.beat import beat_track
+from PIL import Image
+from tqdm.auto import tqdm
+
+# from diffusers import AudioDiffusionPipeline
+from .pipeline_audio_diffusion import AudioDiffusionPipeline
+from .image_encoder import ImageEncoder
+
+VERSION = "1.5.6"
+
+
+class AudioDiffusion:
+    def __init__(
+        self,
+        model_id: str = "teticio/audio-diffusion-256",
+        cuda: bool = torch.cuda.is_available(),
+        progress_bar: Iterable = tqdm,
+    ):
+        """Class for generating audio using De-noising Diffusion Probabilistic Models.
+
+        Args:
+            model_id (String): name of model (local directory or Hugging Face Hub)
+            cuda (bool): use CUDA?
+            progress_bar (iterable): iterable callback for progress updates or None
+        """
+        self.model_id = model_id
+        self.pipe = AudioDiffusionPipeline.from_pretrained(self.model_id)
+        if cuda:
+            self.pipe.to("cuda")
+        self.progress_bar = progress_bar or (lambda _: _)
+
+    def generate_spectrogram_and_audio(
+        self,
+        steps: int = None,
+        generator: torch.Generator = None,
+        step_generator: torch.Generator = None,
+        eta: float = 0,
+        noise: torch.Tensor = None,
+        encoding: torch.Tensor = None,
+    ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
+        """Generate random mel spectrogram and convert to audio.
+
+        Args:
+            steps (int): number of de-noising steps (defaults to 50 for DDIM, 1000 for DDPM)
+            generator (torch.Generator): random number generator or None
+            step_generator (torch.Generator): random number generator used to de-noise or None
+            eta (float): parameter between 0 and 1 used with DDIM scheduler
+            noise (torch.Tensor): noisy image or None
+            encoding (`torch.Tensor`): for UNet2DConditionModel shape (batch_size, seq_length, cross_attention_dim)
+
+        Returns:
+            PIL Image: mel spectrogram
+            (float, np.ndarray): sample rate and raw audio
+        """
+        images, (sample_rate, audios) = self.pipe(
+            batch_size=1,
+            steps=steps,
+            generator=generator,
+            step_generator=step_generator,
+            eta=eta,
+            noise=noise,
+            encoding=encoding,
+            return_dict=False,
+        )
+        return images[0], (sample_rate, audios[0])
+
+    def generate_spectrogram_and_audio_from_audio(
+        self,
+        audio_file: str = None,
+        raw_audio: np.ndarray = None,
+        slice: int = 0,
+        start_step: int = 0,
+        steps: int = None,
+        generator: torch.Generator = None,
+        mask_start_secs: float = 0,
+        mask_end_secs: float = 0,
+        step_generator: torch.Generator = None,
+        eta: float = 0,
+        encoding: torch.Tensor = None,
+        noise: torch.Tensor = None,
+    ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
+        """Generate random mel spectrogram from audio input and convert to audio.
+
+        Args:
+            audio_file (str): must be a file on disk due to Librosa limitation or
+            raw_audio (np.ndarray): audio as numpy array
+            slice (int): slice number of audio to convert
+            start_step (int): step to start from
+            steps (int): number of de-noising steps (defaults to 50 for DDIM, 1000 for DDPM)
+            generator (torch.Generator): random number generator or None
+            mask_start_secs (float): number of seconds of audio to mask (not generate) at start
+            mask_end_secs (float): number of seconds of audio to mask (not generate) at end
+            step_generator (torch.Generator): random number generator used to de-noise or None
+            eta (float): parameter between 0 and 1 used with DDIM scheduler
+            encoding (`torch.Tensor`): for UNet2DConditionModel shape (batch_size, seq_length, cross_attention_dim)
+            noise (torch.Tensor): noisy image or None
+
+        Returns:
+            PIL Image: mel spectrogram
+            (float, np.ndarray): sample rate and raw audio
+        """
+
+        images, (sample_rate, audios) = self.pipe(
+            batch_size=1,
+            audio_file=audio_file,
+            raw_audio=raw_audio,
+            slice=slice,
+            start_step=start_step,
+            steps=steps,
+            generator=generator,
+            mask_start_secs=mask_start_secs,
+            mask_end_secs=mask_end_secs,
+            step_generator=step_generator,
+            eta=eta,
+            noise=noise,
+            encoding=encoding,
+            return_dict=False,
+        )
+        return images[0], (sample_rate, audios[0])
+
+    @staticmethod
+    def loop_it(audio: np.ndarray, sample_rate: int, loops: int = 12) -> np.ndarray:
+        """Loop audio
+
+        Args:
+            audio (np.ndarray): audio as numpy array
+            sample_rate (int): sample rate of audio
+            loops (int): number of times to loop
+
+        Returns:
+            (float, np.ndarray): sample rate and raw audio or None
+        """
+        _, beats = beat_track(y=audio, sr=sample_rate, units="samples")
+        beats_in_bar = (len(beats) - 1) // 4 * 4
+        if beats_in_bar > 0:
+            return np.tile(audio[beats[0] : beats[beats_in_bar]], loops)
+        return None
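For context: AudioDiffusion is a thin convenience wrapper around the pipeline, and loop_it uses librosa beat tracking to cut the generated clip on bar boundaries before tiling it. A sketch of unconditional use, assuming the default checkpoint can be downloaded:

from audiodiffusion import AudioDiffusion

ad = AudioDiffusion()                       # defaults to "teticio/audio-diffusion-256"
image, (sr, audio) = ad.generate_spectrogram_and_audio()
looped = AudioDiffusion.loop_it(audio, sr)  # bar-aligned loop, or None if too few beats are found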
audiodiffusion/image_encoder.py ADDED
@@ -0,0 +1,21 @@
+import torch
+from diffusers import ConfigMixin, Mel, ModelMixin
+
+class ImageEncoder(ModelMixin, ConfigMixin):
+    def __init__(self, image_processor, encoder_model):
+        super().__init__()
+        self.processor = image_processor
+        self.encoder = encoder_model
+        self.eval()
+
+    def forward(self, x):
+        x = self.encoder(x)
+        return x
+
+    @torch.no_grad()
+    def encode(self, image):
+        x = self.processor(image, return_tensors="pt")['pixel_values']
+        y = self(x)
+        y = y.last_hidden_state
+        embeddings = y[:, 0, :]
+        return embeddings
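For context: encode runs the processor, forwards the pixel values through the ViT, and keeps only the [CLS] token of last_hidden_state, so each image becomes a single vector (hidden size 768 for ViT-base). A sketch, assuming the same ViT weights used in app.py and a hypothetical example.jpg:

from PIL import Image
from transformers import ViTImageProcessor, ViTModel
from audiodiffusion import ImageEncoder

encoder = ImageEncoder(
    ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k"),
    ViTModel.from_pretrained("google/vit-base-patch16-224-in21k"),
)
embedding = encoder.encode(Image.open("example.jpg"))  # shape (1, 768): the [CLS] embedding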
audiodiffusion/mel.py ADDED
@@ -0,0 +1,169 @@
+# This code has been migrated to diffusers but can be run locally with
+# pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-256", custom_pipeline="audio-diffusion/audiodiffusion/pipeline_audio_diffusion.py")
+
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import warnings
+from typing import Callable, Union
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+
+warnings.filterwarnings("ignore")
+
+import numpy as np  # noqa: E402
+
+
+try:
+    import librosa  # noqa: E402
+
+    _librosa_can_be_imported = True
+    _import_error = ""
+except Exception as e:
+    _librosa_can_be_imported = False
+    _import_error = (
+        f"Cannot import librosa because {e}. Make sure to correctly install librosa to be able to use it."
+    )
+
+
+from PIL import Image  # noqa: E402


+class Mel(ConfigMixin, SchedulerMixin):
+    """
+    Parameters:
+        x_res (`int`): x resolution of spectrogram (time)
+        y_res (`int`): y resolution of spectrogram (frequency bins)
+        sample_rate (`int`): sample rate of audio
+        n_fft (`int`): number of Fast Fourier Transforms
+        hop_length (`int`): hop length (a higher number is recommended for lower than 256 y_res)
+        top_db (`int`): loudest value in decibels
+        n_iter (`int`): number of iterations for Griffin-Lim mel inversion
+    """
+
+    config_name = "mel_config.json"
+
+    @register_to_config
+    def __init__(
+        self,
+        x_res: int = 256,
+        y_res: int = 256,
+        sample_rate: int = 22050,
+        n_fft: int = 2048,
+        hop_length: int = 512,
+        top_db: int = 80,
+        n_iter: int = 32,
+    ):
+        self.hop_length = hop_length
+        self.sr = sample_rate
+        self.n_fft = n_fft
+        self.top_db = top_db
+        self.n_iter = n_iter
+        self.set_resolution(x_res, y_res)
+        self.audio = None
+
+        if not _librosa_can_be_imported:
+            raise ValueError(_import_error)
+
+    def set_resolution(self, x_res: int, y_res: int):
+        """Set resolution.
+
+        Args:
+            x_res (`int`): x resolution of spectrogram (time)
+            y_res (`int`): y resolution of spectrogram (frequency bins)
+        """
+        self.x_res = x_res
+        self.y_res = y_res
+        self.n_mels = self.y_res
+        self.slice_size = self.x_res * self.hop_length - 1
+
+    def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
+        """Load audio.
+
+        Args:
+            audio_file (`str`): must be a file on disk due to Librosa limitation or
+            raw_audio (`np.ndarray`): audio as numpy array
+        """
+        if audio_file is not None:
+            self.audio, _ = librosa.load(audio_file, mono=True, sr=self.sr)
+        else:
+            self.audio = raw_audio
+
+        # Pad with silence if necessary.
+        if len(self.audio) < self.x_res * self.hop_length:
+            self.audio = np.concatenate([self.audio, np.zeros((self.x_res * self.hop_length - len(self.audio),))])
+
+    def get_number_of_slices(self) -> int:
+        """Get number of slices in audio.
+
+        Returns:
+            `int`: number of spectrograms audio can be sliced into
+        """
+        return len(self.audio) // self.slice_size
+
+    def get_audio_slice(self, slice: int = 0) -> np.ndarray:
+        """Get slice of audio.
+
+        Args:
+            slice (`int`): slice number of audio (out of get_number_of_slices())
+
+        Returns:
+            `np.ndarray`: audio as numpy array
+        """
+        return self.audio[self.slice_size * slice : self.slice_size * (slice + 1)]
+
+    def get_sample_rate(self) -> int:
+        """Get sample rate.
+
+        Returns:
+            `int`: sample rate of audio
+        """
+        return self.sr
+
+    def audio_slice_to_image(self, slice: int, ref: Union[float, Callable] = np.max) -> Image.Image:
+        """Convert slice of audio to spectrogram.
+
+        Args:
+            slice (`int`): slice number of audio to convert (out of get_number_of_slices())
+            ref (`Union[float, Callable]`): reference value for spectrogram
+
+        Returns:
+            `PIL Image`: grayscale image of x_res x y_res
+        """
+        S = librosa.feature.melspectrogram(
+            y=self.get_audio_slice(slice), sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels
+        )
+        log_S = librosa.power_to_db(S, ref=ref, top_db=self.top_db)
+        bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5).astype(np.uint8)
+        image = Image.fromarray(bytedata)
+        return image
+
+    def image_to_audio(self, image: Image.Image) -> np.ndarray:
+        """Converts spectrogram to audio.
+
+        Args:
+            image (`PIL Image`): x_res x y_res grayscale image
+
+        Returns:
+            audio (`np.ndarray`): raw audio
+        """
+        bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape((image.height, image.width))
+        log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
+        S = librosa.db_to_power(log_S)
+        audio = librosa.feature.inverse.mel_to_audio(
+            S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_iter=self.n_iter
+        )
+        return audio
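For context: with the defaults (x_res=256, hop_length=512, sample_rate=22050) each slice covers roughly 256 × 512 / 22050 ≈ 5.9 s of audio, and image_to_audio inverts the mel spectrogram with Griffin-Lim, so the round trip is lossy. A sketch with a hypothetical track.wav:

from audiodiffusion.mel import Mel

mel = Mel()                          # 256x256 spectrograms at 22050 Hz
mel.load_audio("track.wav")          # hypothetical input file
img = mel.audio_slice_to_image(0)    # grayscale 256x256 PIL image
recovered = mel.image_to_audio(img)  # np.ndarray, Griffin-Lim reconstruction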
audiodiffusion/pipeline_audio_diffusion.py ADDED
@@ -0,0 +1,257 @@
+# This code has been migrated to diffusers but can be run locally with
+# pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-256", custom_pipeline="audio-diffusion/audiodiffusion/pipeline_audio_diffusion.py")
+
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from math import acos, sin
+from typing import List, Tuple, Union
+
+import numpy as np
+import torch
+from diffusers import (
+    AudioPipelineOutput,
+    AutoencoderKL,
+    DDIMScheduler,
+    DDPMScheduler,
+    DiffusionPipeline,
+    ImagePipelineOutput,
+    UNet2DConditionModel,
+)
+from diffusers.utils import BaseOutput
+from PIL import Image
+
+from .mel import Mel
+
+class AudioDiffusionPipeline(DiffusionPipeline):
+    """
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Parameters:
+        vqvae ([`AutoencoderKL`]): Variational AutoEncoder for Latent Audio Diffusion or None
+        unet ([`UNet2DConditionModel`]): UNET model
+        mel ([`Mel`]): transform audio <-> spectrogram
+        scheduler ([`DDIMScheduler` or `DDPMScheduler`]): de-noising scheduler
+    """
+
+    _optional_components = ["vqvae"]
+
+    def __init__(
+        self,
+        vqvae: AutoencoderKL,
+        unet: UNet2DConditionModel,
+        mel: Mel,
+        scheduler: Union[DDIMScheduler, DDPMScheduler],
+    ):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler, mel=mel, vqvae=vqvae)
+
+    def get_default_steps(self) -> int:
+        """Returns default number of steps recommended for inference
+
+        Returns:
+            `int`: number of steps
+        """
+        return 50 if isinstance(self.scheduler, DDIMScheduler) else 1000
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        batch_size: int = 1,
+        audio_file: str = None,
+        raw_audio: np.ndarray = None,
+        slice: int = 0,
+        start_step: int = 0,
+        steps: int = None,
+        generator: torch.Generator = None,
+        mask_start_secs: float = 0,
+        mask_end_secs: float = 0,
+        step_generator: torch.Generator = None,
+        eta: float = 0,
+        noise: torch.Tensor = None,
+        encoding: torch.Tensor = None,
+        return_dict=True,
+    ) -> Union[
+        Union[AudioPipelineOutput, ImagePipelineOutput],
+        Tuple[List[Image.Image], Tuple[int, List[np.ndarray]]],
+    ]:
+        """Generate random mel spectrogram from audio input and convert to audio.
+
+        Args:
+            batch_size (`int`): number of samples to generate
+            audio_file (`str`): must be a file on disk due to Librosa limitation or
+            raw_audio (`np.ndarray`): audio as numpy array
+            slice (`int`): slice number of audio to convert
+            start_step (int): step to start from
+            steps (`int`): number of de-noising steps (defaults to 50 for DDIM, 1000 for DDPM)
+            generator (`torch.Generator`): random number generator or None
+            mask_start_secs (`float`): number of seconds of audio to mask (not generate) at start
+            mask_end_secs (`float`): number of seconds of audio to mask (not generate) at end
+            step_generator (`torch.Generator`): random number generator used to de-noise or None
+            eta (`float`): parameter between 0 and 1 used with DDIM scheduler
+            noise (`torch.Tensor`): noise tensor of shape (batch_size, 1, height, width) or None
+            encoding (`torch.Tensor`): for UNet2DConditionModel shape (batch_size, seq_length, cross_attention_dim)
+            return_dict (`bool`): if True return AudioPipelineOutput, ImagePipelineOutput else Tuple
+
+        Returns:
+            `List[PIL Image]`: mel spectrograms (`float`, `List[np.ndarray]`): sample rate and raw audios
+        """
+
+        steps = steps or self.get_default_steps()
+        self.scheduler.set_timesteps(steps)
+        step_generator = step_generator or generator
+        # For backwards compatibility
+        if type(self.unet.sample_size) == int:
+            self.unet.sample_size = (self.unet.sample_size, self.unet.sample_size)
+        if noise is None:
+            noise = torch.randn(
+                (
+                    batch_size,
+                    self.unet.in_channels,
+                    self.unet.sample_size[0],
+                    self.unet.sample_size[1],
+                ),
+                generator=generator,
+                device=self.device,
+            )
+        images = noise
+        mask = None
+
+        if audio_file is not None or raw_audio is not None:
+            self.mel.load_audio(audio_file, raw_audio)
+            input_image = self.mel.audio_slice_to_image(slice)
+            input_image = np.frombuffer(input_image.tobytes(), dtype="uint8").reshape(
+                (input_image.height, input_image.width)
+            )
+            input_image = (input_image / 255) * 2 - 1
+            input_images = torch.tensor(input_image[np.newaxis, :, :], dtype=torch.float).to(self.device)
+
+            if self.vqvae is not None:
+                input_images = self.vqvae.encode(torch.unsqueeze(input_images, 0)).latent_dist.sample(
+                    generator=generator
+                )[0]
+                input_images = 0.18215 * input_images
+
+            if start_step > 0:
+                images[0, 0] = self.scheduler.add_noise(input_images, noise, self.scheduler.timesteps[start_step - 1])
+
+            pixels_per_second = (
+                self.unet.sample_size[1] * self.mel.get_sample_rate() / self.mel.x_res / self.mel.hop_length
+            )
+            mask_start = int(mask_start_secs * pixels_per_second)
+            mask_end = int(mask_end_secs * pixels_per_second)
+            mask = self.scheduler.add_noise(input_images, noise, torch.tensor(self.scheduler.timesteps[start_step:]))
+
+        for step, t in enumerate(self.progress_bar(self.scheduler.timesteps[start_step:])):
+            if isinstance(self.unet, UNet2DConditionModel):
+                model_output = self.unet(images, t, encoding)["sample"]
+            else:
+                model_output = self.unet(images, t)["sample"]
+
+            if isinstance(self.scheduler, DDIMScheduler):
+                images = self.scheduler.step(
+                    model_output=model_output,
+                    timestep=t,
+                    sample=images,
+                    eta=eta,
+                    generator=step_generator,
+                )["prev_sample"]
+            else:
+                images = self.scheduler.step(
+                    model_output=model_output,
+                    timestep=t,
+                    sample=images,
+                    generator=step_generator,
+                )["prev_sample"]
+
+            if mask is not None:
+                if mask_start > 0:
+                    images[:, :, :, :mask_start] = mask[:, step, :, :mask_start]
+                if mask_end > 0:
+                    images[:, :, :, -mask_end:] = mask[:, step, :, -mask_end:]
+
+        if self.vqvae is not None:
+            # 0.18215 was scaling factor used in training to ensure unit variance
+            images = 1 / 0.18215 * images
+            images = self.vqvae.decode(images)["sample"]
+
+        images = (images / 2 + 0.5).clamp(0, 1)
+        images = images.cpu().permute(0, 2, 3, 1).numpy()
+        images = (images * 255).round().astype("uint8")
+        images = list(
+            map(lambda _: Image.fromarray(_[:, :, 0]), images)
+            if images.shape[3] == 1
+            else map(lambda _: Image.fromarray(_, mode="RGB").convert("L"), images)
+        )
+
+        audios = list(map(lambda _: self.mel.image_to_audio(_), images))
+        if not return_dict:
+            return images, (self.mel.get_sample_rate(), audios)
+
+        return BaseOutput(**AudioPipelineOutput(np.array(audios)[:, np.newaxis, :]), **ImagePipelineOutput(images))
+
+    @torch.no_grad()
+    def encode(self, images: List[Image.Image], steps: int = 50) -> np.ndarray:
+        """Reverse step process: recover noisy image from generated image.
+
+        Args:
+            images (`List[PIL Image]`): list of images to encode
+            steps (`int`): number of encoding steps to perform (defaults to 50)
+
+        Returns:
+            `np.ndarray`: noise tensor of shape (batch_size, 1, height, width)
+        """
+
+        # Only works with DDIM as this method is deterministic
+        assert isinstance(self.scheduler, DDIMScheduler)
+        self.scheduler.set_timesteps(steps)
+        sample = np.array(
+            [np.frombuffer(image.tobytes(), dtype="uint8").reshape((1, image.height, image.width)) for image in images]
+        )
+        sample = (sample / 255) * 2 - 1
+        sample = torch.Tensor(sample).to(self.device)
+
+        for t in self.progress_bar(torch.flip(self.scheduler.timesteps, (0,))):
+            prev_timestep = t - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
+            alpha_prod_t = self.scheduler.alphas_cumprod[t]
+            alpha_prod_t_prev = (
+                self.scheduler.alphas_cumprod[prev_timestep]
+                if prev_timestep >= 0
+                else self.scheduler.final_alpha_cumprod
+            )
+            beta_prod_t = 1 - alpha_prod_t
+            model_output = self.unet(sample, t)["sample"]
+            pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * model_output
+            sample = (sample - pred_sample_direction) * alpha_prod_t_prev ** (-0.5)
+            sample = sample * alpha_prod_t ** (0.5) + beta_prod_t ** (0.5) * model_output
+
+        return sample
+
+    @staticmethod
+    def slerp(x0: torch.Tensor, x1: torch.Tensor, alpha: float) -> torch.Tensor:
+        """Spherical Linear intERPolation
+
+        Args:
+            x0 (`torch.Tensor`): first tensor to interpolate between
+            x1 (`torch.Tensor`): second tensor to interpolate between
+            alpha (`float`): interpolation between 0 and 1
+
+        Returns:
+            `torch.Tensor`: interpolated tensor
+        """
+
+        theta = acos(torch.dot(torch.flatten(x0), torch.flatten(x1)) / torch.norm(x0) / torch.norm(x1))
+        return sin((1 - alpha) * theta) * x0 / sin(theta) + sin(alpha * theta) * x1 / sin(theta)
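For context: when the UNet is a UNet2DConditionModel, the `encoding` tensor is used as cross-attention context at every denoising step, and slerp can blend two such tensors (or two noise tensors). A sketch, assuming a conditional pipeline loaded as `pipe` (as in app.py) and two image encodings `enc_a` and `enc_b` of shape (1, 1, cross_attention_dim):

mixed = AudioDiffusionPipeline.slerp(enc_a, enc_b, 0.5)  # halfway between the two conditions
images, (sr, audios) = pipe(
    batch_size=1, steps=50, encoding=mixed, eta=0.0, return_dict=False
)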
audiodiffusion/utils.py ADDED
@@ -0,0 +1,303 @@
+# adapted from https://github.com/huggingface/diffusers/blob/main/scripts/convert_original_stable_diffusion_to_diffusers.py
+
+import torch
+from diffusers import AutoencoderKL
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("q.weight", "query.weight")
+        new_item = new_item.replace("q.bias", "query.bias")
+
+        new_item = new_item.replace("k.weight", "key.weight")
+        new_item = new_item.replace("k.bias", "key.bias")
+
+        new_item = new_item.replace("v.weight", "value.weight")
+        new_item = new_item.replace("v.bias", "value.bias")
+
+        new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
+        new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def assign_to_checkpoint(
+    paths,
+    checkpoint,
+    old_checkpoint,
+    attention_paths_to_split=None,
+    additional_replacements=None,
+    config=None,
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming
+    to them. It splits attention layers, and takes into account additional replacements
+    that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        if "proj_attn.weight" in new_path:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["query.weight", "key.weight", "value.weight"]
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0, 0]
+        elif "proj_attn.weight" in key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0]
+
+
+def create_vae_diffusers_config(original_config):
+    """
+    Creates a config for the diffusers based on the config of the LDM model.
+    """
+    vae_params = original_config.model.params.ddconfig
+    _ = original_config.model.params.embed_dim
+
+    block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult]
+    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
+    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
+
+    config = dict(
+        sample_size=tuple(vae_params.resolution),
+        in_channels=vae_params.in_channels,
+        out_channels=vae_params.out_ch,
+        down_block_types=tuple(down_block_types),
+        up_block_types=tuple(up_block_types),
+        block_out_channels=tuple(block_out_channels),
+        latent_channels=vae_params.z_channels,
+        layers_per_block=vae_params.num_res_blocks,
+    )
+    return config
+
+
+def convert_ldm_vae_checkpoint(checkpoint, config):
+    # extract state dict for VAE
+    vae_state_dict = checkpoint
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(
+            paths,
+            new_checkpoint,
+            vae_state_dict,
+            additional_replacements=[meta_path],
+            config=config,
+        )
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(
+            paths,
+            new_checkpoint,
+            vae_state_dict,
+            additional_replacements=[meta_path],
+            config=config,
+        )
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        vae_state_dict,
+        additional_replacements=[meta_path],
+        config=config,
+    )
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(
+            paths,
+            new_checkpoint,
+            vae_state_dict,
+            additional_replacements=[meta_path],
+            config=config,
+        )
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(
+            paths,
+            new_checkpoint,
+            vae_state_dict,
+            additional_replacements=[meta_path],
+            config=config,
+        )
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        paths,
+        new_checkpoint,
+        vae_state_dict,
+        additional_replacements=[meta_path],
+        config=config,
+    )
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+
+
+def convert_ldm_to_hf_vae(ldm_checkpoint, ldm_config, hf_checkpoint, sample_size):
+    checkpoint = torch.load(ldm_checkpoint)["state_dict"]
+
+    # Convert the VAE model.
+    vae_config = create_vae_diffusers_config(ldm_config)
+    converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+
+    vae = AutoencoderKL(**vae_config)
+    vae.load_state_dict(converted_vae_checkpoint)
+    vae.save_pretrained(hf_checkpoint)
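For context: convert_ldm_to_hf_vae expects the original LDM autoencoder checkpoint plus its training config as an object with attribute access (an OmegaConf-style config), and writes out a diffusers AutoencoderKL; note that the sample_size argument is currently unused inside the function. A hypothetical usage sketch (the paths and the omegaconf dependency are assumptions, not part of this commit):

from omegaconf import OmegaConf
from audiodiffusion.utils import convert_ldm_to_hf_vae

ldm_config = OmegaConf.load("ldm_autoencoder.yaml")  # original LDM training config
convert_ldm_to_hf_vae(
    "ldm_autoencoder.ckpt", ldm_config, "converted_vae", sample_size=256
)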
requirements.txt ADDED
@@ -0,0 +1,7 @@
+torch==2.0.1
+gradio==4.5.0
+transformers==4.35.2
+numpy==1.23.5
+Pillow==9.3.0
+diffusers==0.23.1
+librosa==0.10.1