	allow steps to be different from 1000
Files changed:
- audiodiffusion/__init__.py  +16 -11
- notebooks/test_model.ipynb  +0 -0
audiodiffusion/__init__.py  CHANGED

@@ -4,12 +4,12 @@ import torch
 import numpy as np
 from PIL import Image
 from tqdm.auto import tqdm
-from diffusers import DDPMPipeline
 from librosa.beat import beat_track
+from diffusers import DDPMPipeline, DDPMScheduler
 
 from .mel import Mel
 
-VERSION = "1.1.2"
+VERSION = "1.1.3"
 
 
 class AudioDiffusion:
@@ -60,7 +60,7 @@ class AudioDiffusion:
             raw_audio: np.ndarray = None,
             slice: int = 0,
             start_step: int = 0,
-            steps: int = 1000,
+            steps: int = None,
             generator: torch.Generator = None
     ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
         """Generate random mel spectrogram from audio input and convert to audio.
@@ -70,7 +70,7 @@ class AudioDiffusion:
             raw_audio (np.ndarray): audio as numpy array
             slice (int): slice number of audio to convert
             start_step (int): step to start from
-            steps (int): number of de-noising steps to perform
+            steps (int): number of de-noising steps to perform (defaults to num_train_timesteps)
             generator (torch.Generator): random number generator or None
 
         Returns:
@@ -80,6 +80,10 @@ class AudioDiffusion:
 
         # It would be better to derive a class from DDPMDiffusionPipeline
         # but currently the return type ImagePipelineOutput cannot be imported.
+        if steps is None:
+            steps = self.ddpm.scheduler.num_train_timesteps
+        scheduler = DDPMScheduler(num_train_timesteps=steps)
+        scheduler.set_timesteps(steps)
         images = torch.randn(
             (1, self.ddpm.unet.in_channels, self.ddpm.unet.sample_size,
              self.ddpm.unet.sample_size),
@@ -94,16 +98,17 @@ class AudioDiffusion:
                                                  input_image.height))
             input_image = ((input_image / 255) * 2 - 1)
             if start_step > 0:
-                images[0][0] = self.ddpm.scheduler.add_noise(
-                    torch.tensor(input_image[np.newaxis, np.newaxis, :]),
-                    images, steps - start_step)
+                images[0][0] = scheduler.add_noise(
+                    torch.tensor(input_image[np.newaxis, np.newaxis, :]),
+                    images, steps - start_step)
 
         images = images.to(self.ddpm.device)
-        self.ddpm.scheduler.set_timesteps(steps)
-        for t in self.progress_bar(self.ddpm.scheduler.timesteps[start_step:]):
+        for t in self.progress_bar(scheduler.timesteps[start_step:]):
             model_output = self.ddpm.unet(images, t)['sample']
-            images = self.ddpm.scheduler.step(
-                model_output, t, images, generator=generator)['prev_sample']
+            images = scheduler.step(model_output,
+                                    t,
+                                    images,
+                                    generator=generator)['prev_sample']
         images = (images / 2 + 0.5).clamp(0, 1)
         images = images.cpu().permute(0, 2, 3, 1).numpy()
 
    	
notebooks/test_model.ipynb  CHANGED

The diff for this file is too large to render. See the raw diff.
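A hypothetical caller-side sketch of the effect of the change: only the steps keyword and its None default come from the diff above; the constructor argument, model id and method name are assumptions for illustration and are not visible in this diff.

    import librosa
    from audiodiffusion import AudioDiffusion

    # model_id and the method name are illustrative assumptions, not shown in
    # this diff; only the steps keyword comes from this commit.
    audio_diffusion = AudioDiffusion(model_id="teticio/audio-diffusion-256")

    raw_audio, _ = librosa.load("example.wav", mono=True)

    # steps can now be any value; it previously had to match the scheduler's
    # 1000 training timesteps.
    image, (sample_rate, audio) = \
        audio_diffusion.generate_spectrogram_and_audio_from_audio(
            raw_audio=raw_audio, slice=0, start_step=0, steps=50)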