teticio committed
Commit c17b696
1 Parent(s): 8c27376

add streamlit app and tidy

.gitignore CHANGED
@@ -4,3 +4,5 @@ __pycache__
 data*
 ddpm-ema-audio-*
 flagged
+build
+audiodiffusion.egg-info
README.md CHANGED
@@ -28,7 +28,7 @@ You can play around with the model I trained on about 500 songs from my Spotify
 #### Training can be run with Mel spectrograms of resolution 64x64 on a single commercial grade GPU (e.g. RTX 2080 Ti). The `hop_length` should be set to 1024 for better results.
 
 ```bash
-python src/audio_to_images.py \
+python audiodiffusion/audio_to_images.py \
   --resolution 64 \
   --hop_length 1024\
   --input_dir path-to-audio-files \
@@ -38,7 +38,7 @@ python src/audio_to_images.py \
 #### Generate dataset of 256x256 Mel spectrograms and push to hub (you will need to be authenticated with `huggingface-cli login`).
 
 ```bash
-python src/audio_to_images.py \
+python audiodiffusion/audio_to_images.py \
   --resolution 256 \
   --input_dir path-to-audio-files \
   --output_dir data-256 \
@@ -49,7 +49,7 @@ python src/audio_to_images.py \
 
 ```bash
 accelerate launch --config_file accelerate_local.yaml \
-  src/train_unconditional.py \
+  audiodiffusion/train_unconditional.py \
   --dataset_name data-64 \
   --resolution 64 \
   --hop_length 1024 \
@@ -66,7 +66,7 @@ accelerate launch --config_file accelerate_local.yaml \
 
 ```bash
 accelerate launch --config_file accelerate_local.yaml \
-  src/train_unconditional.py \
+  audiodiffusion/train_unconditional.py \
   --dataset_name teticio/audio-diffusion-256 \
   --resolution 256 \
   --output_dir ddpm-ema-audio-256 \
@@ -86,7 +86,7 @@ accelerate launch --config_file accelerate_local.yaml \
 
 ```bash
 accelerate launch --config_file accelerate_sagemaker.yaml \
-  src/train_unconditional.py \
+  audiodiffusion/train_unconditional.py \
   --dataset_name teticio/audio-diffusion-256 \
   --resolution 256 \
   --output_dir ddpm-ema-audio-256 \
app.py CHANGED
@@ -1,23 +1,10 @@
 import argparse
 
 import gradio as gr
-from PIL import Image
-from diffusers import DDPMPipeline
 
-from src.mel import Mel
-
-mel = Mel(x_res=256, y_res=256)
-model_id = "teticio/audio-diffusion-256"
-ddpm = DDPMPipeline.from_pretrained(model_id)
-
-
-def generate_spectrogram_and_audio():
-    images = ddpm(output_type="numpy")["sample"]
-    images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
-    image = Image.fromarray(images[0][0])
-    audio = mel.image_to_audio(image)
-    return image, (mel.get_sample_rate(), audio)
+from audiodiffusion import AudioDiffusion
 
+audio_diffusion = AudioDiffusion()
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -26,9 +13,9 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     demo = gr.Interface(
-        fn=generate_spectrogram_and_audio,
+        fn=audio_diffusion.generate_spectrogram_and_audio,
         title="Audio Diffusion",
-        description=f"Generate audio using Huggingface diffusers.\
+        description="Generate audio using Huggingface diffusers.\
         This takes about 20 minutes without a GPU, so why not make yourself a cup of tea in the meantime?",
         inputs=[],
         outputs=[
src/audio_to_images.py → audio_to_images.py RENAMED
@@ -1,8 +1,3 @@
-# TODO
-# run on sagemaker
-# run with deepspeed
-
-
 import os
 import re
 import io
@@ -12,17 +7,17 @@ import pandas as pd
 from tqdm.auto import tqdm
 from datasets import Dataset, DatasetDict, Features, Image, Value
 
-from mel import Mel
+from audiodiffusion.mel import Mel
 
 
 def main(args):
-    mel = Mel(x_res=args.resolution, y_res=args.resolution, hop_length=args.hop_length)
+    mel = Mel(x_res=args.resolution,
+              y_res=args.resolution,
+              hop_length=args.hop_length)
     os.makedirs(args.output_dir, exist_ok=True)
     audio_files = [
-        os.path.join(root, file)
-        for root, _, files in os.walk(args.input_dir)
-        for file in files
-        if re.search("\.(mp3|wav|m4a)$", file, re.IGNORECASE)
+        os.path.join(root, file) for root, _, files in os.walk(args.input_dir)
+        for file in files if re.search("\.(mp3|wav|m4a)$", file, re.IGNORECASE)
     ]
     examples = []
     try:
@@ -35,31 +30,26 @@ def main(args):
                 continue
             for slice in range(mel.get_number_of_slices()):
                 image = mel.audio_slice_to_image(slice)
-                assert (
-                    image.width == args.resolution and image.height == args.resolution
-                )
+                assert (image.width == args.resolution
+                        and image.height == args.resolution)
                 with io.BytesIO() as output:
                     image.save(output, format="PNG")
                     bytes = output.getvalue()
-                examples.extend(
-                    [
-                        {
-                            "image": {"bytes": bytes},
-                            "audio_file": audio_file,
-                            "slice": slice,
-                        }
-                    ]
-                )
+                examples.extend([{
+                    "image": {
+                        "bytes": bytes
+                    },
+                    "audio_file": audio_file,
+                    "slice": slice,
+                }])
     finally:
         ds = Dataset.from_pandas(
             pd.DataFrame(examples),
-            features=Features(
-                {
-                    "image": Image(),
-                    "audio_file": Value(dtype="string"),
-                    "slice": Value(dtype="int16"),
-                }
-            ),
+            features=Features({
+                "image": Image(),
+                "audio_file": Value(dtype="string"),
+                "slice": Value(dtype="int16"),
+            }),
         )
         dsd = DatasetDict({"train": ds})
         dsd.save_to_disk(os.path.join(args.output_dir))
@@ -69,8 +59,8 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Create dataset of Mel spectrograms from directory of audio files."
-    )
+        description=
+        "Create dataset of Mel spectrograms from directory of audio files.")
    parser.add_argument("--input_dir", type=str)
    parser.add_argument("--output_dir", type=str, default="data")
    parser.add_argument("--resolution", type=int, default=256)
audiodiffusion/__init__.py ADDED
@@ -0,0 +1,40 @@
+from PIL import Image
+from torch import cuda
+from diffusers import DDPMPipeline
+
+from .mel import Mel
+
+VERSION = "1.0.1"
+
+
+class AudioDiffusion:
+
+    def __init__(self,
+                 model_id="teticio/audio-diffusion-256",
+                 resolution=256,
+                 cuda=cuda.is_available()):
+        """Class for generating audio using Denoising Diffusion Probabilistic Models.
+
+        Args:
+            model_id (String): name of model (local directory or Hugging Face Hub)
+            resolution (int): size of square mel spectrogram in pixels
+            cuda (bool): use CUDA?
+        """
+        self.mel = Mel(x_res=resolution, y_res=resolution)
+        self.model_id = model_id
+        self.ddpm = DDPMPipeline.from_pretrained(self.model_id)
+        if cuda:
+            self.ddpm.to("cuda")
+
+    def generate_spectrogram_and_audio(self):
+        """Generate random mel spectrogram and convert to audio.
+
+        Returns:
+            PIL Image: mel spectrogram
+            (float, array): sample rate and raw audio
+        """
+        images = self.ddpm(output_type="numpy")["sample"]
+        images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
+        image = Image.fromarray(images[0][0])
+        audio = self.mel.image_to_audio(image)
+        return image, (self.mel.get_sample_rate(), audio)
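
For orientation, the new class can also be driven directly from Python along these lines (a minimal sketch based only on the code above; the output filename is a placeholder, and generation is slow without a GPU):

```python
from audiodiffusion import AudioDiffusion

# Downloads teticio/audio-diffusion-256 from the Hugging Face Hub by default
audio_diffusion = AudioDiffusion()

# Returns a PIL mel spectrogram plus the (sample_rate, waveform) pair
image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio()
image.save("spectrogram.png")  # placeholder output path
```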
{src → audiodiffusion}/mel.py RENAMED
@@ -1,4 +1,5 @@
 import warnings
+
 warnings.filterwarnings('ignore')
 
 import librosa
@@ -7,6 +8,7 @@ from PIL import Image
 
 
 class Mel:
+
     def __init__(
         self,
         x_res=256,
@@ -16,6 +18,16 @@ class Mel:
         hop_length=512,
         top_db=80,
     ):
+        """Class to convert audio to mel spectrograms and vice versa.
+
+        Args:
+            x_res (int): x resolution of spectrogram (time)
+            y_res (int): y resolution of spectrogram (frequency bins)
+            sample_rate (int): sample rate of audio
+            n_fft (int): number of Fast Fourier Transforms
+            hop_length (int): hop length (a higher number is recommended for lower than 256 y_res)
+            top_db (int): loudest in decibels
+        """
         self.x_res = x_res
         self.y_res = y_res
         self.sr = sample_rate
@@ -28,17 +40,40 @@ class Mel:
         self.y = None
 
     def load_audio(self, audio_file):
+        """Load audio.
+
+        Args:
+            file (str): must be a file on disk due to Librosa limitation
+        """
         self.y, _ = librosa.load(audio_file, mono=True)
 
     def get_number_of_slices(self):
+        """Get number of slices in audio.
+
+        Returns:
+            int: number of spectograms audio can be sliced into
+        """
         return len(self.y) // self.slice_size
 
     def get_sample_rate(self):
+        """Get sample rate:
+
+        Returns:
+            int: sample rate of audio
+        """
         return self.sr
 
     def audio_slice_to_image(self, slice):
+        """Convert slice of audio to spectrogram.
+
+        Args:
+            slice (int): slice number of audio to convert (out of get_number_of_slices())
+
+        Returns:
+            PIL Image: grayscale image of x_res x y_res
+        """
         S = librosa.feature.melspectrogram(
-            y=self.y[self.slice_size * slice : self.slice_size * (slice + 1)],
+            y=self.y[self.slice_size * slice:self.slice_size * (slice + 1)],
             sr=self.sr,
             n_fft=self.n_fft,
             hop_length=self.hop_length,
@@ -46,19 +81,24 @@
             fmax=self.fmax,
         )
         log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
-        bytedata = (
-            ((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5
-        ).astype(np.uint8)
+        bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) +
+                    0.5).astype(np.uint8)
         image = Image.frombytes("L", log_S.shape, bytedata.tobytes())
         return image
 
     def image_to_audio(self, image):
+        """Converts spectrogram to audio.
+
+        Args:
+            image (PIL Image): x_res x y_res grayscale image
+
+        Returns:
+            audio (array): raw audio
+        """
         bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape(
-            (image.width, image.height)
-        )
+            (image.width, image.height))
         log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
         S = librosa.db_to_power(log_S)
         audio = librosa.feature.inverse.mel_to_audio(
-            S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length
-        )
+            S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length)
         return audio
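
As a reminder of how the renamed module is used, a round trip through `Mel` looks roughly like this (a sketch built from the methods shown above; the audio path is a placeholder):

```python
from audiodiffusion.mel import Mel

mel = Mel(x_res=256, y_res=256, hop_length=512)
mel.load_audio("example.mp3")        # placeholder path; must be a file on disk
n_slices = mel.get_number_of_slices()
image = mel.audio_slice_to_image(0)  # first 256x256 grayscale spectrogram slice
audio = mel.image_to_audio(image)    # approximate reconstruction of that slice
sample_rate = mel.get_sample_rate()
```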
notebooks/test-mel.ipynb CHANGED
@@ -30,8 +30,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from src.mel import Mel\n",
-    "from IPython.display import Audio"
+    "from IPython.display import Audio\n",
+    "from audiodiffusion.mel import Mel"
    ]
   },
   {
@@ -178,7 +178,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.10.4"
   },
   "toc": {
    "base_numbering": 1,
notebooks/test-model.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
requirements-lock.txt CHANGED
@@ -2,6 +2,9 @@ absl-py==1.2.0
 accelerate==0.12.0
 aiohttp==3.8.1
 aiosignal==1.2.0
+altair==4.2.0
+analytics-python==1.4.0
+anyio==3.6.1
 appdirs==1.4.4
 argon2-cffi==21.3.0
 argon2-cffi-bindings==21.2.0
@@ -10,12 +13,19 @@ async-timeout==4.0.2
 attrs==22.1.0
 audioread==3.0.0
 backcall==0.2.0
+backoff==1.10.0
+bcrypt==4.0.0
 beautifulsoup4==4.11.1
 bleach==5.0.1
+blinker==1.5
 cachetools==5.2.0
 certifi==2022.6.15
 cffi==1.15.1
 charset-normalizer==2.1.1
+click==8.1.3
+commonmark==0.9.1
+cryptography==37.0.4
+cycler==0.11.0
 datasets==2.4.0
 debugpy==1.6.3
 decorator==5.1.1
@@ -24,14 +34,23 @@ diffusers==0.2.4
 dill==0.3.5.1
 entrypoints==0.4
 executing==0.10.0
+fastapi==0.81.0
 fastjsonschema==2.16.1
+ffmpy==0.3.0
 filelock==3.8.0
+fonttools==4.37.1
 frozenlist==1.3.1
 fsspec==2022.7.1
 ftfy==6.1.1
+gitdb==4.0.9
+GitPython==3.1.27
 google-auth==2.11.0
 google-auth-oauthlib==0.4.6
+gradio==3.1.7
 grpcio==1.47.0
+h11==0.12.0
+httpcore==0.15.0
+httpx==0.23.0
 huggingface-hub==0.9.0
 idna==3.3
 importlib-metadata==4.12.0
@@ -48,13 +67,20 @@ jupyter-console==6.4.4
 jupyter-core==4.11.1
 jupyterlab-pygments==0.2.2
 jupyterlab-widgets==3.0.2
+kiwisolver==1.4.4
 librosa==0.9.2
+linkify-it-py==1.0.3
 llvmlite==0.39.0
 lxml==4.9.1
 Markdown==3.4.1
+markdown-it-py==2.1.0
 MarkupSafe==2.1.1
+matplotlib==3.5.3
 matplotlib-inline==0.1.6
+mdit-py-plugins==0.3.0
+mdurl==0.1.2
 mistune==2.0.4
+monotonic==1.6
 multidict==6.0.2
 multiprocess==0.70.13
 nbclient==0.6.7
@@ -65,9 +91,11 @@ notebook==6.4.12
 numba==0.56.0
 numpy==1.22.4
 oauthlib==3.2.0
+orjson==3.8.0
 packaging==21.3
 pandas==1.4.3
 pandocfilters==1.5.0
+paramiko==2.11.0
 parso==0.8.3
 pexpect==4.8.0
 pickleshare==0.7.5
@@ -83,11 +111,19 @@ pyarrow==9.0.0
 pyasn1==0.4.8
 pyasn1-modules==0.2.8
 pycparser==2.21
+pycryptodome==3.15.0
+pydantic==1.9.2
+pydeck==0.8.0b1
+pydub==0.25.1
 Pygments==2.13.0
+Pympler==1.0.1
+PyNaCl==1.5.0
 pyparsing==3.0.9
 pyrsistent==0.18.1
 python-dateutil==2.8.2
+python-multipart==0.0.5
 pytz==2022.2.1
+pytz-deprecation-shim==0.1.0.post0
 PyYAML==6.0
 pyzmq==23.2.1
 qtconsole==5.3.1
@@ -97,14 +133,21 @@ requests==2.28.1
 requests-oauthlib==1.3.1
 resampy==0.4.0
 responses==0.18.0
+rfc3986==1.5.0
+rich==12.5.1
 rsa==4.9
 scikit-learn==1.1.2
 scipy==1.9.0
+semver==2.13.0
 Send2Trash==1.8.0
 six==1.16.0
+smmap==5.0.0
+sniffio==1.2.0
 SoundFile==0.10.3.post1
 soupsieve==2.3.2.post1
 stack-data==0.4.0
+starlette==0.19.1
+streamlit==1.12.2
 tensorboard==2.10.0
 tensorboard-data-server==0.6.1
 tensorboard-plugin-wit==1.8.1
@@ -113,6 +156,7 @@ threadpoolctl==3.1.0
 tinycss2==1.1.1
 tokenizers==0.12.1
 toml==0.10.2
+toolz==0.12.0
 torch==1.12.1
 torchvision==0.13.1
 tornado==6.2
@@ -120,9 +164,16 @@ tqdm==4.64.0
 traitlets==5.3.0
 transformers==4.21.1
 typing_extensions==4.3.0
+tzdata==2022.2
+tzlocal==4.2
+uc-micro-py==1.0.1
 urllib3==1.26.12
+uvicorn==0.18.3
+validators==0.20.0
+watchdog==2.1.9
 wcwidth==0.2.5
 webencodings==0.5.1
+websockets==10.3
 Werkzeug==2.2.2
 widgetsnbextension==3.6.1
 xxhash==3.0.0
requirements.txt CHANGED
@@ -1,8 +1,8 @@
-# for Hugging Face Spaces
 torch
 numpy
 Pillow
-diffusers
+diffusers>=0.2.4
 librosa
 datasets
 gradio
+streamlit
setup.cfg ADDED
@@ -0,0 +1,19 @@
+[metadata]
+name = audiodiffusion
+version = attr: audiodiffusion.VERSION
+description = Generate Mel spectrogram dataset from directory of audio files.
+long_description = file: README.md
+license = GPL3
+classifiers =
+    Programming Language :: Python :: 3
+
+[options]
+zip_safe = False
+packages = audiodiffusion
+install_requires =
+    torch
+    numpy
+    Pillow
+    diffusers>=0.2.4
+    librosa
+    datasets
setup.py ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+
+from setuptools import setup
+
+if __name__ == "__main__":
+    setup()
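
With `setup.py` and `setup.cfg` in place, the package can presumably be installed for local development with a standard `pip install -e .`, which pulls in the dependencies listed under `install_requires`; the commit itself does not document an install command.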
streamlit_app.py ADDED
@@ -0,0 +1,22 @@
+from io import BytesIO
+import streamlit as st
+import soundfile as sf
+from librosa.util import normalize
+
+from audiodiffusion import AudioDiffusion
+
+audio_diffusion = AudioDiffusion()
+
+if __name__ == "__main__":
+    st.header("Audio Diffusion")
+    st.markdown("Generate audio using Huggingface diffusers.\
+        This takes about 20 minutes without a GPU, so why not make yourself a cup of tea in the meantime?"
+                )
+    if st.button("Generate"):
+        st.markdown("Generating...")
+        image, (sample_rate,
+                audio) = audio_diffusion.generate_spectrogram_and_audio()
+        st.image(image, caption="Mel spectrogram")
+        buffer = BytesIO()
+        sf.write(buffer, normalize(audio), sample_rate, format="WAV")
+        st.audio(buffer, format="audio/wav")
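
The new app can presumably be launched with the usual `streamlit run streamlit_app.py`; like the Gradio demo, it calls `AudioDiffusion.generate_spectrogram_and_audio` and then renders the spectrogram alongside a playable, normalized WAV.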
src/train_unconditional.py → train_unconditional.py RENAMED
@@ -24,8 +24,9 @@ from torchvision.transforms import (
     ToTensor,
 )
 from tqdm.auto import tqdm
+from librosa.util import normalize
 
-from mel import Mel
+from audiodiffusion.mel import Mel
 
 logger = get_logger(__name__)
 
@@ -65,7 +66,8 @@ def main(args):
             "UpBlock2D",
         ),
     )
-    noise_scheduler = DDPMScheduler(num_train_timesteps=1000, tensor_format="pt")
+    noise_scheduler = DDPMScheduler(num_train_timesteps=1000,
+                                    tensor_format="pt")
     optimizer = torch.optim.AdamW(
         model.parameters(),
         lr=args.learning_rate,
@@ -74,20 +76,17 @@ def main(args):
         eps=args.adam_epsilon,
     )
 
-    augmentations = Compose(
-        [
-            Resize(args.resolution, interpolation=InterpolationMode.BILINEAR),
-            CenterCrop(args.resolution),
-            ToTensor(),
-            Normalize([0.5], [0.5]),
-        ]
-    )
+    augmentations = Compose([
+        Resize(args.resolution, interpolation=InterpolationMode.BILINEAR),
+        CenterCrop(args.resolution),
+        ToTensor(),
+        Normalize([0.5], [0.5]),
+    ])
 
     if args.dataset_name is not None:
         if os.path.exists(args.dataset_name):
-            dataset = load_from_disk(args.dataset_name, args.dataset_config_name)[
-                "train"
-            ]
+            dataset = load_from_disk(args.dataset_name,
+                                     args.dataset_config_name)["train"]
         else:
             dataset = load_dataset(
                 args.dataset_name,
@@ -110,20 +109,18 @@ def main(args):
 
     dataset.set_transform(transforms)
     train_dataloader = torch.utils.data.DataLoader(
-        dataset, batch_size=args.train_batch_size, shuffle=True
-    )
+        dataset, batch_size=args.train_batch_size, shuffle=True)
 
     lr_scheduler = get_scheduler(
         args.lr_scheduler,
         optimizer=optimizer,
         num_warmup_steps=args.lr_warmup_steps,
-        num_training_steps=(len(train_dataloader) * args.num_epochs)
-        // args.gradient_accumulation_steps,
+        num_training_steps=(len(train_dataloader) * args.num_epochs) //
+        args.gradient_accumulation_steps,
     )
 
     model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-        model, optimizer, train_dataloader, lr_scheduler
-    )
+        model, optimizer, train_dataloader, lr_scheduler)
 
     ema_model = EMAModel(
         getattr(model, "module", model),
@@ -139,13 +136,14 @@ def main(args):
         run = os.path.split(__file__)[-1].split(".")[0]
         accelerator.init_trackers(run)
 
-    mel = Mel(x_res=args.resolution, y_res=args.resolution, hop_length=args.hop_length)
+    mel = Mel(x_res=args.resolution,
+              y_res=args.resolution,
+              hop_length=args.hop_length)
 
     global_step = 0
    for epoch in range(args.num_epochs):
-        progress_bar = tqdm(
-            total=len(train_dataloader), disable=not accelerator.is_local_main_process
-        )
+        progress_bar = tqdm(total=len(train_dataloader),
+                            disable=not accelerator.is_local_main_process)
         progress_bar.set_description(f"Epoch {epoch}")
 
         if epoch < args.start_epoch:
@@ -168,13 +166,14 @@ def main(args):
             timesteps = torch.randint(
                 0,
                 noise_scheduler.num_train_timesteps,
-                (bsz,),
+                (bsz, ),
                 device=clean_images.device,
             ).long()
 
             # Add noise to the clean images according to the noise magnitude at each timestep
             # (this is the forward diffusion process)
-            noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)
+            noisy_images = noise_scheduler.add_noise(clean_images, noise,
+                                                     timesteps)
 
             with accelerator.accumulate(model):
                 # Predict the noise residual
@@ -209,11 +208,10 @@ def main(args):
         if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
             pipeline = DDPMPipeline(
                 unet=accelerator.unwrap_model(
-                    ema_model.averaged_model if args.use_ema else model
-                ),
+                    ema_model.averaged_model if args.use_ema else model),
                 scheduler=noise_scheduler,
             )
-
+
             # save the model
             if args.push_to_hub:
                 try:
@@ -238,17 +236,16 @@ def main(args):
                 )["sample"]
 
                 # denormalize the images and save to tensorboard
-                images_processed = (
-                    (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
-                )
+                images_processed = ((images *
+                                     255).round().astype("uint8").transpose(
+                                         0, 3, 1, 2))
                 accelerator.trackers[0].writer.add_images(
-                    "test_samples", images_processed, epoch
-                )
+                    "test_samples", images_processed, epoch)
                 for _, image in enumerate(images_processed):
                     audio = mel.image_to_audio(Image.fromarray(image[0]))
                     accelerator.trackers[0].writer.add_audio(
                         f"test_audio_{_}",
-                        audio,
+                        normalize(audio),
                         epoch,
                         sample_rate=mel.get_sample_rate(),
                     )
@@ -258,7 +255,8 @@
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser = argparse.ArgumentParser(
+        description="Simple example of a training script.")
     parser.add_argument("--local_rank", type=int, default=-1)
     parser.add_argument("--dataset_name", type=str, default=None)
     parser.add_argument("--dataset_config_name", type=str, default=None)
@@ -303,8 +301,7 @@ if __name__ == "__main__":
         help=(
             "Whether to use mixed precision. Choose"
             "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
-            "and an Nvidia Ampere GPU."
-        ),
+            "and an Nvidia Ampere GPU."),
     )
     parser.add_argument("--hop_length", type=int, default=512)
     parser.add_argument("--from_pretrained", type=str, default=None)