tidy
Files changed:
- .gitignore +2 -4
- README.md +6 -6
- notebooks/test_vae.ipynb +26 -6
- scripts/train_unconditional.py +7 -5
- scripts/train_vae.py +2 -2
.gitignore
CHANGED
@@ -1,13 +1,11 @@
 .vscode
 __pycache__
 .ipynb_checkpoints
-data
-
+data
+models
 flagged
 build
 audiodiffusion.egg-info
 lightning_logs
 taming
 checkpoints
-vae_model
-latent-audio-diffusion-*
README.md
CHANGED
@@ -56,7 +56,7 @@ python scripts/audio_to_images.py \
     --resolution 64 \
     --hop_length 1024 \
     --input_dir path-to-audio-files \
-    --output_dir data
+    --output_dir path-to-output-data
 ```
 #### Generate dataset of 256x256 Mel spectrograms and push to hub (you will need to be authenticated with `huggingface-cli login`).
 
@@ -64,7 +64,7 @@ python scripts/audio_to_images.py \
 python scripts/audio_to_images.py \
     --resolution 256 \
     --input_dir path-to-audio-files \
-    --output_dir data-256 \
+    --output_dir data/audio-diffusion-256 \
     --push_to_hub teticio/audio-diffusion-256
 ```
 ## Train model
@@ -72,10 +72,10 @@ python scripts/audio_to_images.py \
 ```bash
 accelerate launch --config_file config/accelerate_local.yaml \
   scripts/train_unconditional.py \
-  --dataset_name data-64 \
+  --dataset_name data/audio-diffusion-64 \
   --resolution 64 \
   --hop_length 1024 \
-  --output_dir ddpm-ema-audio-64 \
+  --output_dir models/ddpm-ema-audio-64 \
   --train_batch_size 16 \
   --num_epochs 100 \
   --gradient_accumulation_steps 1 \
@@ -89,7 +89,7 @@ accelerate launch --config_file config/accelerate_local.yaml \
   scripts/train_unconditional.py \
   --dataset_name teticio/audio-diffusion-256 \
   --resolution 256 \
-  --output_dir audio-diffusion-256 \
+  --output_dir models/audio-diffusion-256 \
   --num_epochs 100 \
   --train_batch_size 2 \
   --eval_batch_size 2 \
@@ -107,7 +107,7 @@ accelerate launch --config_file config/accelerate_sagemaker.yaml \
   scripts/train_unconditional.py \
   --dataset_name teticio/audio-diffusion-256 \
   --resolution 256 \
-  --output_dir ddpm-ema-audio-256 \
+  --output_dir models/ddpm-ema-audio-256 \
   --train_batch_size 16 \
   --num_epochs 100 \
   --gradient_accumulation_steps 1 \
notebooks/test_vae.ipynb
CHANGED
@@ -1,5 +1,17 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c8663ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "sys.path.insert(0, os.path.dirname(os.path.abspath(\"\")))"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -12,7 +24,9 @@
    "import numpy as np\n",
    "from PIL import Image\n",
    "from datasets import load_dataset\n",
-   "from
+   "from IPython.display import Audio\n",
+   "from diffusers import AutoencoderKL\n",
+   "from audiodiffusion.mel import Mel"
   ]
  },
  {
@@ -22,7 +36,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-   "
+   "mel = Mel()\n",
+   "vae = AutoencoderKL.from_pretrained('../models/autoencoder-kl')"
   ]
  },
  {
@@ -42,7 +57,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-   "ds = load_dataset('teticio/audio-diffusion-256')"
+   "ds = load_dataset('teticio/audio-diffusion-breaks-256')"
   ]
  },
  {
@@ -53,7 +68,8 @@
   "outputs": [],
   "source": [
    "image = random.choice(ds['train'])['image']\n",
-   "image"
+   "display(image)\n",
+   "Audio(data=mel.image_to_audio(image), rate=mel.get_sample_rate())"
   ]
  },
  {
@@ -84,7 +100,9 @@
    "output_image = (output_image + 1.0) / 2.0  # -1,1 -> 0,1; c,h,w\n",
    "output_image = (output_image.detach().cpu().numpy() *\n",
    "                255).round().astype(\"uint8\").transpose(0, 2, 3, 1)[0]\n",
-   "Image.fromarray(output_image)"
+   "output_image = Image.fromarray(output_image).convert('L')\n",
+   "display(output_image)\n",
+   "Audio(data=mel.image_to_audio(output_image), rate=mel.get_sample_rate())"
   ]
  },
  {
@@ -100,7 +118,9 @@
    "output_image = (output_image + 1.0) / 2.0  # -1,1 -> 0,1; c,h,w\n",
    "output_image = (output_image.detach().cpu().numpy() *\n",
    "                255).round().astype(\"uint8\").transpose(0, 2, 3, 1)[0]\n",
-   "Image.fromarray(output_image)"
+   "output_image = Image.fromarray(output_image).convert('L')\n",
+   "display(output_image)\n",
+   "Audio(data=mel.image_to_audio(output_image), rate=mel.get_sample_rate())"
  ]
  },
  {
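The net effect of these notebook changes is that every spectrogram, whether taken from the dataset or reconstructed by the VAE, is now also rendered as playable audio rather than only shown as an image. A minimal sketch of the pattern the new cells use, assuming (as the notebook does) that `Mel` from this repo can invert a grayscale spectrogram image back to a waveform:

```python
import random

from datasets import load_dataset
from IPython.display import Audio, display

from audiodiffusion.mel import Mel

mel = Mel()
ds = load_dataset('teticio/audio-diffusion-breaks-256')

# Each dataset row stores a mel spectrogram as a grayscale PIL image.
image = random.choice(ds['train'])['image']
display(image)

# Mel.image_to_audio inverts the spectrogram to a waveform that IPython
# can play back at the Mel object's configured sample rate.
Audio(data=mel.image_to_audio(image), rate=mel.get_sample_rate())
```

The same two lines (`display(...)` then `Audio(...)`) are appended after each VAE reconstruction, once the output tensor has been converted back to an 8-bit `'L'`-mode image.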
scripts/train_unconditional.py
CHANGED
@@ -73,11 +73,12 @@ def main(args):
     )
 
     if args.scheduler == "ddpm":
-        noise_scheduler = DDPMScheduler(
-
+        noise_scheduler = DDPMScheduler(
+            num_train_timesteps=args.num_train_steps, tensor_format="pt")
     else:
-        noise_scheduler = DDIMScheduler(
-
+        noise_scheduler = DDIMScheduler(
+            num_train_timesteps=args.num_train_steps, tensor_format="pt")
+
     optimizer = torch.optim.AdamW(
         model.parameters(),
         lr=args.learning_rate,
@@ -305,7 +306,6 @@ if __name__ == "__main__":
     parser.add_argument("--overwrite_output_dir", type=bool, default=False)
     parser.add_argument("--cache_dir", type=str, default=None)
     parser.add_argument("--resolution", type=int, default=256)
-    parser.add_argument("--latent_resolution", type=int, default=64)
     parser.add_argument("--train_batch_size", type=int, default=16)
     parser.add_argument("--eval_batch_size", type=int, default=16)
     parser.add_argument("--num_epochs", type=int, default=100)
@@ -342,6 +342,8 @@ if __name__ == "__main__":
     parser.add_argument("--hop_length", type=int, default=512)
     parser.add_argument("--from_pretrained", type=str, default=None)
     parser.add_argument("--start_epoch", type=int, default=0)
+    parser.add_argument("--num_train_steps", type=int, default=1000)
+    parser.add_argument("--latent_resolution", type=int, default=64)
     parser.add_argument("--scheduler",
                         type=str,
                         default="ddpm",
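The removed constructor arguments are truncated in this view, but the added lines show the new shape of the logic: both schedulers now take their timestep count from the new `--num_train_steps` flag (default 1000) instead of whatever was previously hardcoded. A standalone sketch of that selection; note that `tensor_format="pt"` belongs to the diffusers version used at the time and was dropped in later releases:

```python
from diffusers import DDIMScheduler, DDPMScheduler


def build_noise_scheduler(scheduler: str, num_train_steps: int):
    """Mirror of the script's selection: DDPM by default, DDIM otherwise,
    both sharing the same number of training timesteps."""
    if scheduler == "ddpm":
        return DDPMScheduler(num_train_timesteps=num_train_steps,
                             tensor_format="pt")
    return DDIMScheduler(num_train_timesteps=num_train_steps,
                         tensor_format="pt")


# Equivalent to running the script with --scheduler ddpm --num_train_steps 1000.
noise_scheduler = build_noise_scheduler("ddpm", 1000)
```

The `--latent_resolution` argument is not deleted; it simply moves down to sit next to the other new flag in the argument list.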
scripts/train_vae.py
CHANGED
@@ -107,7 +107,7 @@ class ImageLogger(Callback):
 
 class HFModelCheckpoint(ModelCheckpoint):
 
-    def __init__(self, ldm_config, hf_checkpoint='
+    def __init__(self, ldm_config, hf_checkpoint='models/autoencoder-kl', *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.ldm_config = ldm_config
         self.hf_checkpoint = hf_checkpoint
@@ -130,7 +130,7 @@ if __name__ == "__main__":
                         default="config/ldm_autoencoder_kl.yaml")
     parser.add_argument("--ldm_checkpoint_dir",
                         type=str,
-                        default="
+                        default="models/ldm-autoencoder-kl")
     parser.add_argument("--hf_checkpoint_dir", type=str, default="vae_model")
     parser.add_argument("-r",
                         "--resume_from_checkpoint",
|