update README
- README.md +64 -4
- mel.png +0 -0
- notebooks/test-mel.ipynb +2 -2
- notebooks/test-model.ipynb +5 -5
- src/train_unconditional.py +6 -3
README.md
CHANGED
@@ -1,16 +1,76 @@
 # audio-diffusion
+
+### Apply [Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) using the new Hugging Face [diffusers](https://github.com/huggingface/diffusers) package to synthesize music instead of images.
+
+---
+
+![mel spectrogram](mel.png)
+
+Audio can be represented as images by transforming to a [mel spectrogram](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum), such as the one shown above. The class `Mel` in `mel.py` can convert a slice of audio into a mel spectrogram of `x_res` x `y_res` and vice versa. The higher the resolution, the less audio information will be lost. You can see how this works in the `test-mel.ipynb` notebook.
+
+A DDPM model is trained on a set of mel spectrograms that have been generated from a directory of audio files. It is then used to synthesize similar mel spectrograms, which are then converted back into audio. See the `test-model.ipynb` notebook for an example.
+
+## Generate Mel spectrogram dataset from directory of audio files
+### Training can be run with Mel spectrograms of resolution 64x64 on a single commercial grade GPU (e.g. RTX 2080 Ti). The `hop_length` should be set to 1024 for better results.
+
 ```bash
-
+python src/audio_to_images.py \
+  --resolution 64 \
+  --hop_length 1024 \
+  --input_dir path-to-audio-files \
+  --output_dir data-test
 ```
+
+### Generate dataset of 256x256 Mel spectrograms and push to hub (you will need to be authenticated with `huggingface-cli login`).
+
 ```bash
 python src/audio_to_images.py \
   --resolution 256 \
   --input_dir path-to-audio-files \
-  --output_dir data-256
+  --output_dir data-256 \
+  --push_to_hub teticio/audio-diffusion-256
+```
+## Train model
+### Run training on local machine.
+
+```bash
+accelerate launch --config_file accelerate_local.yaml \
+  src/train_unconditional.py \
+  --dataset_name data-64 \
+  --resolution 64 \
+  --hop_length 1024 \
+  --output_dir ddpm-ema-audio-64 \
+  --train_batch_size 16 \
+  --num_epochs 100 \
+  --gradient_accumulation_steps 1 \
+  --learning_rate 1e-4 \
+  --lr_warmup_steps 500 \
+  --mixed_precision no
+```
+
+### Run training on local machine with `batch_size` of 1 and `gradient_accumulation_steps` 16 to compensate, so that the 256x256 resolution model fits on a commercial grade GPU.
+
+```bash
+accelerate launch --config_file accelerate_local.yaml \
+  src/train_unconditional.py \
+  --dataset_name teticio/audio-diffusion-256 \
+  --resolution 256 \
+  --output_dir ddpm-ema-audio-256 \
+  --num_epochs 100 \
+  --train_batch_size 1 \
+  --eval_batch_size 1 \
+  --gradient_accumulation_steps 16 \
+  --learning_rate 1e-4 \
+  --lr_warmup_steps 500 \
+  --mixed_precision no
 ```
+
+### Run training on SageMaker.
+
 ```bash
-accelerate launch
-
+accelerate launch --config_file accelerate_sagemaker.yaml \
+  src/train_unconditional.py \
+  --dataset_name teticio/audio-diffusion-256 \
   --resolution 256 \
   --output_dir ddpm-ema-audio-256 \
   --train_batch_size 16 \
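For orientation, the audio-to-spectrogram round trip the new README describes can be sketched with `librosa` alone. This is a hypothetical illustration: `mel.py` itself is not part of this diff, so the scaling, the sample rate (22050 Hz is librosa's default and an assumption here) and the file name below are illustrative, not the actual `Mel` implementation.

```python
# Hypothetical sketch of the audio <-> mel-spectrogram round trip the README
# attributes to the Mel class; parameters mirror the README flags
# (--resolution 64, --hop_length 1024), everything else is assumed.
import librosa
import numpy as np
from PIL import Image

x_res = y_res = 64   # --resolution 64
hop_length = 1024    # --hop_length 1024
sample_rate = 22050  # librosa default, an assumption
n_fft = 2048

# One slice of x_res hops covers 64 * 1024 samples / 22050 Hz, roughly 3 s.
audio, _ = librosa.load("example.wav", sr=sample_rate)
audio = audio[: x_res * hop_length]

# Audio -> mel spectrogram, log-scaled and quantized to an 8-bit grayscale image.
S = librosa.feature.melspectrogram(
    y=audio, sr=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mels=y_res
)
log_S = librosa.power_to_db(S, ref=np.max)  # values in [-80, 0] dB by default
image = Image.fromarray(((log_S + 80.0) / 80.0 * 255).clip(0, 255).astype(np.uint8))

# Image -> audio: undo the scaling, then invert the mel filter bank (Griffin-Lim).
log_S_rec = np.array(image).astype(np.float32) / 255.0 * 80.0 - 80.0
audio_rec = librosa.feature.inverse.mel_to_audio(
    librosa.db_to_power(log_S_rec), sr=sample_rate, n_fft=n_fft, hop_length=hop_length
)
```

The higher the resolution, the less the quantization and mel projection throw away, which is the trade-off the README alludes to.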
mel.png
ADDED
notebooks/test-mel.ipynb
CHANGED
@@ -49,7 +49,7 @@
    "id": "b2178c3f",
    "metadata": {},
    "source": [
-    "### Transform slice of audio to
+    "### Transform slice of audio to mel spectrogram"
    ]
   },
   {
@@ -120,7 +120,7 @@
    "id": "fe112fef",
    "metadata": {},
    "source": [
-    "### Transform
+    "### Transform mel spectrogram back to audio"
    ]
   },
   {
notebooks/test-model.ipynb
CHANGED
@@ -42,7 +42,7 @@
    "id": "011fb5a1",
    "metadata": {},
    "source": [
-    "### Run model inference to generate
+    "### Run model inference to generate mel spectrogram"
    ]
   },
   {
@@ -76,7 +76,7 @@
   {
    "cell_type": "code",
    "execution_count": 6,
-   "id": "
+   "id": "75db4b7c",
    "metadata": {},
    "outputs": [
    {
@@ -101,7 +101,7 @@
    "id": "7230c280",
    "metadata": {},
    "source": [
-    "### Transform
+    "### Transform mel spectrogram to audio"
    ]
   },
   {
@@ -155,7 +155,7 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "id": "
+   "id": "b9023846",
    "metadata": {},
    "outputs": [
    {
@@ -208,7 +208,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "
+   "id": "acf96aba",
    "metadata": {},
    "outputs": [],
    "source": []
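The renamed headings trace the inference path: sample a mel spectrogram from the trained DDPM, then invert it to audio. The cell bodies are not part of this diff, so the following is only a rough sketch of what they presumably do; the model path and mel parameters are carried over from the sketch above, and the `.images` output attribute matches recent `diffusers` releases (older ones returned a dict under the `"sample"` key).

```python
# Hypothetical reconstruction of the notebook's inference cells; everything
# not shown in this commit (paths, scaling, mel parameters) is an assumption.
import numpy as np
import librosa
from diffusers import DDPMPipeline

pipeline = DDPMPipeline.from_pretrained("ddpm-ema-audio-64")  # --output_dir from training
image = pipeline(batch_size=1).images[0]  # one synthetic mel spectrogram as a PIL image

hop_length, sample_rate, n_fft = 1024, 22050, 2048
log_S = np.array(image).astype(np.float32) / 255.0 * 80.0 - 80.0
audio = librosa.feature.inverse.mel_to_audio(
    librosa.db_to_power(log_S), sr=sample_rate, n_fft=n_fft, hop_length=hop_length
)
```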
src/train_unconditional.py
CHANGED
@@ -1,3 +1,5 @@
+# based on https://github.com/huggingface/diffusers/blob/main/examples/train_unconditional.py
+
 import argparse
 import os
 
@@ -30,7 +32,8 @@ logger = get_logger(__name__)
 
 
 def main(args):
-
+    output_dir = os.environ.get("SM_MODEL_DIR", None) or args.output_dir
+    logging_dir = os.path.join(output_dir, args.logging_dir)
     accelerator = Accelerator(
         mixed_precision=args.mixed_precision,
         log_with="tensorboard",
@@ -122,7 +125,7 @@ def main(args):
     )
 
     ema_model = EMAModel(
-        model,
+        getattr(model, "module", model),
         inv_gamma=args.ema_inv_gamma,
         power=args.ema_power,
         max_value=args.ema_max_decay,
@@ -234,7 +237,7 @@ def main(args):
                     blocking=False,
                 )
             else:
-                pipeline.save_pretrained(
+                pipeline.save_pretrained(output_dir)
         accelerator.wait_for_everyone()
 
     accelerator.end_training()
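Two of these changes are generic patterns worth spelling out: resolving the output directory from `SM_MODEL_DIR` (the directory SageMaker uploads to S3 when a training job ends) with a fallback to the CLI argument, and unwrapping a possibly DDP-wrapped model before handing it to `EMAModel`, since under multi-GPU training the real parameters live behind `.module`. A minimal self-contained sketch, with `_Wrapped` standing in for `torch.nn.parallel.DistributedDataParallel`:

```python
# Minimal illustration of the two patterns this commit introduces; _Wrapped is
# a stand-in for torch.nn.parallel.DistributedDataParallel, which exposes the
# underlying model as .module.
import os

class _Wrapped:
    def __init__(self, module):
        self.module = module

# 1) Prefer SageMaker's model directory when the env var is set (SageMaker
#    uploads whatever lands there to S3 after the job), else the CLI value.
output_dir = os.environ.get("SM_MODEL_DIR", None) or "ddpm-ema-audio-64"

# 2) getattr with a default yields model.module for a DDP-wrapped model and
#    the model itself otherwise, so EMAModel always sees the bare module.
bare = object()
for model in (_Wrapped(bare), bare):
    assert getattr(model, "module", model) is bare
```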