Add model and config files
#1
by
ungersboeck
- opened
- LICENSE +21 -0
- README.md +75 -3
- config.json +85 -0
- discriminator.pt +3 -0
- model.pt +3 -0
- model_optimizer.pt +3 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2024 ETH DISCO
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,3 +1,75 @@
|
|
1 |
-
---
|
2 |
-
license: mit
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: mit
|
3 |
+
tags:
|
4 |
+
- neural-vocoder
|
5 |
+
- audio
|
6 |
+
---
|
7 |
+
# DisCoder: High-Fidelity Music Vocoder Using Neural Audio Codecs
|
8 |
+
|
9 |
+
[Paper]() | [Samples](https://lucala.github.io/discoder/) | [Code](https://github.com/ETH-DISCO/discoder) | [Model](https://huggingface.co/disco-eth/discoder)
|
10 |
+
|
11 |
+
DisCoder is a neural vocoder that leverages a generative adversarial encoder-decoder architecture informed by
|
12 |
+
a neural audio codec to reconstruct high-fidelity 44.1 kHz audio from mel spectrograms. Our approach first transforms
|
13 |
+
the mel spectrogram into a lower-dimensional representation aligned with the Descript Audio Codec (DAC) latent space
|
14 |
+
before reconstructing it to an audio signal using a fine-tuned DAC decoder.
|
15 |
+
|
16 |
+
|
17 |
+
## Installation
|
18 |
+
The codebase has been tested with Python 3.11. To get started, clone the repository and set up the environment using Conda:
|
19 |
+
```shell
|
20 |
+
git clone https://github.com/ETH-DISCO/discoder && cd discoder
|
21 |
+
|
22 |
+
conda create -n discoder python=3.11
|
23 |
+
conda activate discoder
|
24 |
+
python -m pip install -r requirements.txt
|
25 |
+
```
|
26 |
+
|
27 |
+
## Inference with 🤗 Hugging Face
|
28 |
+
Use the following script to perform inference with the pretrained DisCoder model from Hugging Face.
|
29 |
+
The model uses the z prediction target and was trained using 128 mel bins.
|
30 |
+
```python
|
31 |
+
import torch
|
32 |
+
from discoder.models import DisCoder
|
33 |
+
from discoder import meldataset, utils
|
34 |
+
|
35 |
+
device = "cuda"
|
36 |
+
sr_target = 44100
|
37 |
+
|
38 |
+
# load pretrained DisCoder model
|
39 |
+
discoder = DisCoder.from_pretrained("disco-eth/discoder")
|
40 |
+
discoder = discoder.eval().to(device)
|
41 |
+
|
42 |
+
# load 44.1 kHz audio file and create mel spectrogram
|
43 |
+
audio, _ = meldataset.load_wav(full_path="path/to/audio.wav", sr_target=sr_target, resample=True, normalize=True)
|
44 |
+
audio = torch.tensor(audio).unsqueeze(dim=0).to(device)
|
45 |
+
mel = utils.get_mel_spectrogram_from_config(audio, discoder.config) # [B, 128, frames]
|
46 |
+
|
47 |
+
# reconstruct audio
|
48 |
+
with torch.no_grad():
|
49 |
+
    wav_recon = discoder(mel) # [B, 1, time]
|
50 |
+
```
|
51 |
+
|
52 |
+
|
53 |
+
## Training
|
54 |
+
To calculate [ViSQOL](https://github.com/google/visqol) during validation, install the required library by following the steps below:
|
55 |
+
```shell
|
56 |
+
cd discoder
|
57 |
+
git clone https://github.com/google/visqol
|
58 |
+
(cd visqol && bazel build :visqol -c opt)
|
59 |
+
|
60 |
+
cd visqol && pip install .
|
61 |
+
```
|
62 |
+
|
63 |
+
To start training, use the following command:
|
64 |
+
```shell
|
65 |
+
python -u train.py --config configs/config_z.json
|
66 |
+
```
|
67 |
+
|
68 |
+
|
69 |
+
## Inference
|
70 |
+
The inference script allows batch processing of audio files. It converts all WAV files in the specified `input_dir` to
|
71 |
+
mel spectrograms, then reconstructs them into audio files in the `output_dir`.
|
72 |
+
```shell
|
73 |
+
python -u inference.py --input_dir input_dir --output_dir output_dir --checkpoint_file model.pt --config configs/config_z.json
|
74 |
+
```
|
75 |
+
You can also pass the `normalize_volume` flag to standardize the output volume.
|
config.json
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"wandb": {
|
3 |
+
"mode": "disabled",
|
4 |
+
"checkpoint": "checkpoint_name",
|
5 |
+
"fork_checkpoint": false,
|
6 |
+
"project": "project_name",
|
7 |
+
"wandb_prefix": "wandb_prefix",
|
8 |
+
"dir": "wandb_dir"
|
9 |
+
},
|
10 |
+
"local": {
|
11 |
+
"checkpoint_model": null,
|
12 |
+
"checkpoint_discriminator": null
|
13 |
+
},
|
14 |
+
"checkpoint_dir": "path/to/checkpoints",
|
15 |
+
"seed": 123,
|
16 |
+
"learning_rate": 0.0001,
|
17 |
+
"dataset": "Jamendo, LibriTTS",
|
18 |
+
"train_datafile": "data/train.txt",
|
19 |
+
"validation_datafile": "data/val_short.txt",
|
20 |
+
"n_epochs": 500000,
|
21 |
+
"step_checkpoint": 10000,
|
22 |
+
"step_media_log": 2000,
|
23 |
+
"batch_grad_log": 2000,
|
24 |
+
"batch_size": 32,
|
25 |
+
"sample_rate": 44100,
|
26 |
+
"exp_gamma": 0.9995,
|
27 |
+
"adam_b1": 0.8,
|
28 |
+
"adam_b2": 0.99,
|
29 |
+
"segment_size": 16384,
|
30 |
+
"segment_size_val": 262144,
|
31 |
+
"n_cache_reuse": 30,
|
32 |
+
"num_workers": 4,
|
33 |
+
"prefetch_factor": 2,
|
34 |
+
"use_discriminator": true,
|
35 |
+
"unfreeze": {
|
36 |
+
"steps": 100000,
|
37 |
+
"loss_multiplier": {
|
38 |
+
"loss_z": 0,
|
39 |
+
"loss_ms_mel": 15,
|
40 |
+
"loss_ms_stft": 1,
|
41 |
+
"loss_adv_gen": 1,
|
42 |
+
"loss_adv_feat": 2,
|
43 |
+
"loss_waveform": 1
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"model": {
|
47 |
+
"latent_dim": 1024,
|
48 |
+
"n_codebooks": 9,
|
49 |
+
"codebook_dim": 8,
|
50 |
+
"codebook_size": 1024,
|
51 |
+
"n_resblocks": 6,
|
52 |
+
"initial_out_channels": 1024,
|
53 |
+
"intermediate_dim": 1152,
|
54 |
+
"resblock_type": "AMP",
|
55 |
+
"resblock_kernel_sizes": [3, 3, 7, 7, 11, 11],
|
56 |
+
"resblock_dilations": [[1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5]],
|
57 |
+
"predict_type": "z",
|
58 |
+
"activation": "snake"
|
59 |
+
},
|
60 |
+
"disc": {
|
61 |
+
"resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
|
62 |
+
"periods": [2, 3, 5, 7, 11],
|
63 |
+
"discriminator_channel_mult": 1,
|
64 |
+
"use_spectral_norm": false
|
65 |
+
},
|
66 |
+
"mel": {
|
67 |
+
"n_fft": 1024,
|
68 |
+
"win_length": 1024,
|
69 |
+
"hop_length": 256,
|
70 |
+
"f_min": 0,
|
71 |
+
"f_max": null,
|
72 |
+
"n_mels": 128
|
73 |
+
},
|
74 |
+
"loss_multiplier": {
|
75 |
+
"loss_z": 15,
|
76 |
+
"loss_ms_mel": 15,
|
77 |
+
"loss_ms_stft": 1,
|
78 |
+
"loss_adv_gen": 1,
|
79 |
+
"loss_adv_feat": 2,
|
80 |
+
"loss_waveform": 1
|
81 |
+
},
|
82 |
+
"backend": {
|
83 |
+
"master_port": 12359
|
84 |
+
}
|
85 |
+
}
|
discriminator.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f38ebec119d21568695a4b5fe8417ccaeafdc4f71548e1f197fb557e709142b6
|
3 |
+
size 510776870
|
model.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eef7b318c4a74fb6c73ab50b69990a79f3bd60086195f18c738f0cc040e12f23
|
3 |
+
size 1720455854
|
model_optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a4946fddfed8039f7d36545d0af210105e13934d8358944b082d5f3b012ff0c2
|
3 |
+
size 3440876956
|