Add model and config files

#1
by ungersboeck - opened
Files changed (6)
  1. LICENSE +21 -0
  2. README.md +75 -3
  3. config.json +85 -0
  4. discriminator.pt +3 -0
  5. model.pt +3 -0
  6. model_optimizer.pt +3 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 ETH DISCO
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,75 @@
- ---
- license: mit
- ---
+ ---
+ license: mit
+ tags:
+ - neural-vocoder
+ - audio
+ ---
+ # DisCoder: High-Fidelity Music Vocoder Using Neural Audio Codecs
+
+ [Paper]() | [Samples](https://lucala.github.io/discoder/) | [Code](https://github.com/ETH-DISCO/discoder) | [Model](https://huggingface.co/disco-eth/discoder)
+
+ DisCoder is a neural vocoder that leverages a generative adversarial encoder-decoder architecture informed by
+ a neural audio codec to reconstruct high-fidelity 44.1 kHz audio from mel spectrograms. Our approach first transforms
+ the mel spectrogram into a lower-dimensional representation aligned with the Descript Audio Codec (DAC) latent space
+ before reconstructing it into an audio signal using a fine-tuned DAC decoder.
+
+
+ ## Installation
+ The codebase has been tested with Python 3.11. To get started, clone the repository and set up the environment using Conda:
+ ```shell
+ git clone https://github.com/ETH-DISCO/discoder
+ cd discoder
+ conda create -n discoder python=3.11
+ conda activate discoder
+ python -m pip install -r requirements.txt
+ ```
+
+ ## Inference with 🤗 Hugging Face
+ Use the following script to perform inference with the pretrained DisCoder model from Hugging Face.
+ The model uses the `z` prediction target and was trained with 128 mel bins.
+ ```python
+ import torch
+ from discoder.models import DisCoder
+ from discoder import meldataset, utils
+
+ device = "cuda"
+ sr_target = 44100
+
+ # load pretrained DisCoder model
+ discoder = DisCoder.from_pretrained("disco-eth/discoder")
+ discoder = discoder.eval().to(device)
+
+ # load 44.1 kHz audio file and create mel spectrogram
+ audio, _ = meldataset.load_wav(full_path="path/to/audio.wav", sr_target=sr_target, resample=True, normalize=True)
+ audio = torch.tensor(audio).unsqueeze(dim=0).to(device)
+ mel = utils.get_mel_spectrogram_from_config(audio, discoder.config)  # [B, 128, frames]
+
+ # reconstruct audio
+ with torch.no_grad():
+     wav_recon = discoder(mel)  # [B, 1, time]
+ ```
+
+
+ ## Training
+ To calculate [ViSQOL](https://github.com/google/visqol) during validation, install the required library by following the steps below:
+ ```shell
+ cd discoder
+ git clone https://github.com/google/visqol
+ cd visqol
+ bazel build :visqol -c opt
+ pip install .
+ ```
+
+ To start training, use the following command:
+ ```shell
+ python -u train.py --config configs/config_z.json
+ ```
+
+
+ ## Inference
+ The inference script allows batch processing of audio files. It converts all WAV files in the specified `input_dir` to
+ mel spectrograms, then reconstructs them into audio files in the `output_dir`.
+ ```shell
+ python -u inference.py --input_dir input_dir --output_dir output_dir --checkpoint_file model.pt --config configs/config_z.json
+ ```
+ You can also pass the `normalize_volume` flag to standardize the output volume.
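Note: the README snippet above stops at the reconstructed tensor. As a minimal follow-up sketch that is not part of this PR, assuming `torchaudio` is installed, the output could be written to disk like this:
```python
import torchaudio

# wav_recon from the README snippet has shape [B, 1, time];
# torchaudio.save expects a [channels, time] tensor on the CPU.
torchaudio.save("reconstruction.wav", wav_recon.squeeze(0).cpu(), sample_rate=44100)
```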
config.json ADDED
@@ -0,0 +1,85 @@
+ {
+   "wandb": {
+     "mode": "disabled",
+     "checkpoint": "checkpoint_name",
+     "fork_checkpoint": false,
+     "project": "project_name",
+     "wandb_prefix": "wandb_prefix",
+     "dir": "wandb_dir"
+   },
+   "local": {
+     "checkpoint_model": null,
+     "checkpoint_discriminator": null
+   },
+   "checkpoint_dir": "path/to/checkpoints",
+   "seed": 123,
+   "learning_rate": 0.0001,
+   "dataset": "Jamendo, LibriTTS",
+   "train_datafile": "data/train.txt",
+   "validation_datafile": "data/val_short.txt",
+   "n_epochs": 500000,
+   "step_checkpoint": 10000,
+   "step_media_log": 2000,
+   "batch_grad_log": 2000,
+   "batch_size": 32,
+   "sample_rate": 44100,
+   "exp_gamma": 0.9995,
+   "adam_b1": 0.8,
+   "adam_b2": 0.99,
+   "segment_size": 16384,
+   "segment_size_val": 262144,
+   "n_cache_reuse": 30,
+   "num_workers": 4,
+   "prefetch_factor": 2,
+   "use_discriminator": true,
+   "unfreeze": {
+     "steps": 100000,
+     "loss_multiplier": {
+       "loss_z": 0,
+       "loss_ms_mel": 15,
+       "loss_ms_stft": 1,
+       "loss_adv_gen": 1,
+       "loss_adv_feat": 2,
+       "loss_waveform": 1
+     }
+   },
+   "model": {
+     "latent_dim": 1024,
+     "n_codebooks": 9,
+     "codebook_dim": 8,
+     "codebook_size": 1024,
+     "n_resblocks": 6,
+     "initial_out_channels": 1024,
+     "intermediate_dim": 1152,
+     "resblock_type": "AMP",
+     "resblock_kernel_sizes": [3, 3, 7, 7, 11, 11],
+     "resblock_dilations": [[1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5]],
+     "predict_type": "z",
+     "activation": "snake"
+   },
+   "disc": {
+     "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
+     "periods": [2, 3, 5, 7, 11],
+     "discriminator_channel_mult": 1,
+     "use_spectral_norm": false
+   },
+   "mel": {
+     "n_fft": 1024,
+     "win_length": 1024,
+     "hop_length": 256,
+     "f_min": 0,
+     "f_max": null,
+     "n_mels": 128
+   },
+   "loss_multiplier": {
+     "loss_z": 15,
+     "loss_ms_mel": 15,
+     "loss_ms_stft": 1,
+     "loss_adv_gen": 1,
+     "loss_adv_feat": 2,
+     "loss_waveform": 1
+   },
+   "backend": {
+     "master_port": 12359
+   }
+ }
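Note: the `mel` block above pins down the spectrogram geometry the model expects: a 1024-point FFT with hop length 256 and 128 mel bins at 44.1 kHz, so each mel frame covers roughly 5.8 ms of audio. The repository provides `utils.get_mel_spectrogram_from_config` for this; purely as an illustrative sketch (exact scaling and normalization may differ from the repo's implementation), an equivalent transform could be assembled with `torchaudio`:
```python
import json
import torchaudio

with open("config.json") as f:
    cfg = json.load(f)

mel_cfg = cfg["mel"]
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=cfg["sample_rate"],    # 44100
    n_fft=mel_cfg["n_fft"],            # 1024
    win_length=mel_cfg["win_length"],  # 1024
    hop_length=mel_cfg["hop_length"],  # 256
    f_min=mel_cfg["f_min"],            # 0
    f_max=mel_cfg["f_max"],            # null -> None, i.e. up to Nyquist
    n_mels=mel_cfg["n_mels"],          # 128
)  # maps [B, time] waveforms to [B, 128, frames] mel spectrograms
```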
discriminator.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f38ebec119d21568695a4b5fe8417ccaeafdc4f71548e1f197fb557e709142b6
+ size 510776870
model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eef7b318c4a74fb6c73ab50b69990a79f3bd60086195f18c738f0cc040e12f23
+ size 1720455854
model_optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a4946fddfed8039f7d36545d0af210105e13934d8358944b082d5f3b012ff0c2
+ size 3440876956
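Note: the three `.pt` entries are Git LFS pointer files, so a plain clone fetches only the stubs above; `git lfs pull` downloads the actual checkpoints (about 0.5 GB, 1.7 GB, and 3.4 GB respectively). As a hypothetical sanity check after the download, assuming the checkpoints are ordinary `torch.save` artifacts:
```python
import torch

# Load on CPU so no GPU is needed just to inspect the file.
# The exact contents (state dict vs. wrapper dict) are an assumption here.
checkpoint = torch.load("model.pt", map_location="cpu")
print(type(checkpoint))
```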