Add model and config files

#1
by ungersboeck - opened
Files changed (6)
  1. LICENSE +21 -0
  2. README.md +75 -3
  3. config.json +85 -0
  4. discriminator.pt +3 -0
  5. model.pt +3 -0
  6. model_optimizer.pt +3 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 ETH DISCO
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,75 @@
- ---
- license: mit
- ---
+ ---
+ license: mit
+ tags:
+ - neural-vocoder
+ - audio
+ ---
+ # DisCoder: High-Fidelity Music Vocoder Using Neural Audio Codecs
+
+ [Paper]() | [Samples](https://lucala.github.io/discoder/) | [Code](https://github.com/ETH-DISCO/discoder) | [Model](https://huggingface.co/disco-eth/discoder)
+
+ DisCoder is a neural vocoder that leverages a generative adversarial encoder-decoder architecture informed by
+ a neural audio codec to reconstruct high-fidelity 44.1 kHz audio from mel spectrograms. Our approach first transforms
+ the mel spectrogram into a lower-dimensional representation aligned with the Descript Audio Codec (DAC) latent space
+ before reconstructing it into an audio signal using a fine-tuned DAC decoder.
+
+
+ ## Installation
+ The codebase has been tested with Python 3.11. To get started, clone the repository and set up the environment using Conda:
+ ```shell
+ git clone https://github.com/ETH-DISCO/discoder
+ cd discoder
+ conda create -n discoder python=3.11
+ conda activate discoder
+ python -m pip install -r requirements.txt
+ ```
+
+ ## Inference with 🤗 Hugging Face
+ Use the following script to perform inference with the pretrained DisCoder model from Hugging Face.
+ The model uses the `z` prediction target and was trained with 128 mel bins.
+ ```python
+ import torch
+ from discoder.models import DisCoder
+ from discoder import meldataset, utils
+
+ device = "cuda"
+ sr_target = 44100
+
+ # load pretrained DisCoder model
+ discoder = DisCoder.from_pretrained("disco-eth/discoder")
+ discoder = discoder.eval().to(device)
+
+ # load 44.1 kHz audio file and create mel spectrogram
+ audio, _ = meldataset.load_wav(full_path="path/to/audio.wav", sr_target=sr_target, resample=True, normalize=True)
+ audio = torch.tensor(audio).unsqueeze(dim=0).to(device)
+ mel = utils.get_mel_spectrogram_from_config(audio, discoder.config)  # [B, 128, frames]
+
+ # reconstruct audio
+ with torch.no_grad():
+     wav_recon = discoder(mel)  # [B, 1, time]
+ ```
+
+
+ ## Training
+ To calculate [ViSQOL](https://github.com/google/visqol) during validation, install the required library by following the steps below:
+ ```shell
+ cd discoder
+ git clone https://github.com/google/visqol
+ cd visqol
+ bazel build :visqol -c opt
+ pip install .
+ ```
+
+ To start training, use the following command:
+ ```shell
+ python -u train.py --config configs/config_z.json
+ ```
+
+
+ ## Inference
+ The inference script allows batch processing of audio files. It converts all WAV files in the specified `input_dir` to
+ mel spectrograms, then reconstructs them into audio files in the `output_dir`.
+ ```shell
+ python -u inference.py --input_dir input_dir --output_dir output_dir --checkpoint_file model.pt --config configs/config_z.json
+ ```
+ You can also pass the `normalize_volume` flag to standardize the output volume.
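Note: the README snippet above stops at the reconstructed tensor. As a minimal follow-up sketch that is not part of this PR, assuming `torchaudio` is installed, the output could be written to disk like this:
```python
import torchaudio

# wav_recon from the README snippet has shape [B, 1, time];
# torchaudio.save expects a [channels, time] tensor on the CPU.
torchaudio.save("reconstruction.wav", wav_recon.squeeze(0).cpu(), sample_rate=44100)
```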
config.json ADDED
@@ -0,0 +1,85 @@
+ {
+   "wandb": {
+     "mode": "disabled",
+     "checkpoint": "checkpoint_name",
+     "fork_checkpoint": false,
+     "project": "project_name",
+     "wandb_prefix": "wandb_prefix",
+     "dir": "wandb_dir"
+   },
+   "local": {
+     "checkpoint_model": null,
+     "checkpoint_discriminator": null
+   },
+   "checkpoint_dir": "path/to/checkpoints",
+   "seed": 123,
+   "learning_rate": 0.0001,
+   "dataset": "Jamendo, LibriTTS",
+   "train_datafile": "data/train.txt",
+   "validation_datafile": "data/val_short.txt",
+   "n_epochs": 500000,
+   "step_checkpoint": 10000,
+   "step_media_log": 2000,
+   "batch_grad_log": 2000,
+   "batch_size": 32,
+   "sample_rate": 44100,
+   "exp_gamma": 0.9995,
+   "adam_b1": 0.8,
+   "adam_b2": 0.99,
+   "segment_size": 16384,
+   "segment_size_val": 262144,
+   "n_cache_reuse": 30,
+   "num_workers": 4,
+   "prefetch_factor": 2,
+   "use_discriminator": true,
+   "unfreeze": {
+     "steps": 100000,
+     "loss_multiplier": {
+       "loss_z": 0,
+       "loss_ms_mel": 15,
+       "loss_ms_stft": 1,
+       "loss_adv_gen": 1,
+       "loss_adv_feat": 2,
+       "loss_waveform": 1
+     }
+   },
+   "model": {
+     "latent_dim": 1024,
+     "n_codebooks": 9,
+     "codebook_dim": 8,
+     "codebook_size": 1024,
+     "n_resblocks": 6,
+     "initial_out_channels": 1024,
+     "intermediate_dim": 1152,
+     "resblock_type": "AMP",
+     "resblock_kernel_sizes": [3, 3, 7, 7, 11, 11],
+     "resblock_dilations": [[1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5]],
+     "predict_type": "z",
+     "activation": "snake"
+   },
+   "disc": {
+     "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
+     "periods": [2, 3, 5, 7, 11],
+     "discriminator_channel_mult": 1,
+     "use_spectral_norm": false
+   },
+   "mel": {
+     "n_fft": 1024,
+     "win_length": 1024,
+     "hop_length": 256,
+     "f_min": 0,
+     "f_max": null,
+     "n_mels": 128
+   },
+   "loss_multiplier": {
+     "loss_z": 15,
+     "loss_ms_mel": 15,
+     "loss_ms_stft": 1,
+     "loss_adv_gen": 1,
+     "loss_adv_feat": 2,
+     "loss_waveform": 1
+   },
+   "backend": {
+     "master_port": 12359
+   }
+ }
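Note: the `mel` block above pins down the spectrogram geometry the model expects: a 1024-point FFT with hop length 256 and 128 mel bins at 44.1 kHz, so each mel frame covers roughly 5.8 ms of audio. The repository provides `utils.get_mel_spectrogram_from_config` for this; purely as an illustrative sketch (exact scaling and normalization may differ from the repo's implementation), an equivalent transform could be assembled with `torchaudio`:
```python
import json
import torchaudio

with open("config.json") as f:
    cfg = json.load(f)

mel_cfg = cfg["mel"]
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=cfg["sample_rate"],    # 44100
    n_fft=mel_cfg["n_fft"],            # 1024
    win_length=mel_cfg["win_length"],  # 1024
    hop_length=mel_cfg["hop_length"],  # 256
    f_min=mel_cfg["f_min"],            # 0
    f_max=mel_cfg["f_max"],            # null -> None, i.e. up to Nyquist
    n_mels=mel_cfg["n_mels"],          # 128
)  # maps [B, time] waveforms to [B, 128, frames] mel spectrograms
```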
discriminator.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f38ebec119d21568695a4b5fe8417ccaeafdc4f71548e1f197fb557e709142b6
+ size 510776870
model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eef7b318c4a74fb6c73ab50b69990a79f3bd60086195f18c738f0cc040e12f23
+ size 1720455854
model_optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a4946fddfed8039f7d36545d0af210105e13934d8358944b082d5f3b012ff0c2
+ size 3440876956
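Note: the three `.pt` entries are Git LFS pointer files, so a plain clone fetches only the stubs above; `git lfs pull` downloads the actual checkpoints (about 0.5 GB, 1.7 GB, and 3.4 GB respectively). As a hypothetical sanity check after the download, assuming the checkpoints are ordinary `torch.save` artifacts:
```python
import torch

# Load on CPU so no GPU is needed just to inspect the file.
# The exact contents (state dict vs. wrapper dict) are an assumption here.
checkpoint = torch.load("model.pt", map_location="cpu")
print(type(checkpoint))
```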