diff --git a/.gitignore b/.gitignore index 2e0e007618af6c37752803b7f09263bbdf00d02e..30afab233d17ae0fee2527c6baf314466574fae8 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,7 @@ egs/svc/dev_exp_config.json bins/svc/demo* bins/svc/preprocess_custom.py data +ckpts # Data and ckpt *.pkl diff --git a/app.py b/app.py index ae800ac455bdfc38d48da3b70360798a2ef73e6a..93a0d33724f1002815be998a85a683af7c7967ab 100644 --- a/app.py +++ b/app.py @@ -1,24 +1,125 @@ import gradio as gr +import argparse import os import torch +import soundfile as sf +import numpy as np +from models.tts.naturalspeech2.ns2 import NaturalSpeech2 +from encodec import EncodecModel +from encodec.utils import convert_audio +from utils.util import load_config +from text import text_to_sequence +from text.cmudict import valid_symbols +from text.g2p import preprocess_english, read_lexicon -def build_codec(): - ... +import torchaudio + + +def build_codec(device): + encodec_model = EncodecModel.encodec_model_24khz() + encodec_model = encodec_model.to(device=device) + encodec_model.set_target_bandwidth(12.0) + return encodec_model + +def build_model(cfg, device): + + model = NaturalSpeech2(cfg.model) + model.load_state_dict( + torch.load( + "ckpts/ns2/pytorch_model.bin", + map_location="cpu", + ) + ) + model = model.to(device=device) + return model -def build_model(): - ... def ns2_inference( - prmopt_audio_path, - text, - diffusion_steps=100, + prmopt_audio_path, + text, + diffusion_steps=100, ): device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') -demo_inputs = ... -demo_outputs = ... + os.environ["WORK_DIR"] = "./" + cfg = load_config("egs/tts/NaturalSpeech2/exp_config.json") + + model = build_model(cfg, device) + codec = build_codec(device) + + ref_wav_path = prmopt_audio_path + ref_wav, sr = torchaudio.load(ref_wav_path) + ref_wav = convert_audio( + ref_wav, sr, codec.sample_rate, codec.channels + ) + ref_wav = ref_wav.unsqueeze(0).to(device=device) + + with torch.no_grad(): + encoded_frames = codec.encode(ref_wav) + ref_code = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1) + + ref_mask = torch.ones(ref_code.shape[0], ref_code.shape[-1]).to(ref_code.device) + + symbols = valid_symbols + ["sp", "spn", "sil"] + ["", ""] + phone2id = {s: i for i, s in enumerate(symbols)} + id2phone = {i: s for s, i in phone2id.items()} + + lexicon = read_lexicon(cfg.preprocess.lexicon_path) + phone_seq = preprocess_english(text, lexicon) + + + phone_id = np.array( + [ + *map( + phone2id.get, + phone_seq.replace("{", "").replace("}", "").split(), + ) + ] + ) + phone_id = torch.from_numpy(phone_id).unsqueeze(0).to(device=device) + + + x0, prior_out = model.inference( + ref_code, phone_id, ref_mask, diffusion_steps + ) + + latent_ref = codec.quantizer.vq.decode(ref_code.transpose(0, 1)) + rec_wav = codec.decoder(x0) + + os.makedirs("result", exist_ok=True) + sf.write( + "result/{}.wav".format(prmopt_audio_path.split("/")[-1][:-4] + "_zero_shot_result"), + rec_wav[0, 0].detach().cpu().numpy(), + samplerate=24000, + ) + + result_file = "result/{}.wav".format(prmopt_audio_path.split("/")[-1][:-4] + "_zero_shot_result") + return result_file + + +demo_inputs = [ + gr.Audio( + sources=["upload", "microphone"], + label="Upload a reference speech you want to clone timbre", + type="filepath", + ), + gr.Textbox( + value="Amphion is a toolkit that can speak, make sounds, and sing.", + label="Text you want to generate", + type="text", + ), + gr.Slider( + 10, + 1000, + value=200, + step=1, + label="Diffusion Inference Steps", 
+ info="As the step number increases, the synthesis quality will be better while the inference speed will be lower", + ), +] +demo_outputs = gr.Audio(label="") demo = gr.Interface( fn=ns2_inference, diff --git a/egs/datasets/README.md b/egs/datasets/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d6a4931373ad9986a52ed17d7574a6502c6b4d04 --- /dev/null +++ b/egs/datasets/README.md @@ -0,0 +1,381 @@ +# Datasets Format + +Amphion support the following academic datasets (sort alphabetically): + +- [Datasets Format](#datasets-format) + - [AudioCaps](#audiocaps) + - [CSD](#csd) + - [KiSing](#kising) + - [LibriTTS](#libritts) + - [LJSpeech](#ljspeech) + - [M4Singer](#m4singer) + - [NUS-48E](#nus-48e) + - [Opencpop](#opencpop) + - [OpenSinger](#opensinger) + - [Opera](#opera) + - [PopBuTFy](#popbutfy) + - [PopCS](#popcs) + - [PJS](#pjs) + - [SVCC](#svcc) + - [VCTK](#vctk) + +The downloading link and the file structure tree of each dataset is displayed as follows. + +## AudioCaps + +AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. You can download the dataset [here](https://github.com/cdjkim/audiocaps). The file structure tree is like: + +```plaintext +[AudioCaps dataset path] +┣ AudioCpas +┃   ┣ wav +┃ ┃ ┣ ---1_cCGK4M_0_10000.wav +┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav +┃ ┃ ┣ ... +``` + +## CSD + +The official CSD dataset can be download [here](https://zenodo.org/records/4785016). The file structure tree is like: + +```plaintext +[CSD dataset path] + ┣ english + ┣ korean + ┣ utterances + ┃ ┣ en001a + ┃ ┃ ┣ {UtterenceID}.wav + ┃ ┣ en001b + ┃ ┣ en002a + ┃ ┣ en002b + ┃ ┣ ... + ┣ README +``` + +## KiSing + +The official KiSing dataset can be download [here](http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/). The file structure tree is like: + +```plaintext +[KiSing dataset path] + ┣ clean + ┃ ┣ 421 + ┃ ┣ 422 + ┃ ┣ ... +``` + +## LibriTTS + +The official LibriTTS dataset can be download [here](https://www.openslr.org/60/). The file structure tree is like: + +```plaintext +[LibriTTS dataset path] + ┣ BOOKS.txt + ┣ CHAPTERS.txt + ┣ eval_sentences10.tsv + ┣ LICENSE.txt + ┣ NOTE.txt + ┣ reader_book.tsv + ┣ README_librispeech.txt + ┣ README_libritts.txt + ┣ speakers.tsv + ┣ SPEAKERS.txt + ┣ dev-clean (Subset) + ┃ ┣ 1272{Speaker_ID} + ┃ ┃ ┣ 128104 {Chapter_ID} + ┃ ┃ ┃ ┣ 1272_128104_000001_000000.normalized.txt + ┃ ┃ ┃ ┣ 1272_128104_000001_000000.original.txt + ┃ ┃ ┃ ┣ 1272_128104_000001_000000.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┃ ┣ 1272_128104.book.tsv + ┃ ┃ ┃ ┣ 1272_128104.trans.tsv + ┃ ┃ ┣ ... + ┃ ┣ ... + ┣ dev-other (Subset) + ┃ ┣ 116 (Speaker) + ┃ ┃ ┣ 288045 {Chapter_ID} + ┃ ┃ ┃ ┣ 116_288045_000003_000000.normalized.txt + ┃ ┃ ┃ ┣ 116_288045_000003_000000.original.txt + ┃ ┃ ┃ ┣ 116_288045_000003_000000.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┃ ┣ 116_288045.book.tsv + ┃ ┃ ┃ ┣ 116_288045.trans.tsv + ┃ ┃ ┣ ... + ┃ ┣ ... + ┃ ┣ ... + ┣ test-clean (Subset) + ┃ ┣ {Speaker_ID} + ┃ ┃ ┣ {Chapter_ID} + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv + ┃ ┃ ┣ ... + ┃ ┣ ... 
+ ┣ test-other + ┃ ┣ {Speaker_ID} + ┃ ┃ ┣ {Chapter_ID} + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv + ┃ ┃ ┣ ... + ┃ ┣ ... + ┣ train-clean-100 + ┃ ┣ {Speaker_ID} + ┃ ┃ ┣ {Chapter_ID} + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv + ┃ ┃ ┣ ... + ┃ ┣ ... + ┣ train-clean-360 + ┃ ┣ {Speaker_ID} + ┃ ┃ ┣ {Chapter_ID} + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv + ┃ ┃ ┣ ... + ┃ ┣ ... + ┣ train-other-500 + ┃ ┣ {Speaker_ID} + ┃ ┃ ┣ {Chapter_ID} + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv + ┃ ┃ ┣ ... + ┃ ┣ ... +``` + + +## LJSpeech + +The official LJSpeech dataset can be download [here](https://keithito.com/LJ-Speech-Dataset/). The file structure tree is like: + +```plaintext +[LJSpeech dataset path] + ┣ metadata.csv + ┣ wavs + ┃ ┣ LJ001-0001.wav + ┃ ┣ LJ001-0002.wav + ┃ ┣ ... + ┣ README +``` + +## M4Singer + +The official M4Singer dataset can be downloaded [here](https://drive.google.com/file/d/1xC37E59EWRRFFLdG3aJkVqwtLDgtFNqW/view). The file structure tree is like: + +```plaintext +[M4Singer dataset path] + ┣ {Singer_1}#{Song_1} + ┃ ┣ 0000.mid + ┃ ┣ 0000.TextGrid + ┃ ┣ 0000.wav + ┃ ┣ ... + ┣ {Singer_1}#{Song_2} + ┣ ... + ┣ {Singer_2}#{Song_1} + ┣ {Singer_2}#{Song_2} + ┣ ... + ┗ meta.json +``` + +## NUS-48E + +The official NUS-48E dataset can be download [here](https://drive.google.com/drive/folders/12pP9uUl0HTVANU3IPLnumTJiRjPtVUMx). The file structure tree is like: + +```plaintext +[NUS-48E dataset path] + ┣ {SpeakerID} + ┃ ┣ read + ┃ ┃ ┣ {SongID}.txt + ┃ ┃ ┣ {SongID}.wav + ┃ ┃ ┣ ... + ┃ ┣ sing + ┃ ┃ ┣ {SongID}.txt + ┃ ┃ ┣ {SongID}.wav + ┃ ┃ ┣ ... + ┣ ... + ┣ README.txt + +``` + +## Opencpop + +The official Opera dataset can be downloaded [here](https://wenet.org.cn/opencpop/). The file structure tree is like: + +```plaintext +[Opencpop dataset path] + ┣ midis + ┃ ┣ 2001.midi + ┃ ┣ 2002.midi + ┃ ┣ 2003.midi + ┃ ┣ ... + ┣ segments + ┃ ┣ wavs + ┃ ┃ ┣ 2001000001.wav + ┃ ┃ ┣ 2001000002.wav + ┃ ┃ ┣ 2001000003.wav + ┃ ┃ ┣ ... + ┃ ┣ test.txt + ┃ ┣ train.txt + ┃ ┗ transcriptions.txt + ┣ textgrids + ┃ ┣ 2001.TextGrid + ┃ ┣ 2002.TextGrid + ┃ ┣ 2003.TextGrid + ┃ ┣ ... + ┣ wavs + ┃ ┣ 2001.wav + ┃ ┣ 2002.wav + ┃ ┣ 2003.wav + ┃ ┣ ... + ┣ TERMS_OF_ACCESS + ┗ readme.md +``` + +## OpenSinger + +The official OpenSinger dataset can be downloaded [here](https://drive.google.com/file/d/1EofoZxvalgMjZqzUEuEdleHIZ6SHtNuK/view). The file structure tree is like: + +```plaintext +[OpenSinger dataset path] + ┣ ManRaw + ┃ ┣ {Singer_1}_{Song_1} + ┃ ┃ ┣ {Singer_1}_{Song_1}_0.lab + ┃ ┃ ┣ {Singer_1}_{Song_1}_0.txt + ┃ ┃ ┣ {Singer_1}_{Song_1}_0.wav + ┃ ┃ ┣ ... 
+ ┃ ┣ {Singer_1}_{Song_2} + ┃ ┣ ... + ┣ WomanRaw + ┣ LICENSE + ┗ README.md +``` + +## Opera + +The official Opera dataset can be downloaded [here](http://isophonics.net/SingingVoiceDataset). The file structure tree is like: + +```plaintext +[Opera dataset path] + ┣ monophonic + ┃ ┣ chinese + ┃ ┃ ┣ {Gender}_{SingerID} + ┃ ┃ ┃ ┣ {Emotion}_{SongID}.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┣ ... + ┃ ┣ western + ┣ polyphonic + ┃ ┣ chinese + ┃ ┣ western + ┣ CrossculturalDataSet.xlsx +``` + +## PopBuTFy + +The official PopBuTFy dataset can be downloaded [here](https://github.com/MoonInTheRiver/NeuralSVB). The file structure tree is like: + +```plaintext +[PopBuTFy dataset path] + ┣ data + ┃ ┣ {SingerID}#singing#{SongName}_Amateur + ┃ ┃ ┣ {SingerID}#singing#{SongName}_Amateur_{UtteranceID}.mp3 + ┃ ┃ ┣ ... + ┃ ┣ {SingerID}#singing#{SongName}_Professional + ┃ ┃ ┣ {SingerID}#singing#{SongName}_Professional_{UtteranceID}.mp3 + ┃ ┃ ┣ ... + ┣ text_labels + ┗ TERMS_OF_ACCESS +``` + +## PopCS + +The official PopCS dataset can be downloaded [here](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md). The file structure tree is like: + +```plaintext +[PopCS dataset path] + ┣ popcs + ┃ ┣ popcs-{SongName} + ┃ ┃ ┣ {UtteranceID}_ph.txt + ┃ ┃ ┣ {UtteranceID}_wf0.wav + ┃ ┃ ┣ {UtteranceID}.TextGrid + ┃ ┃ ┣ {UtteranceID}.txt + ┃ ┃ ┣ ... + ┃ ┣ ... + ┗ TERMS_OF_ACCESS +``` + +## PJS + +The official PJS dataset can be downloaded [here](https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus). The file structure tree is like: + +```plaintext +[PJS dataset path] + ┣ PJS_corpus_ver1.1 + ┃ ┣ background_noise + ┃ ┣ pjs{SongID} + ┃ ┃ ┣ pjs{SongID}_song.wav + ┃ ┃ ┣ pjs{SongID}_speech.wav + ┃ ┃ ┣ pjs{SongID}.lab + ┃ ┃ ┣ pjs{SongID}.mid + ┃ ┃ ┣ pjs{SongID}.musicxml + ┃ ┃ ┣ pjs{SongID}.txt + ┃ ┣ ... +``` + +## SVCC + +The official SVCC dataset can be downloaded [here](https://github.com/lesterphillip/SVCC23_FastSVC/tree/main/egs/generate_dataset). The file structure tree is like: + +```plaintext +[SVCC dataset path] + ┣ Data + ┃ ┣ CDF1 + ┃ ┃ ┣ 10001.wav + ┃ ┃ ┣ 10002.wav + ┃ ┃ ┣ ... + ┃ ┣ CDM1 + ┃ ┣ IDF1 + ┃ ┣ IDM1 + ┗ README.md +``` + +## VCTK + +The official VCTK dataset can be downloaded [here](https://datashare.ed.ac.uk/handle/10283/3443). The file structure tree is like: + +```plaintext +[VCTK dataset path] + ┣ txt + ┃ ┣ {Speaker_1} + ┃ ┃ ┣ {Speaker_1}_001.txt + ┃ ┃ ┣ {Speaker_1}_002.txt + ┃ ┃ ┣ ... + ┃ ┣ {Speaker_2} + ┃ ┣ ... + ┣ wav48_silence_trimmed + ┃ ┣ {Speaker_1} + ┃ ┃ ┣ {Speaker_1}_001_mic1.flac + ┃ ┃ ┣ {Speaker_1}_001_mic2.flac + ┃ ┃ ┣ {Speaker_1}_002_mic1.flac + ┃ ┃ ┣ ... + ┃ ┣ {Speaker_2} + ┃ ┣ ... 
+ ┣ speaker-info.txt
+ ┗ update.txt
+```
diff --git a/egs/metrics/README.md b/egs/metrics/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..882b31365cdd69b381cc8b9bef77509e94c9deb9
--- /dev/null
+++ b/egs/metrics/README.md
@@ -0,0 +1,94 @@
+# Amphion Evaluation Recipe
+
+## Supported Evaluation Metrics
+
+So far, Amphion Evaluation supports the following objective metrics:
+
+- **F0 Modeling**:
+  - F0 Pearson Coefficients (FPC)
+  - F0 Periodicity Root Mean Square Error (PeriodicityRMSE)
+  - F0 Root Mean Square Error (F0RMSE)
+  - Voiced/Unvoiced F1 Score (V/UV F1)
+- **Energy Modeling**:
+  - Energy Root Mean Square Error (EnergyRMSE)
+  - Energy Pearson Coefficients (EnergyPC)
+- **Intelligibility**:
+  - Character Error Rate (CER) based on [Whisper](https://github.com/openai/whisper)
+  - Word Error Rate (WER) based on [Whisper](https://github.com/openai/whisper)
+- **Spectrogram Distortion**:
+  - Frechet Audio Distance (FAD)
+  - Mel Cepstral Distortion (MCD)
+  - Multi-Resolution STFT Distance (MSTFT)
+  - Perceptual Evaluation of Speech Quality (PESQ)
+  - Short Time Objective Intelligibility (STOI)
+  - Scale Invariant Signal to Distortion Ratio (SISDR)
+  - Scale Invariant Signal to Noise Ratio (SISNR)
+- **Speaker Similarity**:
+  - Cosine similarity based on [RawNet3](https://github.com/Jungjee/RawNet)
+  - Cosine similarity based on [WeSpeaker](https://github.com/wenet-e2e/wespeaker) (👨‍💻 developing)
+
+We provide a recipe to demonstrate how to objectively evaluate your generated audios. There are three steps in total:
+
+1. Pretrained Models Preparation
+2. Audio Data Preparation
+3. Evaluation
+
+## 1. Pretrained Models Preparation
+
+If you want to calculate `RawNet3` based speaker similarity, you need to download the pretrained model first, as illustrated [here](../../pretrained/README.md).
+
+## 2. Audio Data Preparation
+
+Prepare the reference audios and the generated audios in two folders: `ref_dir` contains the reference audios and `gen_dir` contains the generated audios. Here is an example.
+
+```plaintext
+ ┣ {ref_dir}
+ ┃ ┣ sample1.wav
+ ┃ ┣ sample2.wav
+ ┣ {gen_dir}
+ ┃ ┣ sample1.wav
+ ┃ ┣ sample2.wav
+```
+
+You have to make sure that each pair of **reference audio and generated audio are named the same**, as illustrated above (sample1 to sample1, sample2 to sample2).
+
+## 3. Evaluation
+
+Run the `run.sh` with the specified reference folder, generated folder, dump folder and metrics.
+
+```bash
+cd Amphion
+sh egs/metrics/run.sh \
+    --reference_folder [Your path to the reference audios] \
+    --generated_folder [Your path to the generated audios] \
+    --dump_folder [Your path to dump the objective results] \
+    --metrics [The metrics you need] \
+    --fs [Optional. To calculate all metrics in the specified sampling rate]
+```
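+
+If you want to see what such a paired evaluation looks like in plain Python, here is a small self-contained sketch. It is **not** `bins/calc_metrics.py` (which `run.sh` calls); it only illustrates the filename-pairing convention above, using SI-SDR as an example metric and assuming mono `.wav` files:
+
+```python
+# Illustrative only: pair audios by filename and score each pair with SI-SDR.
+import os
+
+import numpy as np
+import soundfile as sf
+
+
+def si_sdr(ref: np.ndarray, est: np.ndarray) -> float:
+    """Scale-Invariant SDR (dB) between a reference and an estimated waveform."""
+    n = min(len(ref), len(est))
+    ref, est = ref[:n] - ref[:n].mean(), est[:n] - est[:n].mean()
+    alpha = np.dot(est, ref) / (np.dot(ref, ref) + 1e-8)
+    target, noise = alpha * ref, est - alpha * ref
+    return 10 * np.log10(np.dot(target, target) / (np.dot(noise, noise) + 1e-8))
+
+
+def evaluate(ref_dir: str, gen_dir: str) -> dict:
+    """Pair files by name (sample1.wav <-> sample1.wav) and compute one score per pair."""
+    scores = {}
+    for name in sorted(os.listdir(ref_dir)):
+        ref, _ = sf.read(os.path.join(ref_dir, name))
+        gen, _ = sf.read(os.path.join(gen_dir, name))  # same filename in gen_dir
+        scores[name] = si_sdr(ref, gen)
+    return scores
+```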
+
+As for the metrics, an example is provided below:
+
+```bash
+--metrics "mcd pesq fad"
+```
+
+All currently available metrics keywords are listed below:
+
+| Keys                  | Description                                |
+| --------------------- | ------------------------------------------ |
+| `fpc`                 | F0 Pearson Coefficients                    |
+| `f0_periodicity_rmse` | F0 Periodicity Root Mean Square Error      |
+| `f0rmse`              | F0 Root Mean Square Error                  |
+| `v_uv_f1`             | Voiced/Unvoiced F1 Score                   |
+| `energy_rmse`         | Energy Root Mean Square Error              |
+| `energy_pc`           | Energy Pearson Coefficients                |
+| `cer`                 | Character Error Rate                       |
+| `wer`                 | Word Error Rate                            |
+| `speaker_similarity`  | Cosine Similarity based on RawNet3         |
+| `fad`                 | Frechet Audio Distance                     |
+| `mcd`                 | Mel Cepstral Distortion                    |
+| `mstft`               | Multi-Resolution STFT Distance             |
+| `pesq`                | Perceptual Evaluation of Speech Quality    |
+| `si_sdr`              | Scale Invariant Signal to Distortion Ratio |
+| `si_snr`              | Scale Invariant Signal to Noise Ratio      |
+| `stoi`                | Short Time Objective Intelligibility       |
diff --git a/egs/metrics/run.sh b/egs/metrics/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6778f36454b788938befd19dca1bf2ef466f4cf9
--- /dev/null
+++ b/egs/metrics/run.sh
@@ -0,0 +1,42 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+######## Build Experiment Environment ###########
+exp_dir=$(cd `dirname $0`; pwd)
+work_dir=$(dirname $(dirname $exp_dir))
+
+export WORK_DIR=$work_dir
+export PYTHONPATH=$work_dir
+export PYTHONIOENCODING=UTF-8
+
+######## Parse the Given Parameters from the Command ###########
+options=$(getopt -o c:n:s --long gpu:,reference_folder:,generated_folder:,dump_folder:,metrics:,fs: -- "$@")
+eval set -- "$options"
+
+while true; do
+  case $1 in
+    # Visible GPU machines
+    --gpu) shift; gpu=$1 ; shift ;;
+    # Reference Audio Folder
+    --reference_folder) shift; ref_dir=$1 ; shift ;;
+    # Generated Audio Folder
+    --generated_folder) shift; deg_dir=$1 ; shift ;;
+    # Result Dumping Folder
+    --dump_folder) shift; dump_dir=$1 ; shift ;;
+    # Metrics to Compute
+    --metrics) shift; metrics=$1 ; shift ;;
+    # Sampling Rate
+    --fs) shift; fs=$1 ; shift ;;
+
+    --) shift ; break ;;
+    *) echo "Invalid option: $1"; exit 1 ;;
+  esac
+done
+
+######## Calculate Objective Metrics ###########
+CUDA_VISIBLE_DEVICES=$gpu python "$work_dir"/bins/calc_metrics.py \
+  --ref_dir $ref_dir \
+  --deg_dir $deg_dir \
+  --dump_dir $dump_dir \
+  --metrics $metrics \
+  --fs $fs
\ No newline at end of file
diff --git a/egs/svc/DiffComoSVC/README.md b/egs/svc/DiffComoSVC/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..aecf0d33fdb31f566339a1a29926457541efe8a3
--- /dev/null
+++ b/egs/svc/DiffComoSVC/README.md
@@ -0,0 +1,234 @@
+# Accelerating Diffusion-based Singing Voice Conversion through Consistency Distillation
+
+*(Figure: overall architecture of DiffComoSVC)*
+
+This is an implementation of [Consistency Models](https://arxiv.org/abs/2303.01469) for accelerating diffusion-based singing voice conversion. The overall architecture follows "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio), with only a slight modification applied to the acoustic model. Specifically,
+
+* The acoustic model consists of a conformer, which generates a coarse spectrogram, and a diffusion decoder based on a Bidirectional Non-Causal Dilated CNN, which refines the coarse spectrogram. This is similar to [CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency Model](https://comospeech.github.io/).
+* To accelerate the diffusion model, we apply consistency distillation from [Consistency Models](https://arxiv.org/abs/2303.01469). For the teacher model, the diffusion schedule of the diffusion decoder follows [karras diffusion](https://arxiv.org/abs/2206.00364). When distilling the teacher model, the condition encoder and the conformer part of the acoustic model are frozen, while the diffusion decoder is updated via exponential moving average. See the figure above for details.
+
+There are five stages in total:
+
+1. Data preparation
+2. Features extraction
+3. Teacher Model Training
+4. Consistency Distillation
+5. Inference/conversion
+
+## 1. Data Preparation
+
+### Dataset Download
+
+By default, we utilize five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
+
+### Configuration
+
+Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
+
+```json
+    "dataset": [
+        "m4singer",
+        "opencpop",
+        "opensinger",
+        "svcc",
+        "vctk"
+    ],
+    "dataset_path": {
+        // TODO: Fill in your dataset path
+        "m4singer": "[M4Singer dataset path]",
+        "opencpop": "[Opencpop dataset path]",
+        "opensinger": "[OpenSinger dataset path]",
+        "svcc": "[SVCC dataset path]",
+        "vctk": "[VCTK dataset path]"
+    },
+```
+
+## 2. Features Extraction
+
+### Content-based Pretrained Models Download
+
+By default, we utilize Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
+
+### Configuration
+
+Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
+
+```json
+    // TODO: Fill in the output log path
+    "log_dir": "[Your path to save logs and checkpoints]",
+    "preprocess": {
+        // TODO: Fill in the output data path
+        "processed_dir": "[Your path to save processed data]",
+        ...
+    },
+```
+
+### Run
+
+Run the `run.sh` as the preprocessing stage (set `--stage 1`).
+
+```bash
+cd Amphion
+sh egs/svc/DiffComoSVC/run.sh --stage 1
+```
+
+Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying such as `--gpu "1"`.
+
+## 3. Teacher Model Training
+
+### Configuration
+
+Set `distill` in `config/comosvc.json` to `false` for teacher model training. You can also specify the detailed configuration for the conformer encoder and the diffusion process here:
+
+```JSON
+"comosvc":{
+    "distill": false,
+    // conformer encoder
+    "input_dim": 384,
+    "output_dim": 100,
+    "n_heads": 2,
+    "n_layers": 6,
+    "filter_channels":512,
+    // karras diffusion
+    "P_mean": -1.2,
+    "P_std": 1.2,
+    "sigma_data": 0.5,
+    "sigma_min": 0.002,
+    "sigma_max": 80,
+    "rho": 7,
+    "n_timesteps": 40,
+},
+```
+
+We provide the default hyperparameters in the `exp_config.json`. They can work on a single NVIDIA-24g GPU. You can adjust them based on your GPU machines.
+
+```json
+"train": {
+        "batch_size": 32,
+        ...
+        "adamw": {
+            "lr": 2.0e-4
+        },
+        ...
+    }
+```
+
+### Run
+
+Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.
+
+```bash
+cd Amphion
+sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
+```
+
+Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can specify it when running `run.sh` such as:
+
+```bash
+cd Amphion
+sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
+```
+
+## 4. Consistency Distillation
+
+### Configuration
+
+Set `distill` in `config/comosvc.json` to `true` for consistency distillation, and specify the `teacher_model_path`. You can also specify the detailed configuration for the conformer encoder and the diffusion process here:
+
+```JSON
+"model": {
+    "teacher_model_path":"[Your_teacher_model_checkpoint].bin",
+    ...
+    "comosvc":{
+        "distill": true,
+        // conformer encoder
+        "input_dim": 384,
+        "output_dim": 100,
+        "n_heads": 2,
+        "n_layers": 6,
+        "filter_channels":512,
+        // karras diffusion
+        "P_mean": -1.2,
+        "P_std": 1.2,
+        "sigma_data": 0.5,
+        "sigma_min": 0.002,
+        "sigma_max": 80,
+        "rho": 7,
+        "n_timesteps": 40,
+    },
+```
+
+We provide the default hyperparameters in the `exp_config.json`. They can work on a single NVIDIA-24g GPU. You can adjust them based on your GPU machines.
+
+```json
+"train": {
+        "batch_size": 32,
+        ...
+        "adamw": {
+            "lr": 2.0e-4
+        },
+        ...
+    }
+```
+
+### Run
+
+Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.
+
+```bash
+cd Amphion
+sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
+```
+
+Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can specify it when running `run.sh` such as:
+
+```bash
+cd Amphion
+sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
+```
+
+## 5. Inference/Conversion
+
+### Pretrained Vocoder Download
+
+We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
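+
+Before running inference, it can help to see what the teacher model's `inference_steps` (see the `inference` block in `exp_config.json` below) and the karras diffusion settings above (`sigma_min`, `sigma_max`, `rho`) correspond to. The following standalone sketch (illustrative only, not Amphion's training or inference code) computes the noise-level schedule defined in [karras diffusion](https://arxiv.org/abs/2206.00364); the distilled student model skips this multi-step schedule and samples in one step:
+
+```python
+# Illustrative sketch of the karras diffusion noise schedule used by the teacher.
+import numpy as np
+
+
+def karras_sigmas(n_steps=40, sigma_min=0.002, sigma_max=80.0, rho=7.0):
+    """Noise levels from sigma_max down to sigma_min, spaced as in Karras et al. (2022)."""
+    ramp = np.linspace(0, 1, n_steps)
+    min_inv, max_inv = sigma_min ** (1 / rho), sigma_max ** (1 / rho)
+    return (max_inv + ramp * (min_inv - max_inv)) ** rho
+
+
+sigmas = karras_sigmas(n_steps=40)  # matches "n_timesteps" / "inference_steps": 40
+print(sigmas[0], sigmas[-1])        # starts at 80.0, ends at 0.002
+```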
+ +### Run + +For inference/conversion, you need to specify the following configurations when running `run.sh`: + +| Parameters | Description | Example | +| --------------------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` | +| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). | +| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. | +| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshfit"` (by default), `3`, `-3`, etc. | + +For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run: + +```bash +cd Amphion +sh egs/svc/DiffComoSVC/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir [Your path to save logs and checkpoints]/[YourExptName] \ + --infer_output_dir [Your path to save logs and checkpoints]/[YourExptName]/result \ + --infer_source_audio_dir [Your Audios Folder] \ + --infer_target_speaker "opencpop_female1" \ + --infer_key_shift "autoshift" +``` +Specially, you can configurate the inference steps for teacher model by setting `inference` at `exp_config`(student model is always one-step sampling): +```json + "inference": { + "comosvc": { + "inference_steps": 40 + } + } +``` + +# Reference +https://github.com/zhenye234/CoMoSpeech + +https://github.com/openai/consistency_models \ No newline at end of file diff --git a/egs/svc/DiffComoSVC/exp_config.json b/egs/svc/DiffComoSVC/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..37f977f21a09ea64eb218388b38fe1c1f7350a04 --- /dev/null +++ b/egs/svc/DiffComoSVC/exp_config.json @@ -0,0 +1,143 @@ +{ + "base_config": "config/comosvc.json", + "model_type": "DiffComoSVC", + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, + // TODO: Fill in the output log path + "log_dir": "[Your path to save logs and checkpoints]", + "preprocess": { + // TODO: Fill in the output data path + "processed_dir": "[Your path to save processed data]", + // Config for features extraction + "extract_mel": true, + "extract_pitch": true, + "extract_energy": true, + "extract_whisper_feature": true, + "extract_contentvec_feature": true, + "extract_wenet_feature": false, + "whisper_batch_size": 30, // decrease it if your GPU is out of memory + "contentvec_batch_size": 1, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": 
"pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + // Config for features usage + "use_mel": true, + "use_min_max_norm_mel": true, + "use_frame_pitch": true, + "use_frame_energy": true, + "use_spkid": true, + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "n_mel": 100, + "sample_rate": 24000 + }, + "model": { + "teacher_model_path":"[Your_teacher_model_checkpoint].bin", + "condition_encoder": { + // Config for features usage + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + "use_singer_encoder": false, + "pitch_min": 50, + "pitch_max": 1100 + }, + "comosvc":{ + "distill": false, + // conformer encoder + "input_dim": 384, + "output_dim": 100, + "n_heads": 2, + "n_layers": 6, + "filter_channels":512, + "dropout":0.1, + // karras diffusion + "P_mean": -1.2, + "P_std": 1.2, + "sigma_data": 0.5, + "sigma_min": 0.002, + "sigma_max": 80, + "rho": 7, + "n_timesteps": 40, + }, + "diffusion": { + // Diffusion steps encoder + "step_encoder": { + "dim_raw_embedding": 128, + "dim_hidden_layer": 512, + "activation": "SiLU", + "num_layer": 2, + "max_period": 10000 + }, + // Diffusion decoder + "model_type": "bidilconv", + // bidilconv, unet2d, TODO: unet1d + "bidilconv": { + "base_channel": 384, + "n_res_block": 20, + "conv_kernel_size": 3, + "dilation_cycle_length": 4, + // specially, 1 means no dilation + "conditioner_size": 100 + } + } + }, + "train": { + "batch_size": 64, + "gradient_accumulation_step": 1, + "max_epoch": -1, // -1 means no limit + "save_checkpoint_stride": [ + 50, + 50 + ], + "keep_last": [ + 5, + -1 + ], + "run_eval": [ + false, + true + ], + "adamw": { + "lr": 4.0e-4 + }, + "reducelronplateau": { + "factor": 0.8, + "patience": 10, + "min_lr": 1.0e-4 + }, + "dataloader": { + "num_worker": 8, + "pin_memory": true + }, + "sampler": { + "holistic_shuffle": false, + "drop_last": true + } + }, + "inference": { + "comosvc": { + "inference_steps": 40 + } + } +} \ No newline at end of file diff --git a/egs/svc/DiffComoSVC/run.sh b/egs/svc/DiffComoSVC/run.sh new file mode 120000 index 0000000000000000000000000000000000000000..f8daac3da463c177e36cdf041342566cc4243257 --- /dev/null +++ b/egs/svc/DiffComoSVC/run.sh @@ -0,0 +1 @@ +../_template/run.sh \ No newline at end of file diff --git a/egs/svc/MultipleContentsSVC/README.md b/egs/svc/MultipleContentsSVC/README.md new file mode 100755 index 0000000000000000000000000000000000000000..ac999e6253076f79ca59ed05fac168e5679feaea --- /dev/null +++ b/egs/svc/MultipleContentsSVC/README.md @@ -0,0 +1,153 @@ +# Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion + +[![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2310.11160) +[![demo](https://img.shields.io/badge/SVC-Demo-red)](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html) + +
+ +This is the official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Specially, + +- The muptile content features are from [Whipser](https://github.com/wenet-e2e/wenet) and [ContentVec](https://github.com/auspicious3000/contentvec). +- The acoustic model is based on Bidirectional Non-Causal Dilated CNN (called `DiffWaveNetSVC` in Amphion), which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219). +- The vocoder is [BigVGAN](https://github.com/NVIDIA/BigVGAN) architecture and we fine-tuned it in over 120 hours singing voice data. + +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference/conversion + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download + +By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md). + +### Configuration + +Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, +``` + +## 2. Features Extraction + +### Content-based Pretrained Models Download + +By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md). + +### Configuration + +Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + ... + }, +``` + +### Run + +Run the `run.sh` as the preproces stage (set `--stage 1`). + +```bash +sh egs/svc/MultipleContentsSVC/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + +## 3. Training + +### Configuration + +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on you GPU machines. + +```json +"train": { + "batch_size": 32, + ... + "adamw": { + "lr": 2.0e-4 + }, + ... + } +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`. + +```bash +sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName] +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + +## 4. 
Inference/Conversion + +### Pretrained Vocoder Download + +We fine-tune the official BigVGAN pretrained model with over 120 hours singing voice data. The benifits of fine-tuning has been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`). + +### Run + +For inference/conversion, you need to specify the following configurations when running `run.sh`: + +| Parameters | Description | Example | +| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` | +| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). | +| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. | +| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshfit"` (by default), `3`, `-3`, etc. | + +For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run: + +```bash +sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \ + --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \ + --infer_source_audio_dir [Your Audios Folder] \ + --infer_target_speaker "opencpop_female1" \ + --infer_key_shift "autoshift" +``` + +## Citations + +```bibtex +@article{zhang2023leveraging, + title={Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion}, + author={Zhang, Xueyao and Gu, Yicheng and Chen, Haopeng and Fang, Zihao and Zou, Lexiao and Xue, Liumeng and Wu, Zhizheng}, + journal={Machine Learning for Audio Worshop, NeurIPS 2023}, + year={2023} +} +``` diff --git a/egs/svc/MultipleContentsSVC/exp_config.json b/egs/svc/MultipleContentsSVC/exp_config.json new file mode 100755 index 0000000000000000000000000000000000000000..7047855abd18c25760fcdd46ec63da5c4b7ad8ba --- /dev/null +++ b/egs/svc/MultipleContentsSVC/exp_config.json @@ -0,0 +1,126 @@ +{ + "base_config": "config/diffusion.json", + "model_type": "DiffWaveNetSVC", + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, + // TODO: Fill in the output log path. 
The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + // Config for features extraction + "extract_mel": true, + "extract_pitch": true, + "extract_energy": true, + "extract_whisper_feature": true, + "extract_contentvec_feature": true, + "extract_wenet_feature": false, + "whisper_batch_size": 30, // decrease it if your GPU is out of memory + "contentvec_batch_size": 1, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + // Config for features usage + "use_mel": true, + "use_min_max_norm_mel": true, + "use_frame_pitch": true, + "use_frame_energy": true, + "use_spkid": true, + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "n_mel": 100, + "sample_rate": 24000 + }, + "model": { + "condition_encoder": { + // Config for features usage + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + "use_singer_encoder": false, + "pitch_min": 50, + "pitch_max": 1100 + }, + "diffusion": { + "scheduler": "ddpm", + "scheduler_settings": { + "num_train_timesteps": 1000, + "beta_start": 1.0e-4, + "beta_end": 0.02, + "beta_schedule": "linear" + }, + // Diffusion steps encoder + "step_encoder": { + "dim_raw_embedding": 128, + "dim_hidden_layer": 512, + "activation": "SiLU", + "num_layer": 2, + "max_period": 10000 + }, + // Diffusion decoder + "model_type": "bidilconv", + // bidilconv, unet2d, TODO: unet1d + "bidilconv": { + "base_channel": 512, + "n_res_block": 40, + "conv_kernel_size": 3, + "dilation_cycle_length": 4, + // specially, 1 means no dilation + "conditioner_size": 384 + } + } + }, + "train": { + "batch_size": 32, + "gradient_accumulation_step": 1, + "max_epoch": -1, // -1 means no limit + "save_checkpoint_stride": [ + 3, + 50 + ], + "keep_last": [ + 3, + 2 + ], + "run_eval": [ + true, + true + ], + "adamw": { + "lr": 2.0e-4 + }, + "reducelronplateau": { + "factor": 0.8, + "patience": 30, + "min_lr": 1.0e-4 + }, + "dataloader": { + "num_worker": 8, + "pin_memory": true + }, + "sampler": { + "holistic_shuffle": false, + "drop_last": true + } + } +} \ No newline at end of file diff --git a/egs/svc/MultipleContentsSVC/run.sh b/egs/svc/MultipleContentsSVC/run.sh new file mode 120000 index 0000000000000000000000000000000000000000..f8daac3da463c177e36cdf041342566cc4243257 --- /dev/null +++ b/egs/svc/MultipleContentsSVC/run.sh @@ -0,0 +1 @@ +../_template/run.sh \ No newline at end of file diff --git a/egs/svc/README.md b/egs/svc/README.md new file mode 100755 index 0000000000000000000000000000000000000000..3207ecd790a614b850260086f6a99486fe33165e --- /dev/null +++ b/egs/svc/README.md @@ -0,0 +1,34 @@ +# Amphion Singing Voice Conversion (SVC) Recipe + +## Quick Start + +We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). 
Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html). + +## Supported Model Architectures + +The main idea of SVC is to first disentangle the speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target, which usually utilizes an acoustic decoder and a subsequent waveform synthesizer (vocoder): + +
+*(Figure: SVC pipeline)*
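+
+As a complementary illustration of this pipeline, here is a small, self-contained Python sketch. Every function in it is a hypothetical placeholder (random projections standing in for real models); it only shows how content features, prosody, and the target speaker embedding are combined and passed through an acoustic decoder and a vocoder, and is not Amphion's actual API:
+
+```python
+# Hypothetical, self-contained sketch of the SVC flow; random projections stand in
+# for real models. It mirrors the pipeline described above, not Amphion's code.
+import numpy as np
+
+rng = np.random.default_rng(0)
+N_FRAMES = 100  # pretend the source audio has 100 frames
+
+
+def extract_content(audio):
+    """Speaker-agnostic content features (WeNet / Whisper / ContentVec in Amphion)."""
+    return rng.standard_normal((N_FRAMES, 256))
+
+
+def extract_prosody(audio):
+    """Frame-level prosody features: F0 and energy."""
+    return rng.standard_normal((N_FRAMES, 2))
+
+
+def speaker_embedding(speaker_id, dim=192):
+    """Target speaker embedding, e.g. from a speaker look-up table."""
+    return rng.standard_normal(dim)
+
+
+def acoustic_decoder(condition):
+    """Predicts acoustic features (e.g. a 100-bin mel spectrogram) from the condition."""
+    return condition @ rng.standard_normal((condition.shape[-1], 100))
+
+
+def vocoder(mel, hop_size=256):
+    """Waveform synthesizer (e.g. BigVGAN); here it just returns a dummy signal."""
+    return rng.standard_normal(mel.shape[0] * hop_size)
+
+
+source_audio = rng.standard_normal(24000)             # stand-in for the source singing
+condition = np.concatenate(
+    [
+        extract_content(source_audio),                 # what is sung
+        extract_prosody(source_audio),                 # how it is sung
+        np.tile(speaker_embedding(7), (N_FRAMES, 1)),  # who should sing it
+    ],
+    axis=-1,
+)
+mel = acoustic_decoder(condition)                      # acoustic decoder
+converted_wav = vocoder(mel)                           # vocoder
+print(mel.shape, converted_wav.shape)                  # (100, 100) (25600,)
+```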
+ +Until now, Amphion SVC has supported the following features and models: + +- **Speaker-agnostic Representations**: + - Content Features: Sourcing from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec). + - Prosody Features: F0 and energy. +- **Speaker Embeddings**: + - Speaker Look-Up Table. + - Reference Encoder (👨‍💻 developing): It can be used for zero-shot SVC. +- **Acoustic Decoders**: + - Diffusion-based models: + - **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219). + - **[DiffComoSVC](DiffComoSVC)** (👨‍💻 developing): The diffusion framework is based on [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model. + - Transformer-based models: + - **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture. + - VAE- and Flow-based models: + - **[VitsSVC](VitsSVC)**: It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc). +- **Waveform Synthesizers (Vocoders)**: + - The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md). diff --git a/egs/svc/TransformerSVC/README.md b/egs/svc/TransformerSVC/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1797e32f310994ce4c0a1ff3e7789b397b358907 --- /dev/null +++ b/egs/svc/TransformerSVC/README.md @@ -0,0 +1,164 @@ +# Transformer for Singing Voice Conversion + +This is an implementation of **vanilla transformer encoder**/**conformer** as acoustic model for singing voice conversion. + +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference/conversion + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download + +By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md). + +### Configuration + +Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, +``` + +## 2. Features Extraction + +### Content-based Pretrained Models Download + +By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md). + +### Configuration + +Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. 
The default value is "Amphion/data" + "processed_dir": "data", + ... + }, +``` + +### Run + +Run the `run.sh` as the preproces stage (set `--stage 1`). + +```bash +sh egs/svc/TransformerSVC/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + +## 3. Training + +### Configuration +Specify the detailed configuration for transformer block in `exp_config.json`. For key `type`, `conformer` and `transformer` are supported: +```json +"model": { + ... + "transformer":{ + // 'conformer' or 'transformer' + "type": "conformer", + "input_dim": 384, + "output_dim": 100, + "n_heads": 2, + "n_layers": 6, + "filter_channels":512, + "dropout":0.1, + } + } +``` +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on you GPU machines. + +```json +"train": { + "batch_size": 32, + ... + "adamw": { + "lr": 2.0e-4 + }, + ... + } +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`. + +```bash +sh egs/svc/TransformerSVC/run.sh --stage 2 --name [YourExptName] +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + +## 4. Inference/Conversion + +### Pretrained Vocoder Download + +We fine-tune the official BigVGAN pretrained model with over 120 hours singing voice data. The benifits of fine-tuning has been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`). + +### Run + +For inference/conversion, you need to specify the following configurations when running `run.sh`: + +| Parameters | Description | Example | +| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` | +| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). | +| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. | +| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshfit"` (by default), `3`, `-3`, etc. 
| + +For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run: + +```bash +cd Amphion +sh egs/svc/TransformerSVC/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \ + --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \ + --infer_source_audio_dir [Your Audios Folder] \ + --infer_target_speaker "opencpop_female1" \ + --infer_key_shift "autoshift" +``` + +## Citations + +```bibtex +@inproceedings{transformer, + author = {Ashish Vaswani and + Noam Shazeer and + Niki Parmar and + Jakob Uszkoreit and + Llion Jones and + Aidan N. Gomez and + Lukasz Kaiser and + Illia Polosukhin}, + title = {Attention is All you Need}, + booktitle = {{NIPS}}, + pages = {5998--6008}, + year = {2017} +} +``` \ No newline at end of file diff --git a/egs/svc/TransformerSVC/exp_config.json b/egs/svc/TransformerSVC/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4ad85fbe47dd4945e579200a7e5d82a03a02457b --- /dev/null +++ b/egs/svc/TransformerSVC/exp_config.json @@ -0,0 +1,108 @@ +{ + "base_config": "config/transformer.json", + "model_type": "TransformerSVC", + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + // Config for features extraction + "extract_mel": true, + "extract_pitch": true, + "extract_energy": true, + "extract_whisper_feature": true, + "extract_contentvec_feature": true, + "extract_wenet_feature": false, + "whisper_batch_size": 30, // decrease it if your GPU is out of memory + "contentvec_batch_size": 1, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + // Config for features usage + "use_mel": true, + "use_min_max_norm_mel": true, + "use_frame_pitch": true, + "use_frame_energy": true, + "use_spkid": true, + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "n_mel": 100, + "sample_rate": 24000 + }, + "model": { + "condition_encoder": { + // Config for features usage + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + "use_singer_encoder": false, + "pitch_min": 50, + "pitch_max": 1100 + }, + "transformer": { + // 'conformer' or 'transformer' + "type": "conformer", + "input_dim": 384, + "output_dim": 100, + "n_heads": 2, + "n_layers": 6, + "filter_channels": 512, + "dropout": 0.1, + } + }, + "train": { + "batch_size": 64, + "gradient_accumulation_step": 1, + "max_epoch": -1, // -1 means no limit + "save_checkpoint_stride": [ + 50, + 50 + ], + "keep_last": [ + 5, + -1 + ], + "run_eval": [ + false, + true + ], + "adamw": { + "lr": 4.0e-4 + }, + "reducelronplateau": { + "factor": 0.8, + "patience": 10, + "min_lr": 1.0e-4 + }, + "dataloader": { + 
"num_worker": 8, + "pin_memory": true + }, + "sampler": { + "holistic_shuffle": false, + "drop_last": true + } + } +} \ No newline at end of file diff --git a/egs/svc/TransformerSVC/run.sh b/egs/svc/TransformerSVC/run.sh new file mode 120000 index 0000000000000000000000000000000000000000..f8daac3da463c177e36cdf041342566cc4243257 --- /dev/null +++ b/egs/svc/TransformerSVC/run.sh @@ -0,0 +1 @@ +../_template/run.sh \ No newline at end of file diff --git a/egs/svc/VitsSVC/README.md b/egs/svc/VitsSVC/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6dc81ee9486a9a926bd132ff4afc68881ba39895 --- /dev/null +++ b/egs/svc/VitsSVC/README.md @@ -0,0 +1,125 @@ +# VITS for Singing Voice Conversion + +This is an implementation of VITS as acoustic model for end-to-end singing voice conversion. Adapted from [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc), SoftVC content encoder is used to extract content features from the source audio. These feature vectors are directly fed into VITS without the need for conversion to a text-based intermediate representation. + +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference/conversion + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download + +By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md). + +### Configuration + +Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, +``` + +## 2. Features Extraction + +### Content-based Pretrained Models Download + +By default, we utilize ContentVec and Whisper to extract content features. How to download them is detailed [here](../../../pretrained/README.md). + +### Configuration + +Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + ... + }, +``` + +### Run + +Run the `run.sh` as the preproces stage (set `--stage 1`). + +```bash +sh egs/svc/VitsSVC/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + +## 3. Training + +### Configuration + +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on you GPU machines. + +```json +"train": { + "batch_size": 32, + ... + "adamw": { + "lr": 2.0e-4 + }, + ... + } +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`. 
+ +```bash +sh egs/svc/VitsSVC/run.sh --stage 2 --name [YourExptName] +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + +## 4. Inference/Conversion + +### Run + +For inference/conversion, you need to specify the following configurations when running `run.sh`: + +| Parameters | Description | Example | +| --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` | +| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). | +| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. | +| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshfit"` (by default), `3`, `-3`, etc. | + +For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run: + +```bash +sh egs/svc/VitsSVC/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \ + --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \ + --infer_source_audio_dir [Your Audios Folder] \ + --infer_target_speaker "opencpop_female1" \ + --infer_key_shift "autoshift" +``` \ No newline at end of file diff --git a/egs/svc/VitsSVC/exp_config.json b/egs/svc/VitsSVC/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bd3b4481829ce4c43bb2f352e08d3258fca775e9 --- /dev/null +++ b/egs/svc/VitsSVC/exp_config.json @@ -0,0 +1,162 @@ +{ + "base_config": "config/vitssvc.json", + "model_type": "VitsSVC", + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. 
The default value is "Amphion/data" + "processed_dir": "data", + + "f0_min": 50, + "f0_max": 1100, + // f0_bin in sovits + "pitch_bin": 256, + // filter_length in sovits + "n_fft": 2048, + // hop_length in sovits + "hop_size": 512, + // win_length in sovits + "win_size": 2048, + "segment_size": 8192, + "n_mel": 100, + "sample_rate": 44100, + + // Config for features extraction + "extract_mel": true, + "extract_pitch": true, + "pitch_extractor": "parselmouth", + "extract_energy": false, + "extract_uv": true, + "extract_linear_spec": true, + "extract_audio": true, + // contentvec + "extract_contentvec_feature": true, + "contentvec_sample_rate": 16000, + "contentvec_batch_size": 1, + "contentvec_frameshift": 0.02, + // whisper + "extract_whisper_feature": true, + "whisper_sample_rate": 16000, + "whisper_frameshift": 0.01, + "whisper_downsample_rate": 2, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + // Config for features usage + "use_mel": true, + "use_frame_pitch": true, + "use_uv": true, + "use_spkid": true, + "use_contentvec": true, + "use_whisper": true, + "use_text": false, + "use_phone": false, + + // Extract content features using dataloader + "pin_memory": true, + "num_workers": 8, + "content_feature_batch_size": 16, + // Meta file + "train_file": "train.json", + "valid_file": "test.json", + "spk2id": "singers.json", + "utt2spk": "utt2singer" + }, + "model": { + "condition_encoder": { + // Config for features usage + "merge_mode": "add", + "input_melody_dim": 1, + "use_log_f0": true, + "n_bins_melody": 256, + //# Quantization (0 for not quantization) + "output_melody_dim": 192, + + "use_contentvec": true, + "use_whisper": true, + "use_mert": false, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "content_encoder_dim": 192, + "output_singer_dim": 192, + "singer_table_size": 512, + "output_content_dim": 192, + "use_spkid": true, + + "pitch_max": 1100.0, + "pitch_min": 50.0, + }, + "vits": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 256, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "ssl_dim": 256, + "n_flow_layer": 4, + "n_layers_q": 3, + "gin_channels": 256, + "n_speakers": 512, + "use_spectral_norm": false, + }, + "generator": "nsfhifigan", + }, + "train": { + "batch_size": 32, + "learning_rate": 2e-4, + "gradient_accumulation_step": 1, + "max_epoch": -1, // -1 means no limit + "save_checkpoint_stride": [ + 3, + 50 + ], + "keep_last": [ + 3, + 2 + ], + "run_eval": [ + true, + true + ], + "adamw": { + "lr": 2.0e-4 + }, + "reducelronplateau": { + "factor": 0.8, + "patience": 30, + "min_lr": 1.0e-4 + }, + "dataloader": { + "num_worker": 8, + "pin_memory": true + }, + "sampler": { + "holistic_shuffle": false, + "drop_last": true + } + }, + "inference": { + "batch_size": 1, + } +} \ No newline at end of file diff --git a/egs/svc/VitsSVC/run.sh b/egs/svc/VitsSVC/run.sh new file mode 120000 index 0000000000000000000000000000000000000000..f8daac3da463c177e36cdf041342566cc4243257 --- /dev/null +++ b/egs/svc/VitsSVC/run.sh @@ -0,0 +1 @@ +../_template/run.sh \ No newline at end of file diff --git a/egs/svc/_template/run.sh b/egs/svc/_template/run.sh new file mode 100644 index 
0000000000000000000000000000000000000000..8dc870fdef8b1464000021def5627f91d1676bbe --- /dev/null +++ b/egs/svc/_template/run.sh @@ -0,0 +1,150 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] Resume configuration + --resume) shift; resume=$1 ; shift ;; + # [Only for Training] The specific checkpoint path that you want to resume from. + --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac). + --infer_source_file) shift; infer_source_file=$1 ; shift ;; + --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;; + # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1". + --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;; + # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift". + --infer_key_shift) shift; infer_key_shift=$1 ; shift ;; + # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders. 
+ --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \ + --config $exp_config \ + --num_workers 4 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + if [ "$resume" = true ]; then + echo "Automatically resume from the experimental dir..." + CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --resume + else + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --resume_from_ckpt_path "$resume_from_ckpt_path" \ + --resume_type "$resume_type" + fi +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$expt_dir/result" + fi + + if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then + echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)." + exit 1 + fi + + if [ -z "$infer_source_file" ]; then + infer_source=$infer_source_audio_dir + fi + + if [ -z "$infer_source_audio_dir" ]; then + infer_source=$infer_source_file + fi + + if [ -z "$infer_target_speaker" ]; then + echo "[Error] Please specify the target speaker. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1"" + exit 1 + fi + + if [ -z "$infer_key_shift" ]; then + infer_key_shift="autoshift" + fi + + if [ -z "$infer_vocoder_dir" ]; then + infer_vocoder_dir="$work_dir"/pretrained/bigvgan + echo "[Warning] You don't specify the infer_vocoder_dir. It is set $infer_vocoder_dir by default. Make sure that you have followed Amphoion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint." 
+ fi + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \ + --config $exp_config \ + --acoustics_dir $infer_expt_dir \ + --vocoder_dir $infer_vocoder_dir \ + --target_singer $infer_target_speaker \ + --trans_key $infer_key_shift \ + --source $infer_source \ + --output_dir $infer_output_dir \ + --log_level debug +fi \ No newline at end of file diff --git a/egs/tta/README.md b/egs/tta/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d944688768697da9e0cee60445410d6ce115694 --- /dev/null +++ b/egs/tta/README.md @@ -0,0 +1,19 @@ +# Amphion Text-to-Audio (TTA) Recipe + +## Quick Start + +We provide a **[beginner recipe](RECIPE.md)** to demonstrate how to train a cutting edge TTA model. Specifically, it is designed as a latent diffusion model like [AudioLDM](https://arxiv.org/abs/2301.12503), [Make-an-Audio](https://arxiv.org/abs/2301.12661), and [AUDIT](https://arxiv.org/abs/2304.00830). + +## Supported Model Architectures + +Until now, Amphion has supported a latent diffusion based text-to-audio model: + +
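The following is a minimal PyTorch sketch of the two-stage idea behind this model (toy modules and shapes only; this is *not* Amphion's actual `AutoencoderKL` / `AudioLDM` code — an MLP stands in for the U-Net denoiser, a random vector stands in for the T5 text embedding, and the GAN loss used in the real recipe is omitted):

```python
# Toy illustration of the two training stages (hypothetical code, not Amphion's classes).
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyMelVAE(nn.Module):
    """Stage 1: compress a mel-spectrogram into a low-dimensional latent."""
    def __init__(self, n_mels=80, latent_dim=16):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(n_mels, 128), nn.ReLU(), nn.Linear(128, 2 * latent_dim))
        self.decoder = nn.Sequential(nn.Linear(latent_dim, 128), nn.ReLU(), nn.Linear(128, n_mels))

    def forward(self, mel):                                   # mel: (batch, frames, n_mels)
        mu, logvar = self.encoder(mel).chunk(2, dim=-1)
        z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)   # reparameterization trick
        recon = self.decoder(z)
        kl = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
        return recon, z, kl

class ToyLatentDenoiser(nn.Module):
    """Stage 2: predict the noise added to VAE latents, conditioned on a text embedding."""
    def __init__(self, latent_dim=16, text_dim=32):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(latent_dim + text_dim + 1, 256), nn.ReLU(), nn.Linear(256, latent_dim))

    def forward(self, noisy_z, t, text_emb):
        t = t.view(-1, 1, 1).expand(-1, noisy_z.size(1), 1)            # broadcast timestep per frame
        text = text_emb.unsqueeze(1).expand(-1, noisy_z.size(1), -1)   # broadcast text condition
        return self.net(torch.cat([noisy_z, t, text], dim=-1))

# Toy batch: 4 clips, 100 frames, 80 mel bins; 32-dim stand-in for the T5 text embedding.
mel = torch.randn(4, 100, 80)
text_emb = torch.randn(4, 32)

# Stage 1: train the VAE with reconstruction + KL (the real recipe also adds a GAN loss).
vae = ToyMelVAE()
recon, z, kl = vae(mel)
stage1_loss = F.l1_loss(recon, mel) + 1e-4 * kl

# Stage 2: freeze the VAE, add noise to its latents, train the denoiser (epsilon prediction).
denoiser = ToyLatentDenoiser()
with torch.no_grad():
    _, z, _ = vae(mel)
t = torch.rand(4)                                  # toy continuous "timestep" in [0, 1)
alpha = (1 - t).view(-1, 1, 1)                     # toy signal level
noise = torch.randn_like(z)
noisy_z = alpha.sqrt() * z + (1 - alpha).sqrt() * noise
stage2_loss = F.mse_loss(denoiser(noisy_z, t, text_emb), noise)
print(stage1_loss.item(), stage2_loss.item())
```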
+
 +Similar to [AUDIT](https://arxiv.org/abs/2304.00830), we implement it in two-stage training: +1. Training the VAE, which is called `AutoencoderKL` in Amphion. +2. Training the conditional latent diffusion model, which is called `AudioLDM` in Amphion. \ No newline at end of file diff --git a/egs/tta/RECIPE.md b/egs/tta/RECIPE.md new file mode 100644 index 0000000000000000000000000000000000000000..6682921fd88eb20f9c54f4defdfd856b6154c1c2 --- /dev/null +++ b/egs/tta/RECIPE.md @@ -0,0 +1,156 @@ +# Text-to-Audio with Latent Diffusion Model + +This is the quick tour for training a text-to-audio model with the popular and powerful generative model: the [Latent Diffusion Model](https://arxiv.org/abs/2112.10752). Specifically, this recipe is also the official implementation of the text-to-audio generation part of our NeurIPS 2023 paper "[AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models](https://arxiv.org/abs/2304.00830)". You can check the last part of the [AUDIT demos](https://audit-demo.github.io/) to see some text-to-audio examples. + +
+
 +We train this latent diffusion model in two stages: +1. In the first stage, we aim to obtain a high-quality VAE (called `AutoencoderKL` in Amphion) so that we can project +the input mel-spectrograms to an efficient, low-dimensional latent space. Specifically, we train the VAE with a GAN loss to improve the reconstruction quality. +2. In the second stage, we aim to obtain a text-controllable diffusion model (called `AudioLDM` in Amphion). We use a U-Net-based diffusion model and a T5 encoder as the text encoder. + +There are four stages in total for training the text-to-audio model: + +1. Data preparation and processing +2. Train the VAE model +3. Train the latent diffusion model +4. Inference + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash
> cd Amphion
> ```
 + +## Overview + +```sh
# Train the VAE model
sh egs/tta/autoencoderkl/run_train.sh

# Train the latent diffusion model
sh egs/tta/audioldm/run_train.sh

# Inference
sh egs/tta/audioldm/run_inference.sh
``` + +## 1. Data preparation and processing + +### Dataset Download + +We take [AudioCaps](https://audiocaps.github.io/) as an example. AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. You can download the dataset [here](https://github.com/cdjkim/audiocaps). + + + + +### Data Processing + +- Download the AudioCaps dataset to `[Your path to save tta dataset]` and modify `preprocess.processed_dir` in `egs/tta/.../exp_config.json`. + +```json
{
 "dataset": [
 "AudioCaps"
 ],
 "preprocess": {
 // Specify the output root path to save the processed data
 "processed_dir": "[Your path to save tta dataset]",
 ...
 }
}
``` + +The folder structure of your downloaded data should be similar to: + +```plaintext
.../[Your path to save tta dataset]
┣ AudioCaps
┃   ┣ wav
┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
┃ ┃ ┣ ...
``` + +- Then you may process the data into mel-spectrograms and save them in `.npy` format. If you use the data we provide, all the wav data has already been processed. + +- Generate a json file to save the metadata; the json file looks like: + +```json
[
 {
 "Dataset": "AudioCaps",
 "Uid": "---1_cCGK4M_0_10000",
 "Caption": "Idling car, train blows horn and passes"
 },
 {
 "Dataset": "AudioCaps",
 "Uid": "---lTs1dxhU_30000_40000",
 "Caption": "A racing vehicle engine is heard passing by"
 },
 ...
]
``` +- Finally, the folder structure looks like: + +```plaintext
.../[Your path to save tta dataset]
┣ AudioCaps
┃   ┣ wav
┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
┃ ┃ ┣ ...
┃   ┣ mel
┃ ┃ ┣ ---1_cCGK4M_0_10000.npy
┃ ┃ ┣ ---lTs1dxhU_30000_40000.npy
┃ ┃ ┣ ...
┃   ┣ train.json
┃   ┣ valid.json
┃   ┣ ...
``` + +## 2. Training the VAE Model + +The first-stage model is a VAE trained with a GAN loss (called `AutoencoderKL` in Amphion). Run the following commands: + +```sh
sh egs/tta/autoencoderkl/run_train.sh
``` + +## 3. Training the Latent Diffusion Model + +The second-stage model is a conditional diffusion model with a T5 text encoder (called `AudioLDM` in Amphion). Run the following commands: + +```sh
sh egs/tta/audioldm/run_train.sh
``` + +## 4. Inference + +Now you can generate audio with your pre-trained latent diffusion model. Run the command shown below and modify the `text` argument.
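Before running it, the following toy sketch may help build intuition for what the inference script does with the `num_steps` and `guidance_scale` settings in `egs/tta/audioldm/run_inference.sh`: the denoiser is queried with and without the caption embedding and the two predictions are blended (presumably classifier-free guidance). This is a simplified, hypothetical sampler, not the noise scheduler Amphion actually uses:

```python
# Toy sketch of guided sampling in the VAE latent space (hypothetical, simplified).
import torch

def toy_guided_sampling(denoiser, text_emb, null_emb, shape,
                        num_steps=200, guidance_scale=4.0):
    z = torch.randn(shape)                           # start from Gaussian noise
    ts = torch.linspace(0.98, 0.0, num_steps + 1)    # toy time grid, t goes from ~1 to 0
    for i in range(num_steps):
        t, t_next = ts[i], ts[i + 1]
        t_batch = torch.full((shape[0],), float(t))
        eps_cond = denoiser(z, t_batch, text_emb)    # caption-conditioned noise estimate
        eps_uncond = denoiser(z, t_batch, null_emb)  # unconditional noise estimate
        # classifier-free guidance: push towards the caption-conditioned direction
        eps = eps_uncond + guidance_scale * (eps_cond - eps_uncond)
        a, a_next = 1.0 - t, 1.0 - t_next            # toy signal level ("alpha")
        z0 = (z - (1.0 - a).sqrt() * eps) / a.sqrt() # predicted clean latent
        z = a_next.sqrt() * z0 + (1.0 - a_next).sqrt() * eps   # deterministic DDIM-style step
    return z

# Stand-in denoiser with the assumed (latent, t, text_embedding) signature:
dummy_denoiser = lambda z, t, emb: torch.zeros_like(z)
latents = toy_guided_sampling(dummy_denoiser, text_emb=torch.randn(1, 32),
                              null_emb=torch.zeros(1, 32), shape=(1, 50, 16))
```

The sampled latent is then decoded back to a mel-spectrogram by the trained VAE and vocoded to a waveform (the script points to HiFi-GAN vocoder checkpoints).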
+ +```sh +sh egs/tta/audioldm/run_inference.sh \ +--text "A man is whistling" +``` + +## Citations + +```bibtex +@article{wang2023audit, + title={AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models}, + author={Wang, Yuancheng and Ju, Zeqian and Tan, Xu and He, Lei and Wu, Zhizheng and Bian, Jiang and Zhao, Sheng}, + journal={NeurIPS 2023}, + year={2023} +} + +@article{liu2023audioldm, + title={{AudioLDM}: Text-to-Audio Generation with Latent Diffusion Models}, + author={Liu, Haohe and Chen, Zehua and Yuan, Yi and Mei, Xinhao and Liu, Xubo and Mandic, Danilo and Wang, Wenwu and Plumbley, Mark D}, + journal={Proceedings of the International Conference on Machine Learning}, + year={2023} +} +``` \ No newline at end of file diff --git a/egs/tta/audioldm/exp_config.json b/egs/tta/audioldm/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5b57a771593094401fd38747c66951afe6d6d1f3 --- /dev/null +++ b/egs/tta/audioldm/exp_config.json @@ -0,0 +1,90 @@ +{ + "base_config": "egs/tta/audioldm/exp_config_base.json", + "dataset": [ + "AudioCaps" + ], + "preprocess": { + // Specify the output root path to save the processed data + "processed_dir": "data", + // For example: "/home/TTADataset/processed_data" + + // feature + "use_spkid": false, + "use_uv": false, + "use_frame_pitch": false, + "use_phone_pitch": false, + "use_frame_energy": false, + "use_phone_energy": false, + "use_mel": false, + "use_audio": false, + "use_label": false, + "use_one_hot": false, + // feature for text to audio + "use_caption": true, + "use_melspec": true, + "use_wav": false, + // feature dir + "melspec_dir": "mel", + "wav_dir": "wav" + }, + // Specify the output root path to save model ckpts and logs + "log_dir": "ckpts/tta", + // For example: "/home/TTADataset/processed_data/logs" + + // model + "model": { + "audioldm": { + "image_size": 32, + "in_channels": 4, + "out_channels": 4, + "model_channels": 256, + "attention_resolutions": [4, 2, 1], + "num_res_blocks": 2, + "channel_mult": [1, 2, 4], + "num_heads": 8, + "use_spatial_transformer": true, + "transformer_depth": 1, + "context_dim": 768, + "use_checkpoint": true, + "legacy": false + }, + "autoencoderkl": { + "ch": 128, + "ch_mult": [1,1,2,2,4], + "num_res_blocks": 2, + "in_channels": 1, + "z_channels": 4, + "out_ch": 1, + "double_z": true + }, + "noise_scheduler": { + "num_train_timesteps": 1000, + "beta_start": 0.00085, + "beta_end": 0.012, + "beta_schedule": "scaled_linear", + "clip_sample": false, + "steps_offset": 1, + "set_alpha_to_one": false, + "skip_prk_steps": true, + "prediction_type": "epsilon" + }, + "autoencoder_path": "ckpts/tta/autoencoder_kl_debug/checkpoints/step-0445000_loss-0.3306.pt" + }, + + // train + "train": { + "adam": { + "lr": 5.0e-5 + }, + "ddp": false, + "random_seed": 12345, + "batch_size": 12, + "epochs": 50000, + "max_steps": 1000000, + "total_training_steps": 800000, + "save_summary_steps": 1000, + "save_checkpoints_steps": 5000, + "valid_interval": 5000, + "keep_checkpoint_max": 100 + } + } \ No newline at end of file diff --git a/egs/tta/audioldm/exp_config_base.json b/egs/tta/audioldm/exp_config_base.json new file mode 100644 index 0000000000000000000000000000000000000000..8201c070715112119d11750f68f76c83d80a9aa4 --- /dev/null +++ b/egs/tta/audioldm/exp_config_base.json @@ -0,0 +1,11 @@ +{ + "base_config": "config/audioldm.json", + "model_type": "AudioLDM", + "dataset": [ + "AudioCaps" + ], + "preprocess": { + "train_file": "train.json", + "valid_file": "vaild.json" + } +} \ 
No newline at end of file diff --git a/egs/tta/audioldm/exp_config_latent_4_10_78.json b/egs/tta/audioldm/exp_config_latent_4_10_78.json new file mode 100644 index 0000000000000000000000000000000000000000..09ae7c38cf332e74ecb4bc485d44a5f8985d43c5 --- /dev/null +++ b/egs/tta/audioldm/exp_config_latent_4_10_78.json @@ -0,0 +1,88 @@ +{ + "base_config": "egs/tta/audioldm/exp_config_base.json", + "dataset": [ + "AudioCaps" + ], + "preprocess": { + // Specify the output root path to save the processed data + "processed_dir": "data", + + // feature + "use_spkid": false, + "use_uv": false, + "use_frame_pitch": false, + "use_phone_pitch": false, + "use_frame_energy": false, + "use_phone_energy": false, + "use_mel": false, + "use_audio": false, + "use_label": false, + "use_one_hot": false, + // feature for text to audio + "use_caption": true, + "use_melspec": true, + "use_wav": false, + // feature dir + "melspec_dir": "mel", + "wav_dir": "wav" + }, + // Specify the output root path to save model ckpts and logs + "log_dir": "ckpts/tta", + + // model + "model": { + "audioldm": { + "image_size": 32, + "in_channels": 4, + "out_channels": 4, + "model_channels": 256, + "attention_resolutions": [4, 2, 1], + "num_res_blocks": 2, + "channel_mult": [1, 2, 4], + "num_heads": 8, + "use_spatial_transformer": true, + "transformer_depth": 1, + "context_dim": 768, + "use_checkpoint": true, + "legacy": false + }, + "autoencoderkl": { + "ch": 128, + "ch_mult": [1,2,2,4], + "num_res_blocks": 2, + "in_channels": 1, + "z_channels": 4, + "out_ch": 1, + "double_z": true + }, + "noise_scheduler": { + "num_train_timesteps": 1000, + "beta_start": 0.00085, + "beta_end": 0.012, + "beta_schedule": "scaled_linear", + "clip_sample": false, + "steps_offset": 1, + "set_alpha_to_one": false, + "skip_prk_steps": true, + "prediction_type": "epsilon" + }, + "autoencoder_path": "ckpts/tta/autoencoder_kl_debug_latent_size_4_10_78/checkpoints/step-0390000_loss-0.2876.pt" + }, + + // train + "train": { + "adam": { + "lr": 2.0e-5 + }, + "ddp": false, + "random_seed": 12345, + "batch_size": 12, + "epochs": 50000, + "max_steps": 1000000, + "total_training_steps": 800000, + "save_summary_steps": 1000, + "save_checkpoints_steps": 5000, + "valid_interval": 5000, + "keep_checkpoint_max": 100 + } + } \ No newline at end of file diff --git a/egs/tta/audioldm/run_inference.sh b/egs/tta/audioldm/run_inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..181244efece8e164eb44fb04f051dbb80721c248 --- /dev/null +++ b/egs/tta/audioldm/run_inference.sh @@ -0,0 +1,52 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
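# Usage (from the Amphion root):
#   sh egs/tta/audioldm/run_inference.sh --text "A man is whistling"
# Edit exp_config, checkpoint_path, vocoder_config_path, vocoder_path, and output_dir
# below to match your setup; num_steps and guidance_scale are passed to the sampler.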
+ +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config.json" +exp_name="audioldm_debug_latent_size_4_5_39" +checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_5_39/checkpoints/step-0570000_loss-0.2521.pt" +output_dir="$work_dir/temp" +vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json" +vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000" +num_steps=200 +guidance_scale=4.0 + +export CUDA_VISIBLE_DEVICES="0" + +######## Parse Command Line Arguments ########### +while [[ $# -gt 0 ]] +do +key="$1" + +case $key in + --text) + text="$2" + shift # past argument + shift # past value + ;; + *) # unknown option + shift # past argument + ;; +esac +done + +######## Run inference ########### +python "${work_dir}"/bins/tta/inference.py \ + --config=$exp_config \ + --checkpoint_path=$checkpoint_path \ + --text="$text" \ + --vocoder_path=$vocoder_path \ + --vocoder_config_path=$vocoder_config_path \ + --num_steps=$num_steps \ + --guidance_scale=$guidance_scale \ + --output_dir=$output_dir diff --git a/egs/tta/audioldm/run_inference_latent_4_10_78.sh b/egs/tta/audioldm/run_inference_latent_4_10_78.sh new file mode 100644 index 0000000000000000000000000000000000000000..3c247e9e6d2919e14e395deb83397439bb22efc5 --- /dev/null +++ b/egs/tta/audioldm/run_inference_latent_4_10_78.sh @@ -0,0 +1,52 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config_v2.json" +exp_name="audioldm_debug_latent_size_4_10_78" +checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_10_78/checkpoints/step-0325000_loss-0.1936.pt" +output_dir="$work_dir/temp" +vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json" +vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000" +num_steps=200 +guidance_scale=4.0 + +export CUDA_VISIBLE_DEVICES="0" + +######## Parse Command Line Arguments ########### +while [[ $# -gt 0 ]] +do +key="$1" + +case $key in + --text) + text="$2" + shift # past argument + shift # past value + ;; + *) # unknown option + shift # past argument + ;; +esac +done + +######## Run inference ########### +python "${work_dir}"/bins/tta/inference.py \ + --config=$exp_config \ + --checkpoint_path=$checkpoint_path \ + --text="A man is whistling" \ + --vocoder_path=$vocoder_path \ + --vocoder_config_path=$vocoder_config_path \ + --num_steps=$num_steps \ + --guidance_scale=$guidance_scale \ + --output_dir=$output_dir \ \ No newline at end of file diff --git a/egs/tta/audioldm/run_train.sh b/egs/tta/audioldm/run_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..b1060db6cf02ccd46bdc8ecac304e26233cea354 --- /dev/null +++ b/egs/tta/audioldm/run_train.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config.json" +exp_name="audioldm_debug_latent_size_4_5_39" + +num_workers=8 +export CUDA_VISIBLE_DEVICES="0" + +######## Train Model ########### +python "${work_dir}"/bins/tta/train_tta.py \ + --config=$exp_config \ + --num_workers=$num_workers \ + --exp_name=$exp_name \ + --stdout_interval=25 \ \ No newline at end of file diff --git a/egs/tta/audioldm/run_train_latent_4_10_78.sh b/egs/tta/audioldm/run_train_latent_4_10_78.sh new file mode 100644 index 0000000000000000000000000000000000000000..f61c0de52983ba6d4976b115fdc22c65696cbc8d --- /dev/null +++ b/egs/tta/audioldm/run_train_latent_4_10_78.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config_latent_4_10_78.json" +exp_name="audioldm_debug_latent_size_4_10_78" + +num_workers=8 +export CUDA_VISIBLE_DEVICES="0" + +######## Train Model ########### +python "${work_dir}"/bins/tta/train_tta.py \ + --config=$exp_config \ + --num_workers=$num_workers \ + --exp_name=$exp_name \ + --stdout_interval=25 \ \ No newline at end of file diff --git a/egs/tta/autoencoderkl/exp_config.json b/egs/tta/autoencoderkl/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e0d401051fea5920ade4ad7d765e816fdbec5e9d --- /dev/null +++ b/egs/tta/autoencoderkl/exp_config.json @@ -0,0 +1,49 @@ +{ + "base_config": "egs/tta/autoencoderkl/exp_config_base.json", + "dataset": [ + "AudioCaps" + ], + "preprocess": { + // Specify the output root path to save the processed data + "processed_dir": "data", + + // feature + "use_spk": false, + "use_spkid": false, + "use_uv": false, + "use_frame_pitch": false, + "use_phone_pitch": false, + "use_frame_energy": false, + "use_phone_energy": false, + "use_mel": false, + "use_audio": false, + "use_label": false, + "use_one_hot": false, + // feature for text to audio + "use_caption": true, + "use_melspec": true, + "use_wav": false, + // feature dir + "melspec_dir": "mel", + "wav_dir": "wav" + }, + // Specify the output root path to save model ckpts and logs + "log_dir": "ckpts/tta", + + // train + "train": { + "adam": { + "lr": 4.0e-5 + }, + "ddp": false, + "random_seed": 12345, + "batch_size": 12, + "epochs": 50000, + "max_steps": 1000000, + "total_training_steps": 800000, + "save_summary_steps": 1000, + "save_checkpoints_steps": 5000, + "valid_interval": 5000, + "keep_checkpoint_max": 100 + } + } \ No newline at end of file diff --git a/egs/tta/autoencoderkl/exp_config_base.json b/egs/tta/autoencoderkl/exp_config_base.json new file mode 100644 index 0000000000000000000000000000000000000000..d25badd4e99bb435e5eaf0cfc2ceeb96d75775e7 --- /dev/null +++ b/egs/tta/autoencoderkl/exp_config_base.json @@ -0,0 +1,11 @@ +{ + "base_config": "config/autoencoderkl.json", + "model_type": "AutoencoderKL", + "dataset": [ + "AudioCaps" + ], + "preprocess": { + "train_file": "train.json", + 
"valid_file": "vaild.json" + } +} \ No newline at end of file diff --git a/egs/tta/autoencoderkl/exp_config_latent_4_10_78.json b/egs/tta/autoencoderkl/exp_config_latent_4_10_78.json new file mode 100644 index 0000000000000000000000000000000000000000..911018a28c8e2d6a471a6505261e8050151e4d60 --- /dev/null +++ b/egs/tta/autoencoderkl/exp_config_latent_4_10_78.json @@ -0,0 +1,59 @@ +{ + "base_config": "egs/tta/autoencoderkl/exp_config_base.json", + "dataset": [ + "AudioCaps" + ], + "preprocess": { + // Specify the output root path to save the processed data + "processed_dir": "data", + + // feature + "use_spkid": false, + "use_uv": false, + "use_frame_pitch": false, + "use_phone_pitch": false, + "use_frame_energy": false, + "use_phone_energy": false, + "use_mel": false, + "use_audio": false, + "use_label": false, + "use_one_hot": false, + // feature for text to audio + "use_caption": true, + "use_melspec": true, + "use_wav": false, + // feature dir + "melspec_dir": "mel", + "wav_dir": "wav" + }, + // Specify the output root path to save model ckpts and logs + "log_dir": "ckpts/tta", + + "model": { + "autoencoderkl": { + "ch": 128, + "ch_mult": [1,2,2,4], + "num_res_blocks": 2, + "in_channels": 1, + "z_channels": 4, + "out_ch": 1, + "double_z": true + } + }, + // train + "train": { + "adam": { + "lr": 4.0e-5 + }, + "ddp": false, + "random_seed": 12345, + "batch_size": 12, + "epochs": 50000, + "max_steps": 1000000, + "total_training_steps": 800000, + "save_summary_steps": 1000, + "save_checkpoints_steps": 5000, + "valid_interval": 5000, + "keep_checkpoint_max": 100 + } + } \ No newline at end of file diff --git a/egs/tta/autoencoderkl/run_train.sh b/egs/tta/autoencoderkl/run_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..00a25693166dcee2d7b96dc6aa957ce96f8ef872 --- /dev/null +++ b/egs/tta/autoencoderkl/run_train.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config.json" +exp_name="autoencoder_kl_debug" + +num_workers=8 +export CUDA_VISIBLE_DEVICES="0" + +######## Train Model ########### +python "${work_dir}"/bins/tta/train_tta.py \ + --config=$exp_config \ + --num_workers=$num_workers \ + --exp_name=$exp_name \ + --stdout_interval=25 \ \ No newline at end of file diff --git a/egs/tta/autoencoderkl/run_train_latent_4_10_78.sh b/egs/tta/autoencoderkl/run_train_latent_4_10_78.sh new file mode 100644 index 0000000000000000000000000000000000000000..041627d9c43b56ee4f1657733062f18eb313e8b0 --- /dev/null +++ b/egs/tta/autoencoderkl/run_train_latent_4_10_78.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config_latent_4_10_78.json" +exp_name="autoencoder_kl_debug_latent_size_4_10_78" + +num_workers=8 +export CUDA_VISIBLE_DEVICES="0" + +######## Train Model ########### +python "${work_dir}"/bins/tta/train_tta.py \ + --config=$exp_config \ + --num_workers=$num_workers \ + --exp_name=$exp_name \ + --stdout_interval=25 \ \ No newline at end of file diff --git a/egs/tts/FastSpeech2/README.md b/egs/tts/FastSpeech2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..90108cf2a6c84d37dbb0537862e068376ea10c66 --- /dev/null +++ b/egs/tts/FastSpeech2/README.md @@ -0,0 +1,132 @@ + +# FastSpeech2 Recipe + +In this recipe, we will show how to train [FastSpeech2](https://openreview.net/forum?id=piLPYqxtWuA) using Amphion's infrastructure. FastSpeech2 is a non-autoregressive TTS architecture that utilizes feed-forward Transformer blocks. + +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download +You can use the commonly used TTS dataset to train TTS model, e.g., LJSpeech, VCTK, LibriTTS, etc. We strongly recommend you use LJSpeech to train TTS model for the first time. How to download dataset is detailed [here](../../datasets/README.md). + +### Configuration + +After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "LJSpeech", + ], + "dataset_path": { + // TODO: Fill in your dataset path + "LJSpeech": "[LJSpeech dataset path]", + }, +``` + +## 2. Features Extraction + +### Configuration + +Specify the `processed_dir` and the `log_dir` and for saving the processed data and the checkpoints in `exp_config.json`: + +```json + // TODO: Fill in the output log path + "log_dir": "ckpts/tts", + "preprocess": { + // TODO: Fill in the output data path + "processed_dir": "data", + ... + }, +``` + +### Run + +Run the `run.sh` as the preproces stage (set `--stage 1`): + +```bash +sh egs/tts/FastSpeech2/run.sh --stage 1 +``` + +## 3. Training + +### Configuration + +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on your GPU machines. + +``` +"train": { + "batch_size": 16, + } +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `ckpts/tts/[YourExptName]`. + +```bash +sh egs/tts/FastSpeech2/run.sh --stage 2 --name [YourExptName] +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + + +## 4. 
Inference + +### Configuration + +For inference, you need to specify the following configurations when running `run.sh`: + + +| Parameters | Description | Example | +| --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `ckpts/tts/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `ckpts/tts/[YourExptName]/result` | +| `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. | +| `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`. | +| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be  "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from test set as template testing set. | +| `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" | + +### Run +For example, if you want to generate speech of all testing set split from LJSpeech, just run: + +```bash +sh egs/tts/FastSpeech2/run.sh --stage 3 \ + --infer_expt_dir ckpts/tts/[YourExptName] \ + --infer_output_dir ckpts/tts/[YourExptName]/result \ + --infer_mode "batch" \ + --infer_dataset "LJSpeech" \ + --infer_testing_set "test" +``` + +Or, if you want to generate a single clip of speech from a given text, just run: + +```bash +sh egs/tts/FastSpeech2/run.sh --stage 3 \ + --infer_expt_dir ckpts/tts/[YourExptName] \ + --infer_output_dir ckpts/tts/[YourExptName]/result \ + --infer_mode "single" \ + --infer_text "This is a clip of generated speech with the given text from a TTS model." +``` + +We will release a pre-trained FastSpeech2 model trained on LJSpeech. So you can download the pre-trained model and generate speech following the above inference instruction. + + +```bibtex +@inproceedings{ren2020fastspeech, + title={FastSpeech 2: Fast and High-Quality End-to-End Text to Speech}, + author={Ren, Yi and Hu, Chenxu and Tan, Xu and Qin, Tao and Zhao, Sheng and Zhao, Zhou and Liu, Tie-Yan}, + booktitle={International Conference on Learning Representations}, + year={2020} +} +``` diff --git a/egs/tts/FastSpeech2/exp_config.json b/egs/tts/FastSpeech2/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6ba9022226ea987417913fb587e2a60ddb68bcb6 --- /dev/null +++ b/egs/tts/FastSpeech2/exp_config.json @@ -0,0 +1,21 @@ +{ + "base_config": "config/fs2.json", + "model_type": "FastSpeech2", + "dataset": [ + "LJSpeech" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "LJSpeech": "[LJSpeech dataset path]" + }, + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" + "log_dir": "ckpts/tts", + "preprocess": { + // TODO: Fill in the output data path. 
The default value is "Amphion/data" + "processed_dir": "data", + "sample_rate": 22050, + }, + "train": { + "batch_size": 16, + } +} diff --git a/egs/tts/FastSpeech2/prepare_mfa.sh b/egs/tts/FastSpeech2/prepare_mfa.sh new file mode 100644 index 0000000000000000000000000000000000000000..e8ce825b6a359f63e77e3ae001a48f14f3c69146 --- /dev/null +++ b/egs/tts/FastSpeech2/prepare_mfa.sh @@ -0,0 +1,14 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#!/bin/bash +mkdir mfa +cd mfa +wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.1.0-beta.2/montreal-forced-aligner_linux.tar.gz +tar -zxvf montreal-forced-aligner_linux.tar.gz +cd mfa +mkdir lexicon +cd lexicon +wget http://www.openslr.org/resources/11/librispeech-lexicon.txt \ No newline at end of file diff --git a/egs/tts/FastSpeech2/run.sh b/egs/tts/FastSpeech2/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..ad8da71e49783d5f47c3becb7d284506236678c2 --- /dev/null +++ b/egs/tts/FastSpeech2/run.sh @@ -0,0 +1,150 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +cd $work_dir/modules/monotonic_align +mkdir -p monotonic_align +python setup.py build_ext --inplace +cd $work_dir + +mfa_dir=$work_dir/mfa +echo $mfa_dir + +######## Parse the Given Parameters from the Commond ########### +# options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir:,name:,stage: -- "$@") +options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_mode:,infer_dataset:,infer_testing_set:,infer_text:,name:,stage: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generage a single clip of speech. + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inference dataset. It is only used when the inference model is "batch". + --infer_dataset) shift; infer_dataset=$1 ; shift ;; + # [Only for Inference] The inference testing set. It is only used when the inference model is "batch". It can be "test" set split from the dataset, or "golden_test" carefully selected from the testing set. + --infer_testing_set) shift; infer_testing_set=$1 ; shift ;; + # [Only for Inference] The text to be synthesized from. 
It is only used when the inference model is "single". + --infer_text) shift; infer_text=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + if [ ! -d "$mfa_dir" ]; then + bash ${exp_dir}/prepare_mfa.sh + fi + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/tts/preprocess.py \ + --config=$exp_config \ + --num_workers=4 \ + --prepare_alignment=true +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/tts/train.py \ + --config $exp_config \ + --exp_name $exp_name \ + --log_level debug +fi + +######## Inference ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$expt_dir/result" + fi + + if [ -z "$infer_mode" ]; then + echo "[Error] Please specify the inference mode, e.g., "batch", "single"" + exit 1 + fi + + if [ "$infer_mode" = "batch" ] && [ -z "$infer_dataset" ]; then + echo "[Error] Please specify the dataset used in inference when the inference mode is batch" + exit 1 + fi + + if [ "$infer_mode" = "batch" ] && [ -z "$infer_testing_set" ]; then + echo "[Error] Please specify the testing set used in inference when the inference mode is batch" + exit 1 + fi + + if [ "$infer_mode" = "single" ] && [ -z "$infer_text" ]; then + echo "[Error] Please specify the text to be synthesized when the inference mode is single" + exit 1 + fi + + if [ "$infer_mode" = "single" ]; then + echo 'Text: ' ${infer_text} + infer_dataset=None + infer_testing_set=None + elif [ "$infer_mode" = "batch" ]; then + infer_text='' + fi + + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \ + --config $exp_config \ + --acoustics_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --mode $infer_mode \ + --dataset $infer_dataset \ + --testing_set $infer_testing_set \ + --text "$infer_text" \ + --log_level debug \ + --vocoder_dir /mntnfs/lee_data1/chenxi/processed_data/ljspeech/model_ckpt/hifigan/checkpoints + + + +fi \ No newline at end of file diff --git a/egs/tts/NaturalSpeech2/exp_config.json b/egs/tts/NaturalSpeech2/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..170a15e7b1ceacda03bd6d7422f67a8adfa570b4 --- /dev/null +++ b/egs/tts/NaturalSpeech2/exp_config.json @@ -0,0 +1,39 @@ +{ + "base_config": "egs/tts/NaturalSpeech2/exp_config_base.json", + "dataset": [ + "LibriTTS" + ], + "preprocess": { + // Specify the output root path to save the processed data + "processed_dir": "[LibriTTS dataset path]", + "train_file": "train.json", + "valid_file": "test.json", + "read_metadata": true, + "metadata_dir": "metadata" + }, + // Specify the output root path to save model ckpts and logs + "log_dir": "ckpts/tts", + "train": { + // New trainer and Accelerator + "gradient_accumulation_step": 1, + 
"tracker": ["tensorboard"], + "max_epoch": 5000, + "save_checkpoint_stride": [1], + "keep_last": [1000], + "run_eval": [true], + "dataloader": { + "num_worker": 16, + "pin_memory": true + }, + "adam": { + "lr": 1.0e-4 + }, + "use_dynamic_batchsize": true, + "batch_size": 8, + "max_tokens": 7500, + "max_sentences": 32, + "lr_warmup_steps": 5000, + "lr_scheduler": "cosine", + "num_train_steps": 800000 + } + } \ No newline at end of file diff --git a/egs/tts/NaturalSpeech2/exp_config_base.json b/egs/tts/NaturalSpeech2/exp_config_base.json new file mode 100644 index 0000000000000000000000000000000000000000..31edb71192f18d0079f66bf51217d7857f5f21ca --- /dev/null +++ b/egs/tts/NaturalSpeech2/exp_config_base.json @@ -0,0 +1,118 @@ +{ + "base_config": "config/ns2.json", + "model_type": "NaturalSpeech2", + "dataset": [ + "LibriTTS" + ], + "preprocess": { + "use_mel": false, + "use_code": true, + "use_spkid": true, + "use_pitch": true, + "use_duration": true, + "use_phone": true, + "use_len": true, + "use_cross_reference": true, + "train_file": "train.json", + "valid_file": "test.json", + "melspec_dir": "mel", + "code_dir": "code", + "pitch_dir": "pitch", + "duration_dir": "duration", + "metadata_dir": "metadata", + "read_metadata": true, + "clip_mode": "start" + }, + "model": { + "latent_dim": 128, + "prior_encoder": { + "vocab_size": 100, + "pitch_min": 50, + "pitch_max": 1100, + "pitch_bins_num": 512, + "encoder": { + "encoder_layer": 6, + "encoder_hidden": 512, + "encoder_head": 8, + "conv_filter_size": 2048, + "conv_kernel_size": 9, + "encoder_dropout": 0.2, + "use_cln": true + }, + "duration_predictor": { + "input_size": 512, + "filter_size": 512, + "kernel_size": 3, + "conv_layers": 30, + "cross_attn_per_layer": 3, + "attn_head": 8, + "drop_out": 0.5 + }, + "pitch_predictor": { + "input_size": 512, + "filter_size": 512, + "kernel_size": 5, + "conv_layers": 30, + "cross_attn_per_layer": 3, + "attn_head": 8, + "drop_out": 0.5 + } + }, + "diffusion": { + "wavenet": { + "input_size": 128, + "hidden_size": 512, + "out_size": 128, + "num_layers": 40, + "cross_attn_per_layer": 3, + "dilation_cycle": 2, + "attn_head": 8, + "drop_out": 0.2 + }, + "beta_min": 0.05, + "beta_max": 20, + "sigma": 1.0, + "noise_factor": 1.0, + "ode_solver": "euler", + "diffusion_type": "diffusion" + }, + "prompt_encoder": { + "encoder_layer": 6, + "encoder_hidden": 512, + "encoder_head": 8, + "conv_filter_size": 2048, + "conv_kernel_size": 9, + "encoder_dropout": 0.2, + "use_cln": false + }, + "query_emb": { + "query_token_num": 32, + "hidden_size": 512, + "head_num": 8 + }, + "inference_step": 500 + }, + "train": { + "use_dynamic_batchsize": true, + "max_tokens": 7500, + "max_sentences": 32, + "lr_warmup_steps": 5000, + "lr_scheduler": "cosine", + "num_train_steps": 800000, + "adam": { + "lr": 7.5e-5 + }, + "diff_ce_loss_lambda": 0.5, + "diff_noise_loss_lambda": 1.0, + "ddp": false, + "random_seed": 114, + "batch_size": 32, + "epochs": 5000, + "max_steps": 1000000, + "total_training_steps": 800000, + "save_summary_steps": 500, + "save_checkpoints_steps": 2000, + "valid_interval": 2000, + "keep_checkpoint_max": 100 + } +} \ No newline at end of file diff --git a/egs/tts/NaturalSpeech2/run_inference.sh b/egs/tts/NaturalSpeech2/run_inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..b1a91b8c4efb20b0ec2ab1f913a9b84ab8fb2389 --- /dev/null +++ b/egs/tts/NaturalSpeech2/run_inference.sh @@ -0,0 +1,43 @@ +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) 
+work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config.json" +exp_name="ns2_libritts" +ref_audio="$work_dir/egs/tts/NaturalSpeech2/prompt_example/ref_audio.wav" +checkpoint_path="$work_dir/ckpts/tts/ns2_libritts/checkpoint/epoch-0065_step-0376136_loss-7.126379" +output_dir="$work_dir/output" +mode="single" + +export CUDA_VISIBLE_DEVICES="0" + +######## Parse Command Line Arguments ########### +while [[ $# -gt 0 ]] +do +key="$1" + +case $key in + --text) + text="$2" + shift # past argument + shift # past value + ;; + *) # unknown option + shift # past argument + ;; +esac +done + +######## Run Inference ########### +python "${work_dir}"/bins/tts/inference.py \ + --config=$exp_config \ + --text="$text" \ + --mode=$mode \ + --checkpoint_path=$checkpoint_path \ + --ref_audio=$ref_audio \ + --output_dir=$output_dir \ \ No newline at end of file diff --git a/egs/tts/NaturalSpeech2/run_train.sh b/egs/tts/NaturalSpeech2/run_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..561be9e8f3ce11b753f1148e2e797d7b5ef5b3f0 --- /dev/null +++ b/egs/tts/NaturalSpeech2/run_train.sh @@ -0,0 +1,18 @@ +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config.json" +exp_name="ns2_libritts" + +######## Train Model ########### +CUDA_VISIBLE_DEVICES="0" accelerate launch \ + "${work_dir}"/bins/tts/train.py \ + --config=$exp_config \ + --exp_name=$exp_name \ + --log_level debug \ \ No newline at end of file diff --git a/egs/tts/README.md b/egs/tts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d1d40196af0e95f401958428511fd754d0d7e449 --- /dev/null +++ b/egs/tts/README.md @@ -0,0 +1,17 @@ + +# Amphion Text-to-Speech (TTS) Recipe + +## Quick Start + +We provide a **[beginner recipe](VALLE/)** to demonstrate how to train a cutting-edge TTS model. Specifically, it is Amphion's re-implementation of [Vall-E](https://arxiv.org/abs/2301.02111), which is a zero-shot TTS architecture that uses a neural codec language model with discrete codes. + +## Supported Model Architectures + +Until now, Amphion TTS supports the following models or architectures: +- **[FastSpeech2](FastSpeech2)**: A non-autoregressive TTS architecture that utilizes feed-forward Transformer blocks. +- **[VITS](VITS)**: An end-to-end TTS architecture that utilizes a conditional variational autoencoder with adversarial learning. +- **[Vall-E](VALLE)**: A zero-shot TTS architecture that uses a neural codec language model with discrete codes. +- **[NaturalSpeech2](NaturalSpeech2)** (👨‍💻 developing): An architecture for TTS that utilizes a latent diffusion model to generate natural-sounding voices. + +## Amphion TTS Demo +Here are some [TTS samples](https://openhlt.github.io/Amphion_TTS_Demo/) from Amphion. diff --git a/egs/tts/VALLE/README.md b/egs/tts/VALLE/README.md new file mode 100644 index 0000000000000000000000000000000000000000..63b837f832bb0aa5138ae31e5d232eeaca27574f --- /dev/null +++ b/egs/tts/VALLE/README.md @@ -0,0 +1,139 @@ +# VALL-E Recipe + +In this recipe, we will show how to train [VALL-E](https://arxiv.org/abs/2301.02111) using Amphion's infrastructure.
VALL-E is a zero-shot TTS architecture that uses a neural codec language model with discrete codes. + +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download +You can use the commonly used TTS dataset to train VALL-E model, e.g., LibriTTS, etc. We strongly recommend you use LibriTTS to train VALL-E model for the first time. How to download dataset is detailed [here](../../datasets/README.md). + +### Configuration + +After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "libritts", + ], + "dataset_path": { + // TODO: Fill in your dataset path + "libritts": "[LibriTTS dataset path]", + }, +``` + +## 2. Features Extraction + +### Configuration + +Specify the `processed_dir` and the `log_dir` and for saving the processed data and the checkpoints in `exp_config.json`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" + "log_dir": "ckpts/tts", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + ... + }, +``` + +### Run + +Run the `run.sh` as the preproces stage (set `--stage 1`): + +```bash +sh egs/tts/VALLE/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + + +## 3. Training + +### Configuration + +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on your GPU machines. + +``` +"train": { + "batch_size": 4, + } +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`. + +Specifically, VALL-E need to train a autoregressive (AR) model and then a non-autoregressive (NAR) model. So, you can set `--model_train_stage 1` to train AR model, and set `--model_train_stage 2` to train NAR model, where `--ar_model_ckpt_dir` should be set as the ckeckpoint path to the trained AR model. + + +Train a AR moel, just run: + +```bash +sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 1 --name [YourExptName] +``` + +Train a NAR model, just run: +```bash +sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 2 --ar_model_ckpt_dir [ARModelPath] --name [YourExptName] +``` + + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + + +## 4. Inference + +### Configuration + +For inference, you need to specify the following configurations when running `run.sh`: + + + +| Parameters | Description | Example | +| --------------------- | -------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--infer_expt_dir` | The experimental directory of NAR model which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. 
| `Amphion/ckpts/tts/[YourExptName]/result` | +| `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. | +| `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" | +| `--infer_text_prompt` | The text prompt for inference. | The text prompt should be aligned with the audio prompt. | +| `--infer_audio_prompt` | The audio prompt for inference. | The audio prompt should be aligned with text prompt.| +| `--test_list_file` | The test list file used for batch inference. | The format of test list file is `text\|text_prompt\|audio_prompt`.| + + +### Run +For example, if you want to generate a single clip of speech, just run: + +```bash +sh egs/tts/VALLE/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ + --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ + --infer_mode "single" \ + --infer_text "This is a clip of generated speech with the given text from a TTS model." \ + --infer_text_prompt "But even the unsuccessful dramatist has his moments." \ + --infer_audio_prompt egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.wav +``` + + +We released a pre-trained Amphion VALL-E model. So you can download the pre-trained model [here](https://huggingface.co/amphion/valle-libritts) and generate speech following the above inference instruction. + +```bibtex +@article{wang2023neural, + title={Neural codec language models are zero-shot text to speech synthesizers}, + author={Wang, Chengyi and Chen, Sanyuan and Wu, Yu and Zhang, Ziqiang and Zhou, Long and Liu, Shujie and Chen, Zhuo and Liu, Yanqing and Wang, Huaming and Li, Jinyu and others}, + journal={arXiv preprint arXiv:2301.02111}, + year={2023} +} +``` \ No newline at end of file diff --git a/egs/tts/VALLE/exp_config.json b/egs/tts/VALLE/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f013dad0b7f85c57fcd99a578278fef51cf5d717 --- /dev/null +++ b/egs/tts/VALLE/exp_config.json @@ -0,0 +1,33 @@ +{ + "base_config": "config/valle.json", + "model_type": "VALLE", + "dataset": [ + "libritts" + ], + "dataset_path": { + "libritts": "[LibriTTS dataset path]" + }, + "preprocess": { + "extract_phone": true, + "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" + "extract_acoustic_token": true, + "use_phone": true, + "use_acoustic_token": true, + "processed_dir": "Amphion/data/", + "sample_rate": 24000, // "Audio sampling rate." + "codec_hop_size": "320", // "Audio codec hop size." + "valid_file": "test.json", + }, + "model": { + "prefix_mode": 1, // "The mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance.", + }, + "log_dir": "Amphion/ckpts/tts/valle", + "train": { + "batch_size": 4, + "train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s) + "max_epoch": 20, // "Number of epochs to train." 
+ "use_dynamic_batchsize": true, // If use dynamic batch size + "max_tokens": 4000, // If use dynamic batch size + "max_sentences": 10 // If use dynamic batch size + } +} diff --git a/egs/tts/VALLE/prompt_examples/260_123440_000010_000004.normalized.txt b/egs/tts/VALLE/prompt_examples/260_123440_000010_000004.normalized.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed9d2b859d3c7bbb7bcc2476323d1a219ee8d53f --- /dev/null +++ b/egs/tts/VALLE/prompt_examples/260_123440_000010_000004.normalized.txt @@ -0,0 +1 @@ +I almost think I can remember feeling a little different. \ No newline at end of file diff --git a/egs/tts/VALLE/prompt_examples/5142_33396_000002_000004.normalized.txt b/egs/tts/VALLE/prompt_examples/5142_33396_000002_000004.normalized.txt new file mode 100644 index 0000000000000000000000000000000000000000..4dab0ebd446cb814acccb2b97fee31a0d5b7444d --- /dev/null +++ b/egs/tts/VALLE/prompt_examples/5142_33396_000002_000004.normalized.txt @@ -0,0 +1 @@ +Ten sons sat at meat with him, and I was the youngest. \ No newline at end of file diff --git a/egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.normalized.txt b/egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.normalized.txt new file mode 100644 index 0000000000000000000000000000000000000000..04db64ff2711c35a78cbb210d11fce06865d831c --- /dev/null +++ b/egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.normalized.txt @@ -0,0 +1 @@ +The girl entered, and gave an involuntary cry of surprise. \ No newline at end of file diff --git a/egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.normalized.txt b/egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.normalized.txt new file mode 100644 index 0000000000000000000000000000000000000000..6400078bd5d5fd4043a0019fc1e8b276b408cd39 --- /dev/null +++ b/egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.normalized.txt @@ -0,0 +1 @@ +But even the unsuccessful dramatist has his moments. \ No newline at end of file diff --git a/egs/tts/VALLE/run.sh b/egs/tts/VALLE/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..104f9f32619145952fac5218a3a24134837c4ad9 --- /dev/null +++ b/egs/tts/VALLE/run.sh @@ -0,0 +1,158 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +cd $work_dir/modules/monotonic_align +mkdir -p monotonic_align +python setup.py build_ext --inplace +cd $work_dir + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,ar_model_ckpt_dir:,infer_output_dir:,infer_mode:,infer_test_list_file:,infer_text:,infer_text_prompt:,infer_audio_prompt:,model_train_stage:,name:,stage: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] Model training stage. + --model_train_stage) shift; model_train_stage=$1 ; shift ;; + # [Only for Training] The stage1 ckpt dir. 
The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --ar_model_ckpt_dir) shift; ar_model_ckpt_dir=$1 ; shift ;; + + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generage a single clip of speech. + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inference test list file. It is only used when the inference model is "batch". + --infer_test_list_file) shift; infer_test_list_file=$1 ; shift ;; + # [Only for Inference] The text to be synthesized from. It is only used when the inference model is "single". + --infer_text) shift; infer_text=$1 ; shift ;; + # [Only for Inference] The inference text prompt. It is only used when the inference model is "single". + --infer_text_prompt) shift; infer_text_prompt=$1 ; shift ;; + # [Only for Inference] The inference audio prompt. It is only used when the inference model is "single". + --infer_audio_prompt) shift; infer_audio_prompt=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/tts/preprocess.py \ + --config=$exp_config \ + --num_workers=4 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + + if [ "$model_train_stage" = "2" ] && [ -z "$ar_model_ckpt_dir" ]; then + echo "[Error] Please specify the ckeckpoint path to the trained model in stage1." + exit 1 + fi + + if [ "$model_train_stage" = "1" ]; then + ar_model_ckpt_dir=None + fi + + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch --main_process_port 29510 \ + "${work_dir}"/bins/tts/train.py \ + --config $exp_config \ + --exp_name $exp_name \ + --log_level debug \ + --train_stage $model_train_stage \ + --checkpoint_path $ar_model_ckpt_dir +fi + + +######## Inference ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. 
The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$expt_dir/result" + fi + + if [ -z "$infer_mode" ]; then + echo "[Error] Please specify the inference mode, e.g., "batch", "single"" + exit 1 + fi + + if [ "$infer_mode" = "batch" ] && [ -z "$infer_test_list_file" ]; then + echo "[Error] Please specify the test list file used in inference when the inference mode is batch" + exit 1 + fi + + if [ "$infer_mode" = "single" ] && [ -z "$infer_text" ]; then + echo "[Error] Please specify the text to be synthesized when the inference mode is single" + exit 1 + fi + + if [ "$infer_mode" = "single" ]; then + echo 'Text: ' ${infer_text} + infer_test_list_file=None + elif [ "$infer_mode" = "batch" ]; then + infer_text="" + infer_text_prompt="" + infer_audio_prompt="" + fi + + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \ + --config $exp_config \ + --log_level debug \ + --acoustics_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --mode $infer_mode \ + --text "$infer_text" \ + --text_prompt "$infer_text_prompt" \ + --audio_prompt $infer_audio_prompt\ + --test_list_file $infer_test_list_file \ + +fi diff --git a/egs/tts/VITS/README.md b/egs/tts/VITS/README.md new file mode 100644 index 0000000000000000000000000000000000000000..258f386d8abed927a7aecebff3c37fd55a07255e --- /dev/null +++ b/egs/tts/VITS/README.md @@ -0,0 +1,135 @@ + +# VITS Recipe + +In this recipe, we will show how to train [VITS](https://arxiv.org/abs/2106.06103) using Amphion's infrastructure. VITS is an end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning. + +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download +You can use the commonly used TTS dataset to train TTS model, e.g., LJSpeech, VCTK, LibriTTS, etc. We strongly recommend you use LJSpeech to train TTS model for the first time. How to download dataset is detailed [here](../../datasets/README.md). + +### Configuration + +After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "LJSpeech", + ], + "dataset_path": { + // TODO: Fill in your dataset path + "LJSpeech": "[LJSpeech dataset path]", + }, +``` + +## 2. Features Extraction + +### Configuration + +Specify the `processed_dir` and the `log_dir` and for saving the processed data and the checkpoints in `exp_config.json`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" + "log_dir": "ckpts/tts", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + ... + }, +``` + +### Run + +Run the `run.sh` as the preproces stage (set `--stage 1`): + +```bash +sh egs/tts/VITS/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + +## 3. Training + +### Configuration + +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on your GPU machines. 
+ +``` +"train": { + "batch_size": 16, + } +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`. + +```bash +sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + + +## 4. Inference + +### Configuration + +For inference, you need to specify the following configurations when running `run.sh`: + + +| Parameters | Description | Example | +| --------------------- | -------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` | +| `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. | +| `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`. | +| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be  "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from test set as template testing set. | +| `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" | + +### Run +For example, if you want to generate speech of all testing set split from LJSpeech, just run: + +```bash +sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ + --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ + --infer_mode "batch" \ + --infer_dataset "LJSpeech" \ + --infer_testing_set "test" +``` + +Or, if you want to generate a single clip of speech from a given text, just run: + +```bash +sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ + --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ + --infer_mode "single" \ + --infer_text "This is a clip of generated speech with the given text from a TTS model." +``` + +We released a pre-trained Amphion VITS model trained on LJSpeech. So you can download the pre-trained model [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the above inference instruction. 
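+
+For example, here is a minimal sketch of running inference with the released checkpoint. The repository URL is the one linked above; the local directory name and the expected checkpoint layout are assumptions, so adjust the paths to match what you actually download:
+
+```bash
+# Download the released VITS checkpoint (assumes git-lfs is installed)
+git lfs install
+git clone https://huggingface.co/amphion/vits-ljspeech ckpts/tts/vits-ljspeech
+
+# Point --infer_expt_dir at the downloaded experiment directory and synthesize one clip
+sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
+    --infer_expt_dir ckpts/tts/vits-ljspeech \
+    --infer_output_dir ckpts/tts/vits-ljspeech/result \
+    --infer_mode "single" \
+    --infer_text "This is a clip of generated speech with the given text from a TTS model."
+```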
+ + +```bibtex +@inproceedings{kim2021conditional, + title={Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech}, + author={Kim, Jaehyeon and Kong, Jungil and Son, Juhee}, + booktitle={International Conference on Machine Learning}, + pages={5530--5540}, + year={2021}, +} +``` \ No newline at end of file diff --git a/egs/tts/VITS/exp_config.json b/egs/tts/VITS/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b210a2656186844fe1d0fc88e515fdc6cc207dcb --- /dev/null +++ b/egs/tts/VITS/exp_config.json @@ -0,0 +1,27 @@ +{ + "base_config": "config/vits.json", + "model_type": "VITS", + "dataset": [ + "LJSpeech" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "LJSpeech": "[LJSpeech dataset path]" + }, + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" + "log_dir": "ckpts/tts", + "preprocess": { + "use_phone": true, + // linguistic features + "extract_phone": true, + "phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + + "sample_rate": 22050, + "valid_file": "test.json", // validattion set + }, + "train": { + "batch_size": 16, + } +} \ No newline at end of file diff --git a/egs/tts/VITS/run.sh b/egs/tts/VITS/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..8f330d32bca818b92f1f2c71ed78a6c43b703aea --- /dev/null +++ b/egs/tts/VITS/run.sh @@ -0,0 +1,142 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +cd $work_dir/modules/monotonic_align +mkdir -p monotonic_align +python setup.py build_ext --inplace +cd $work_dir + +######## Parse the Given Parameters from the Commond ########### +# options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir:,name:,stage: -- "$@") +options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_mode:,infer_dataset:,infer_testing_set:,infer_text:,name:,stage: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generage a single clip of speech. + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inference dataset. It is only used when the inference model is "batch". 
+ --infer_dataset) shift; infer_dataset=$1 ; shift ;; + # [Only for Inference] The inference testing set. It is only used when the inference model is "batch". It can be "test" set split from the dataset, or "golden_test" carefully selected from the testing set. + --infer_testing_set) shift; infer_testing_set=$1 ; shift ;; + # [Only for Inference] The text to be synthesized from. It is only used when the inference model is "single". + --infer_text) shift; infer_text=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/tts/preprocess.py \ + --config=$exp_config \ + --num_workers=4 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/tts/train.py \ + --config $exp_config \ + --exp_name $exp_name \ + --log_level debug +fi + +######## Inference ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$expt_dir/result" + fi + + if [ -z "$infer_mode" ]; then + echo "[Error] Please specify the inference mode, e.g., "batch", "single"" + exit 1 + fi + + if [ "$infer_mode" = "batch" ] && [ -z "$infer_dataset" ]; then + echo "[Error] Please specify the dataset used in inference when the inference mode is batch" + exit 1 + fi + + if [ "$infer_mode" = "batch" ] && [ -z "$infer_testing_set" ]; then + echo "[Error] Please specify the testing set used in inference when the inference mode is batch" + exit 1 + fi + + if [ "$infer_mode" = "single" ] && [ -z "$infer_text" ]; then + echo "[Error] Please specify the text to be synthesized when the inference mode is single" + exit 1 + fi + + if [ "$infer_mode" = "single" ]; then + echo 'Text: ' ${infer_text} + infer_dataset=None + infer_testing_set=None + elif [ "$infer_mode" = "batch" ]; then + infer_text='' + fi + + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \ + --config $exp_config \ + --acoustics_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --mode $infer_mode \ + --dataset $infer_dataset \ + --testing_set $infer_testing_set \ + --text "$infer_text" \ + --log_level debug + + + +fi diff --git a/egs/vocoder/README.md b/egs/vocoder/README.md new file mode 100644 index 0000000000000000000000000000000000000000..213253570b19b5c1d3746ee17911c7eb464ecc73 --- /dev/null +++ b/egs/vocoder/README.md @@ -0,0 +1,23 @@ +# Amphion Vocoder Recipe + +## Quick Start + +We provide a [**beginner recipe**](gan/tfr_enhanced_hifigan/README.md) to demonstrate how to train a high quality HiFi-GAN speech vocoder. Specially, it is also an official implementation of our paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". 
Some demos can be seen [here](https://vocodexelysium.github.io/MS-SB-CQTD/). + +## Supported Models + +Neural vocoder generates audible waveforms from acoustic representations, which is one of the key parts for current audio generation systems. Until now, Amphion has supported various widely-used vocoders according to different vocoder types, including: + +- **GAN-based vocoders**, which we have provided [**a unified recipe**](gan/README.md) : + - [MelGAN](https://arxiv.org/abs/1910.06711) + - [HiFi-GAN](https://arxiv.org/abs/2010.05646) + - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts) + - [BigVGAN](https://arxiv.org/abs/2206.04658) + - [APNet](https://arxiv.org/abs/2305.07952) +- **Flow-based vocoders** (👨‍💻 developing): + - [WaveGlow](https://arxiv.org/abs/1811.00002) +- **Diffusion-based vocoders** (👨‍💻 developing): + - [Diffwave](https://arxiv.org/abs/2009.09761) +- **Auto-regressive based vocoders** (👨‍💻 developing): + - [WaveNet](https://arxiv.org/abs/1609.03499) + - [WaveRNN](https://arxiv.org/abs/1802.08435v1) \ No newline at end of file diff --git a/egs/vocoder/diffusion/README.md b/egs/vocoder/diffusion/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/egs/vocoder/diffusion/exp_config_base.json b/egs/vocoder/diffusion/exp_config_base.json new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/egs/vocoder/gan/README.md b/egs/vocoder/gan/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dcefd84d6b9c02223efac119d29dcfd88cd0c026 --- /dev/null +++ b/egs/vocoder/gan/README.md @@ -0,0 +1,234 @@ +# Amphion GAN-based Vocoder Recipe + +## Supported Model Architectures + +GAN-based Vocoder consists of a generator and multiple discriminators, as illustrated below: + +
+<!-- Figure: GAN-based vocoder pipeline (a generator followed by multiple discriminators) -->
+ +Until now, Amphion GAN-based Vocoder has supported the following generators and discriminators. + +- **Generators** + - [MelGAN](https://arxiv.org/abs/1910.06711) + - [HiFi-GAN](https://arxiv.org/abs/2010.05646) + - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts) + - [BigVGAN](https://arxiv.org/abs/2206.04658) + - [APNet](https://arxiv.org/abs/2305.07952) +- **Discriminators** + - [Multi-Scale Discriminator](https://arxiv.org/abs/2010.05646) + - [Multi-Period Discriminator](https://arxiv.org/abs/2010.05646) + - [Multi-Resolution Discriminator](https://arxiv.org/abs/2011.09631) + - [Multi-Scale Short-Time Fourier Transform Discriminator](https://arxiv.org/abs/2210.13438) + - [**Multi-Scale Constant-Q Transfrom Discriminator (ours)**](https://arxiv.org/abs/2311.14957) + +You can use any vocoder architecture with any dataset you want. There are four steps in total: + +1. Data preparation +2. Feature extraction +3. Training +4. Inference + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +You can train the vocoder with any datasets. Amphion's supported open-source datasets are detailed [here](../../../datasets/README.md). + +### Configuration + +Specify the dataset path in `exp_config_base.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json +"dataset": [ + "csd", + "kising", + "m4singer", + "nus48e", + "opencpop", + "opensinger", + "opera", + "pjs", + "popbutfy", + "popcs", + "ljspeech", + "vctk", + "libritts", +], +"dataset_path": { + // TODO: Fill in your dataset path + "csd": "[dataset path]", + "kising": "[dataset path]", + "m4singer": "[dataset path]", + "nus48e": "[dataset path]", + "opencpop": "[dataset path]", + "opensinger": "[dataset path]", + "opera": "[dataset path]", + "pjs": "[dataset path]", + "popbutfy": "[dataset path]", + "popcs": "[dataset path]", + "ljspeech": "[dataset path]", + "vctk": "[dataset path]", + "libritts": "[dataset path]", +}, +``` + +### 2. Feature Extraction + +The needed features are speficied in the individual vocoder direction so it doesn't require any modification. + +### Configuration + +Specify the dataset path and the output path for saving the processed data and the training model in `exp_config_base.json`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder" + "log_dir": "ckpts/vocoder", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + ... + }, +``` + +### Run + +Run the `run.sh` as the preproces stage (set `--stage 1`). + +```bash +sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + +## 3. Training + +### Configuration + +We provide the default hyparameters in the `exp_config_base.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on you GPU machines. + +```json +"train": { + "batch_size": 32, + "max_epoch": 1000000, + "save_checkpoint_stride": [20], + "adamw": { + "lr": 2.0e-4, + "adam_b1": 0.8, + "adam_b2": 0.99 + }, + "exponential_lr": { + "lr_decay": 0.999 + }, +} +``` + +You can also choose any amount of prefered discriminators for training in the `exp_config_base.json`. 
+ +```json +"discriminators": [ + "msd", + "mpd", + "msstftd", + "mssbcqtd", +], +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`. + +```bash +sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 2 --name [YourExptName] +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + +If you want to resume or finetune from a pretrained model, run: + +```bash +sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 2 \ + --name [YourExptName] \ + --resume_type ["resume" for resuming training and "finetune" for loading parameters only] \ + --checkpoint Amphion/ckpts/vocoder/[YourExptName]/checkpoint \ +``` + +> **NOTE:** For multi-gpu training, the `main_process_port` is set as `29500` in default. You can change it when running `run.sh` by specifying such as `--main_process_port 29501`. + +## 4. Inference + +### Run + +Run the `run.sh` as the training stage (set `--stage 3`), we provide three different inference modes, including `infer_from_dataset`, `infer_from_feature`, `and infer_from_audio`. + +```bash +sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \ + --infer_mode [Your chosen inference mode] \ + --infer_datasets [Datasets you want to inference, needed when infer_from_dataset] \ + --infer_feature_dir [Your path to your predicted acoustic features, needed when infer_from_feature] \ + --infer_audio_dir [Your path to your audio files, needed when infer_form_audio] \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` + +#### a. Inference from Dataset + +Run the `run.sh` with specified datasets, here is an example. + +```bash +sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \ + --infer_mode infer_from_dataset \ + --infer_datasets "libritts vctk ljspeech" \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` + +#### b. Inference from Features + +If you want to inference from your generated acoustic features, you should first prepare your acoustic features into the following structure: + +```plaintext + ┣ {infer_feature_dir} + ┃ ┣ mels + ┃ ┃ ┣ sample1.npy + ┃ ┃ ┣ sample2.npy + ┃ ┣ f0s (required if you use NSF-HiFiGAN) + ┃ ┃ ┣ sample1.npy + ┃ ┃ ┣ sample2.npy +``` + +Then run the `run.sh` with specificed folder direction, here is an example. + +```bash +sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \ + --infer_mode infer_from_feature \ + --infer_feature_dir [Your path to your predicted acoustic features] \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` + +#### c. Inference from Audios + +If you want to inference from audios for quick analysis synthesis, you should first prepare your audios into the following structure: + +```plaintext + ┣ audios + ┃ ┣ sample1.wav + ┃ ┣ sample2.wav +``` + +Then run the `run.sh` with specificed folder direction, here is an example. 
+ +```bash +sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \ + --infer_mode infer_from_audio \ + --infer_audio_dir [Your path to your audio files] \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` diff --git a/egs/vocoder/gan/_template/run.sh b/egs/vocoder/gan/_template/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/_template/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. + --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + # [Only for Traiing] `main_process_port` for multi gpu training + --main_process_port) shift; main_process_port=$1 ; shift ;; + + # [Only for Inference] The inference mode + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inferenced datasets + --infer_datasets) shift; infer_datasets=$1 ; shift ;; + # [Only for Inference] The feature dir for inference + --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;; + # [Only for Inference] The audio dir for inference + --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;; + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. 
Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$main_process_port" ]; then + main_process_port=29500 +fi +echo "Main Process Port: $main_process_port" + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \ + --config $exp_config \ + --num_workers 8 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch \ + --main_process_port "$main_process_port" \ + "${work_dir}"/bins/vocoder/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --checkpoint "$checkpoint" \ + --resume_type "$resume_type" +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/egs/vocoder/gan/apnet/exp_config.json b/egs/vocoder/gan/apnet/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..520d3da3ea36201b8fa71e0fb1fd0d1ef9bbfdc0 --- /dev/null +++ b/egs/vocoder/gan/apnet/exp_config.json @@ -0,0 +1,45 @@ +{ + "base_config": "egs/vocoder/gan/exp_config_base.json", + "preprocess": { + // acoustic features + "extract_mel": true, + "extract_audio": true, + "extract_amplitude_phase": true, + + // Features used for model training + "use_mel": true, + "use_audio": true, + "use_amplitude_phase": true + }, + "model": { + "generator": "apnet", + "apnet": { + "ASP_channel": 512, + "ASP_resblock_kernel_sizes": [3,7,11], + "ASP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "ASP_input_conv_kernel_size": 7, + "ASP_output_conv_kernel_size": 7, + + "PSP_channel": 512, + "PSP_resblock_kernel_sizes": [3,7,11], + "PSP_resblock_dilation_sizes": 
[[1,3,5], [1,3,5], [1,3,5]], + "PSP_input_conv_kernel_size": 7, + "PSP_output_R_conv_kernel_size": 7, + "PSP_output_I_conv_kernel_size": 7, + } + }, + "train": { + "criterions": [ + "feature", + "discriminator", + "generator", + "mel", + "phase", + "amplitude", + "consistency" + ] + }, + "inference": { + "batch_size": 1, + } +} diff --git a/egs/vocoder/gan/apnet/run.sh b/egs/vocoder/gan/apnet/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/apnet/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. + --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + # [Only for Traiing] `main_process_port` for multi gpu training + --main_process_port) shift; main_process_port=$1 ; shift ;; + + # [Only for Inference] The inference mode + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inferenced datasets + --infer_datasets) shift; infer_datasets=$1 ; shift ;; + # [Only for Inference] The feature dir for inference + --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;; + # [Only for Inference] The audio dir for inference + --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;; + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. 
Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$main_process_port" ]; then + main_process_port=29500 +fi +echo "Main Process Port: $main_process_port" + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \ + --config $exp_config \ + --num_workers 8 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch \ + --main_process_port "$main_process_port" \ + "${work_dir}"/bins/vocoder/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --checkpoint "$checkpoint" \ + --resume_type "$resume_type" +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/egs/vocoder/gan/bigvgan/exp_config.json b/egs/vocoder/gan/bigvgan/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cbed8a4bcd082f8a4cfa368a372f35c7e1a94973 --- /dev/null +++ b/egs/vocoder/gan/bigvgan/exp_config.json @@ -0,0 +1,66 @@ +{ + "base_config": "egs/vocoder/gan/exp_config_base.json", + "preprocess": { + // acoustic features + "extract_mel": true, + "extract_audio": true, + + // Features used for model training + "use_mel": true, + "use_audio": true + }, + "model": { + "generator": "bigvgan", + "bigvgan": { + "resblock": "1", + "activation": "snakebeta", + "snake_logscale": true, + "upsample_rates": [ + 8, + 8, + 2, + 2, + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + 
] + ] + } + }, + "train": { + "criterions": [ + "feature", + "discriminator", + "generator", + "mel", + ] + }, + "inference": { + "batch_size": 1, + } +} diff --git a/egs/vocoder/gan/bigvgan/run.sh b/egs/vocoder/gan/bigvgan/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/bigvgan/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. + --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + # [Only for Traiing] `main_process_port` for multi gpu training + --main_process_port) shift; main_process_port=$1 ; shift ;; + + # [Only for Inference] The inference mode + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inferenced datasets + --infer_datasets) shift; infer_datasets=$1 ; shift ;; + # [Only for Inference] The feature dir for inference + --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;; + # [Only for Inference] The audio dir for inference + --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;; + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. 
Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$main_process_port" ]; then + main_process_port=29500 +fi +echo "Main Process Port: $main_process_port" + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \ + --config $exp_config \ + --num_workers 8 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch \ + --main_process_port "$main_process_port" \ + "${work_dir}"/bins/vocoder/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --checkpoint "$checkpoint" \ + --resume_type "$resume_type" +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/egs/vocoder/gan/bigvgan_large/exp_config.json b/egs/vocoder/gan/bigvgan_large/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a15dc3573839cd05085ac4540b651b98f94a1bf4 --- /dev/null +++ b/egs/vocoder/gan/bigvgan_large/exp_config.json @@ -0,0 +1,70 @@ +{ + "base_config": "egs/vocoder/gan/exp_config_base.json", + "preprocess": { + // acoustic features + "extract_mel": true, + "extract_audio": true, + + // Features used for model training + "use_mel": true, + "use_audio": true + }, + "model": { + "generator": "bigvgan", + "bigvgan": { + "resblock": "1", + "activation": "snakebeta", + "snake_logscale": true, + "upsample_rates": [ + 4, + 4, + 2, + 2, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 8, + 8, + 4, + 4, + 4, + 4 + ], + "upsample_initial_channel": 1536, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 
1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ] + }, + }, + "train": { + "criterions": [ + "feature", + "discriminator", + "generator", + "mel", + ] + }, + "inference": { + "batch_size": 1, + } +} diff --git a/egs/vocoder/gan/bigvgan_large/run.sh b/egs/vocoder/gan/bigvgan_large/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/bigvgan_large/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. + --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + # [Only for Traiing] `main_process_port` for multi gpu training + --main_process_port) shift; main_process_port=$1 ; shift ;; + + # [Only for Inference] The inference mode + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inferenced datasets + --infer_datasets) shift; infer_datasets=$1 ; shift ;; + # [Only for Inference] The feature dir for inference + --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;; + # [Only for Inference] The audio dir for inference + --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;; + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. 
Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$main_process_port" ]; then + main_process_port=29500 +fi +echo "Main Process Port: $main_process_port" + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \ + --config $exp_config \ + --num_workers 8 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch \ + --main_process_port "$main_process_port" \ + "${work_dir}"/bins/vocoder/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --checkpoint "$checkpoint" \ + --resume_type "$resume_type" +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/egs/vocoder/gan/exp_config_base.json b/egs/vocoder/gan/exp_config_base.json new file mode 100644 index 0000000000000000000000000000000000000000..84dbf7bcc0f5044c1e1f1d67b59cf26a338dff3a --- /dev/null +++ b/egs/vocoder/gan/exp_config_base.json @@ -0,0 +1,111 @@ +{ + "base_config": "config/vocoder.json", + "model_type": "GANVocoder", + // TODO: Choose your needed datasets + "dataset": [ + "csd", + "kising", + "m4singer", + "nus48e", + "opencpop", + "opensinger", + "opera", + "pjs", + "popbutfy", + "popcs", + "ljspeech", + "vctk", + "libritts", + ], + "dataset_path": { + // TODO: Fill in your dataset path + "csd": "[dataset path]", + "kising": "[dataset path]", + "m4singer": "[dataset path]", + "nus48e": "[dataset path]", + "opencpop": "[dataset path]", + "opensinger": "[dataset path]", + "opera": "[dataset path]", + "pjs": "[dataset path]", + "popbutfy": "[dataset path]", + "popcs": "[dataset path]", 
+ "ljspeech": "[dataset path]", + "vctk": "[dataset path]", + "libritts": "[dataset path]", + }, + // TODO: Fill in the output log path + "log_dir": "ckpts/vocoder", + "preprocess": { + // Acoustic features + "extract_mel": true, + "extract_audio": true, + "extract_pitch": false, + "extract_uv": false, + "pitch_extractor": "parselmouth", + + // Features used for model training + "use_mel": true, + "use_frame_pitch": false, + "use_uv": false, + "use_audio": true, + + // TODO: Fill in the output data path + "processed_dir": "data/", + "n_mel": 100, + "sample_rate": 24000 + }, + "model": { + // TODO: Choose your needed discriminators + "discriminators": [ + "msd", + "mpd", + "msstftd", + "mssbcqtd", + ], + "mpd": { + "mpd_reshapes": [ + 2, + 3, + 5, + 7, + 11 + ], + "use_spectral_norm": false, + "discriminator_channel_mult_factor": 1 + }, + "mrd": { + "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]], + "use_spectral_norm": false, + "discriminator_channel_mult_factor": 1, + "mrd_override": false + }, + "msstftd": { + "filters": 32 + }, + "mssbcqtd": { + hop_lengths: [512, 256, 256], + filters: 32, + max_filters: 1024, + filters_scale: 1, + dilations: [1, 2, 4], + in_channels: 1, + out_channels: 1, + n_octaves: [9, 9, 9], + bins_per_octaves: [24, 36, 48] + }, + }, + "train": { + // TODO: Choose a suitable batch size, training epoch, and save stride + "batch_size": 32, + "max_epoch": 1000000, + "save_checkpoint_stride": [20], + "adamw": { + "lr": 2.0e-4, + "adam_b1": 0.8, + "adam_b2": 0.99 + }, + "exponential_lr": { + "lr_decay": 0.999 + }, + } +} \ No newline at end of file diff --git a/egs/vocoder/gan/hifigan/exp_config.json b/egs/vocoder/gan/hifigan/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b06712eac472a8452d7cfbfcf8bdf9bf3b232514 --- /dev/null +++ b/egs/vocoder/gan/hifigan/exp_config.json @@ -0,0 +1,59 @@ +{ + "base_config": "egs/vocoder/gan/exp_config_base.json", + "preprocess": { + // acoustic features + "extract_mel": true, + "extract_audio": true, + + // Features used for model training + "use_mel": true, + "use_audio": true + }, + "model": { + "generator": "hifigan", + "hifigan": { + "resblock": "2", + "upsample_rates": [ + 8, + 8, + 4 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 8 + ], + "upsample_initial_channel": 256, + "resblock_kernel_sizes": [ + 3, + 5, + 7 + ], + "resblock_dilation_sizes": [ + [ + 1, + 2 + ], + [ + 2, + 6 + ], + [ + 3, + 12 + ] + ] + } + }, + "train": { + "criterions": [ + "feature", + "discriminator", + "generator", + "mel", + ] + }, + "inference": { + "batch_size": 1, + } +} diff --git a/egs/vocoder/gan/hifigan/run.sh b/egs/vocoder/gan/hifigan/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/hifigan/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
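+
+# Example usage (mirrors egs/vocoder/gan/README.md; [YourExptName] is a placeholder):
+#   sh egs/vocoder/gan/hifigan/run.sh --stage 1                          # feature extraction
+#   sh egs/vocoder/gan/hifigan/run.sh --stage 2 --name [YourExptName]    # training
+#   sh egs/vocoder/gan/hifigan/run.sh --stage 3 \
+#       --infer_mode infer_from_dataset --infer_datasets "ljspeech" \
+#       --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
+#       --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result   # inference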
+ +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. + --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + # [Only for Traiing] `main_process_port` for multi gpu training + --main_process_port) shift; main_process_port=$1 ; shift ;; + + # [Only for Inference] The inference mode + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inferenced datasets + --infer_datasets) shift; infer_datasets=$1 ; shift ;; + # [Only for Inference] The feature dir for inference + --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;; + # [Only for Inference] The audio dir for inference + --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;; + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$main_process_port" ]; then + main_process_port=29500 +fi +echo "Main Process Port: $main_process_port" + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \ + --config $exp_config \ + --num_workers 8 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch \ + --main_process_port "$main_process_port" \ + "${work_dir}"/bins/vocoder/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --checkpoint "$checkpoint" \ + --resume_type "$resume_type" +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. 
The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/egs/vocoder/gan/melgan/exp_config.json b/egs/vocoder/gan/melgan/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9470600bc0d15afd07107f816995e8c67ad52d9a --- /dev/null +++ b/egs/vocoder/gan/melgan/exp_config.json @@ -0,0 +1,34 @@ +{ + "base_config": "egs/vocoder/gan/exp_config_base.json", + "preprocess": { + // acoustic features + "extract_mel": true, + "extract_audio": true, + + // Features used for model training + "use_mel": true, + "use_audio": true + }, + "model": { + "generator": "melgan", + "melgan": { + "ratios": [8, 8, 2, 2], + "ngf": 32, + "n_residual_layers": 3, + "num_D": 3, + "ndf": 16, + "n_layers": 4, + "downsampling_factor": 4 + }, + }, + "train": { + "criterions": [ + "feature", + "discriminator", + "generator", + ] + }, + "inference": { + "batch_size": 1, + } +} diff --git a/egs/vocoder/gan/melgan/run.sh b/egs/vocoder/gan/melgan/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/melgan/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. 
+    --checkpoint) shift; checkpoint=$1 ; shift ;;
+    # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
+    --resume_type) shift; resume_type=$1 ; shift ;;
+    # [Only for Training] `main_process_port` for multi-GPU training
+    --main_process_port) shift; main_process_port=$1 ; shift ;;
+
+    # [Only for Inference] The inference mode
+    --infer_mode) shift; infer_mode=$1 ; shift ;;
+    # [Only for Inference] The datasets used for inference
+    --infer_datasets) shift; infer_datasets=$1 ; shift ;;
+    # [Only for Inference] The feature dir for inference
+    --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
+    # [Only for Inference] The audio dir for inference
+    --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
+    # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
+    --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
+    # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
+    --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
+
+    --) shift ; break ;;
+    *) echo "Invalid option: $1"; exit 1 ;;
+    esac
+done
+
+
+### Value check ###
+if [ -z "$running_stage" ]; then
+    echo "[Error] Please specify the running stage"
+    exit 1
+fi
+
+if [ -z "$exp_config" ]; then
+    exp_config="${exp_dir}"/exp_config.json
+fi
+echo "Experimental Configuration File: $exp_config"
+
+if [ -z "$gpu" ]; then
+    gpu="0"
+fi
+
+if [ -z "$main_process_port" ]; then
+    main_process_port=29500
+fi
+echo "Main Process Port: $main_process_port"
+
+######## Features Extraction ###########
+if [ $running_stage -eq 1 ]; then
+    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
+        --config $exp_config \
+        --num_workers 8
+fi
+
+######## Training ###########
+if [ $running_stage -eq 2 ]; then
+    if [ -z "$exp_name" ]; then
+        echo "[Error] Please specify the experiment name"
+        exit 1
+    fi
+    echo "Experimental Name: $exp_name"
+
+    CUDA_VISIBLE_DEVICES=$gpu accelerate launch \
+        --main_process_port "$main_process_port" \
+        "${work_dir}"/bins/vocoder/train.py \
+        --config "$exp_config" \
+        --exp_name "$exp_name" \
+        --log_level info \
+        --checkpoint "$checkpoint" \
+        --resume_type "$resume_type"
+fi
+
+######## Inference/Conversion ###########
+if [ $running_stage -eq 3 ]; then
+    if [ -z "$infer_expt_dir" ]; then
+        echo "[Error] Please specify the experimental directory.
The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/egs/vocoder/gan/nsfhifigan/exp_config.json b/egs/vocoder/gan/nsfhifigan/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ead9dc8909b6a5283dd12b721cfb4519cc3953ca --- /dev/null +++ b/egs/vocoder/gan/nsfhifigan/exp_config.json @@ -0,0 +1,83 @@ +{ + "base_config": "egs/vocoder/gan/exp_config_base.json", + "preprocess": { + // acoustic features + "extract_mel": true, + "extract_audio": true, + "extract_pitch": true, + + // Features used for model training + "use_mel": true, + "use_audio": true, + "use_frame_pitch": true + }, + "model": { + "generator": "nsfhifigan", + "nsfhifigan": { + "resblock": "1", + "harmonic_num": 8, + "upsample_rates": [ + 8, + 4, + 2, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 8, + 4, + 4, + 4 + ], + "upsample_initial_channel": 768, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ] + }, + "mpd": { + "mpd_reshapes": [ + 2, + 3, + 5, + 7, + 11, + 17, + 23, + 37 + ], + "use_spectral_norm": false, + "discriminator_channel_multi": 1 + } + }, + "train": { + "criterions": [ + "feature", + "discriminator", + "generator", + "mel", + ] + }, + "inference": { + "batch_size": 1, + } +} diff --git a/egs/vocoder/gan/nsfhifigan/run.sh b/egs/vocoder/gan/nsfhifigan/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/nsfhifigan/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+
+######## Build Experiment Environment ###########
+exp_dir=$(cd `dirname $0`; pwd)
+work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
+
+export WORK_DIR=$work_dir
+export PYTHONPATH=$work_dir
+export PYTHONIOENCODING=UTF-8
+
+######## Parse the Given Parameters from the Command ###########
+options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
+eval set -- "$options"
+
+while true; do
+    case $1 in
+    # Experimental Configuration File
+    -c | --config) shift; exp_config=$1 ; shift ;;
+    # Experimental Name
+    -n | --name) shift; exp_name=$1 ; shift ;;
+    # Running Stage
+    -s | --stage) shift; running_stage=$1 ; shift ;;
+    # Visible GPU machines. The default value is "0".
+    --gpu) shift; gpu=$1 ; shift ;;
+
+    # [Only for Training] The specific checkpoint path that you want to resume from.
+    --checkpoint) shift; checkpoint=$1 ; shift ;;
+    # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
+    --resume_type) shift; resume_type=$1 ; shift ;;
+    # [Only for Training] `main_process_port` for multi-GPU training
+    --main_process_port) shift; main_process_port=$1 ; shift ;;
+
+    # [Only for Inference] The inference mode
+    --infer_mode) shift; infer_mode=$1 ; shift ;;
+    # [Only for Inference] The datasets used for inference
+    --infer_datasets) shift; infer_datasets=$1 ; shift ;;
+    # [Only for Inference] The feature dir for inference
+    --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
+    # [Only for Inference] The audio dir for inference
+    --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
+    # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
+    --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
+    # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
+    --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
+
+    --) shift ; break ;;
+    *) echo "Invalid option: $1"; exit 1 ;;
+    esac
+done
+
+
+### Value check ###
+if [ -z "$running_stage" ]; then
+    echo "[Error] Please specify the running stage"
+    exit 1
+fi
+
+if [ -z "$exp_config" ]; then
+    exp_config="${exp_dir}"/exp_config.json
+fi
+echo "Experimental Configuration File: $exp_config"
+
+if [ -z "$gpu" ]; then
+    gpu="0"
+fi
+
+if [ -z "$main_process_port" ]; then
+    main_process_port=29500
+fi
+echo "Main Process Port: $main_process_port"
+
+######## Features Extraction ###########
+if [ $running_stage -eq 1 ]; then
+    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
+        --config $exp_config \
+        --num_workers 8
+fi
+
+######## Training ###########
+if [ $running_stage -eq 2 ]; then
+    if [ -z "$exp_name" ]; then
+        echo "[Error] Please specify the experiment name"
+        exit 1
+    fi
+    echo "Experimental Name: $exp_name"
+
+    CUDA_VISIBLE_DEVICES=$gpu accelerate launch \
+        --main_process_port "$main_process_port" \
+        "${work_dir}"/bins/vocoder/train.py \
+        --config "$exp_config" \
+        --exp_name "$exp_name" \
+        --log_level info \
+        --checkpoint "$checkpoint" \
+        --resume_type "$resume_type"
+fi
+
+######## Inference/Conversion ###########
+if [ $running_stage -eq 3 ]; then
+    if [ -z "$infer_expt_dir" ]; then
+        echo "[Error] Please specify the experimental directory.
The value is like [Your path to save logs and checkpoints]/[YourExptName]"
+        exit 1
+    fi
+
+    if [ -z "$infer_output_dir" ]; then
+        infer_output_dir="$infer_expt_dir/result"
+    fi
+
+    if [ $infer_mode = "infer_from_dataset" ]; then
+        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
+            --config $exp_config \
+            --infer_mode $infer_mode \
+            --infer_datasets $infer_datasets \
+            --vocoder_dir $infer_expt_dir \
+            --output_dir $infer_output_dir \
+            --log_level debug
+    fi
+
+    if [ $infer_mode = "infer_from_feature" ]; then
+        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
+            --config $exp_config \
+            --infer_mode $infer_mode \
+            --feature_folder $infer_feature_dir \
+            --vocoder_dir $infer_expt_dir \
+            --output_dir $infer_output_dir \
+            --log_level debug
+    fi
+
+    if [ $infer_mode = "infer_from_audio" ]; then
+        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
+            --config $exp_config \
+            --infer_mode $infer_mode \
+            --audio_folder $infer_audio_dir \
+            --vocoder_dir $infer_expt_dir \
+            --output_dir $infer_output_dir \
+            --log_level debug
+    fi
+
+fi
\ No newline at end of file
diff --git a/egs/vocoder/gan/tfr_enhanced_hifigan/README.md b/egs/vocoder/gan/tfr_enhanced_hifigan/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..363a47603a87304e308bef0e66198ab69a20d92a
--- /dev/null
+++ b/egs/vocoder/gan/tfr_enhanced_hifigan/README.md
@@ -0,0 +1,196 @@
+# Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder
+
+[![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2311.14957)
+[![demo](https://img.shields.io/badge/Vocoder-Demo-red)](https://vocodexelysium.github.io/MS-SB-CQTD/)
+
+
+ +
+
+
+This is the official implementation of the paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". In this recipe, we will illustrate how to train a high-quality HiFi-GAN on LibriTTS, VCTK, and LJSpeech by utilizing multiple Time-Frequency-Representation-based discriminators.
+
+There are four stages in total:
+
+1. Data preparation
+2. Feature extraction
+3. Training
+4. Inference
+
+> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
+> ```bash
+> cd Amphion
+> ```
+
+## 1. Data Preparation
+
+### Dataset Download
+
+By default, we utilize three datasets for training: LibriTTS, VCTK, and LJSpeech. How to download them is detailed [here](../../../datasets/README.md).
+
+### Configuration
+
+Specify the dataset path in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
+
+```json
+"dataset": [
+    "ljspeech",
+    "vctk",
+    "libritts",
+],
+"dataset_path": {
+    // TODO: Fill in your dataset path
+    "ljspeech": "[LJSpeech dataset path]",
+    "vctk": "[VCTK dataset path]",
+    "libritts": "[LibriTTS dataset path]",
+},
+```
+
+## 2. Feature Extraction
+
+For HiFiGAN, only the mel-spectrogram and the output audio are needed for training.
+
+### Configuration
+
+Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
+
+```json
+    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
+    "log_dir": "ckpts/vocoder",
+    "preprocess": {
+        // TODO: Fill in the output data path. The default value is "Amphion/data"
+        "processed_dir": "data",
+        ...
+    },
+```
+
+### Run
+
+Run the `run.sh` as the preprocess stage (set `--stage 1`).
+
+```bash
+sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 1
+```
+
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
+
+## 3. Training
+
+### Configuration
+
+We provide the default hyperparameters in `exp_config.json`. They work on a single 24GB NVIDIA GPU. You can adjust them based on your GPU machines.
+
+```json
+"train": {
+    "batch_size": 32,
+    ...
+}
+```
+
+### Run
+
+Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`.
+
+```bash
+sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 2 --name [YourExptName]
+```
+
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
+
+If you want to resume or finetune from a pretrained model, run:
+
+```bash
+sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 2 \
+    --name [YourExptName] \
+    --resume_type ["resume" for resuming training and "finetune" for loading parameters only] \
+    --checkpoint Amphion/ckpts/vocoder/[YourExptName]/checkpoint \
+```
+
+> **NOTE:** For multi-GPU training, the `main_process_port` is set as `29500` by default. You can change it when running `run.sh` by specifying, for example, `--main_process_port 29501`.
+
+## 4. Inference
+
+### Pretrained Vocoder Download
+
+We trained a HiFiGAN checkpoint on around 685 hours of speech data. The final pretrained checkpoint is released [here](../../../../pretrained/hifigan/README.md).
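+
+For example, assuming the downloaded files are placed under `ckpts/vocoder/pretrained_tfr_hifigan` (a hypothetical location; follow the pretrained vocoder README above for the actual layout), the checkpoint can either be used as a starting point for finetuning or be passed as the experiment directory for the inference commands below:
+
+```bash
+# Finetune from the pretrained weights (loads model parameters only)
+sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 2 \
+    --name [YourFinetuneExptName] \
+    --resume_type finetune \
+    --checkpoint ckpts/vocoder/pretrained_tfr_hifigan/checkpoint   # hypothetical download location
+
+# Or run inference directly with the pretrained experiment directory
+sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
+    --infer_mode infer_from_audio \
+    --infer_audio_dir [Your path to your audio files] \
+    --infer_expt_dir ckpts/vocoder/pretrained_tfr_hifigan
+```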
+
+### Run
+
+Run the `run.sh` as the inference stage (set `--stage 3`). We provide three different inference modes: `infer_from_dataset`, `infer_from_feature`, and `infer_from_audio`.
+
+```bash
+sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
+    --infer_mode [Your chosen inference mode] \
+    --infer_datasets [Datasets you want to run inference on, needed when infer_from_dataset] \
+    --infer_feature_dir [Your path to your predicted acoustic features, needed when infer_from_feature] \
+    --infer_audio_dir [Your path to your audio files, needed when infer_from_audio] \
+    --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
+    --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
+```
+
+#### a. Inference from Dataset
+
+Run the `run.sh` with the specified datasets; here is an example.
+
+```bash
+sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
+    --infer_mode infer_from_dataset \
+    --infer_datasets "libritts vctk ljspeech" \
+    --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
+    --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
+```
+
+#### b. Inference from Features
+
+If you want to run inference from your generated acoustic features, you should first organize them into the following structure:
+
+```plaintext
+ ┣ {infer_feature_dir}
+ ┃ ┣ mels
+ ┃ ┃ ┣ sample1.npy
+ ┃ ┃ ┣ sample2.npy
+```
+
+Then run the `run.sh` with the specified folder path; here is an example.
+
+```bash
+sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
+    --infer_mode infer_from_feature \
+    --infer_feature_dir [Your path to your predicted acoustic features] \
+    --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
+    --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
+```
+
+#### c. Inference from Audios
+
+If you want to run inference from audio files for quick analysis-synthesis, you should first organize them into the following structure:
+
+```plaintext
+ ┣ audios
+ ┃ ┣ sample1.wav
+ ┃ ┣ sample2.wav
+```
+
+Then run the `run.sh` with the specified folder path; here is an example.
+
+```bash
+sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
+    --infer_mode infer_from_audio \
+    --infer_audio_dir [Your path to your audio files] \
+    --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
+    --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
+```
+
+## Citations
+
+```bibtex
+@misc{gu2023cqt,
+    title={Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder},
+    author={Yicheng Gu and Xueyao Zhang and Liumeng Xue and Zhizheng Wu},
+    year={2023},
+    eprint={2311.14957},
+    archivePrefix={arXiv},
+    primaryClass={cs.SD}
+}
+```
\ No newline at end of file
diff --git a/egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json b/egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..854c575aace19e0da5991e2a29321fb8c07a76ca
--- /dev/null
+++ b/egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json
@@ -0,0 +1,118 @@
+{
+    "base_config": "egs/vocoder/gan/exp_config_base.json",
+    "model_type": "GANVocoder",
+    "dataset": [
+        "ljspeech",
+        "vctk",
+        "libritts",
+    ],
+    "dataset_path": {
+        // TODO: Fill in your dataset path
+        "ljspeech": "[dataset path]",
+        "vctk": "[dataset path]",
+        "libritts": "[dataset path]",
+    },
+    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
+    "log_dir": "ckpts/vocoder",
+    "preprocess": {
+        // TODO: Fill in the output data path.
The default value is "Amphion/data" + "processed_dir": "data", + // acoustic features + "extract_mel": true, + "extract_audio": true, + "extract_pitch": false, + "extract_uv": false, + "extract_amplitude_phase": false, + "pitch_extractor": "parselmouth", + // Features used for model training + "use_mel": true, + "use_frame_pitch": false, + "use_uv": false, + "use_audio": true, + "n_mel": 100, + "sample_rate": 24000 + }, + "model": { + "generator": "hifigan", + "discriminators": [ + "msd", + "mpd", + "mssbcqtd", + "msstftd", + ], + "hifigan": { + "resblock": "1", + "upsample_rates": [ + 8, + 4, + 2, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 8, + 4, + 4, + 4 + ], + "upsample_initial_channel": 768, + "resblock_kernel_sizes": [ + 3, + 5, + 7 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ] + }, + "mpd": { + "mpd_reshapes": [ + 2, + 3, + 5, + 7, + 11, + 17, + 23, + 37 + ], + "use_spectral_norm": false, + "discriminator_channel_multi": 1 + } + }, + "train": { + "batch_size": 32, + "adamw": { + "lr": 2.0e-4, + "adam_b1": 0.8, + "adam_b2": 0.99 + }, + "exponential_lr": { + "lr_decay": 0.999 + }, + "criterions": [ + "feature", + "discriminator", + "generator", + "mel", + ] + }, + "inference": { + "batch_size": 1, + } +} \ No newline at end of file diff --git a/egs/vocoder/gan/tfr_enhanced_hifigan/run.sh b/egs/vocoder/gan/tfr_enhanced_hifigan/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/tfr_enhanced_hifigan/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. + --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. 
+    --resume_type) shift; resume_type=$1 ; shift ;;
+    # [Only for Training] `main_process_port` for multi-GPU training
+    --main_process_port) shift; main_process_port=$1 ; shift ;;
+
+    # [Only for Inference] The inference mode
+    --infer_mode) shift; infer_mode=$1 ; shift ;;
+    # [Only for Inference] The datasets used for inference
+    --infer_datasets) shift; infer_datasets=$1 ; shift ;;
+    # [Only for Inference] The feature dir for inference
+    --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
+    # [Only for Inference] The audio dir for inference
+    --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
+    # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
+    --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
+    # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
+    --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
+
+    --) shift ; break ;;
+    *) echo "Invalid option: $1"; exit 1 ;;
+    esac
+done
+
+
+### Value check ###
+if [ -z "$running_stage" ]; then
+    echo "[Error] Please specify the running stage"
+    exit 1
+fi
+
+if [ -z "$exp_config" ]; then
+    exp_config="${exp_dir}"/exp_config.json
+fi
+echo "Experimental Configuration File: $exp_config"
+
+if [ -z "$gpu" ]; then
+    gpu="0"
+fi
+
+if [ -z "$main_process_port" ]; then
+    main_process_port=29500
+fi
+echo "Main Process Port: $main_process_port"
+
+######## Features Extraction ###########
+if [ $running_stage -eq 1 ]; then
+    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
+        --config $exp_config \
+        --num_workers 8
+fi
+
+######## Training ###########
+if [ $running_stage -eq 2 ]; then
+    if [ -z "$exp_name" ]; then
+        echo "[Error] Please specify the experiment name"
+        exit 1
+    fi
+    echo "Experimental Name: $exp_name"
+
+    CUDA_VISIBLE_DEVICES=$gpu accelerate launch \
+        --main_process_port "$main_process_port" \
+        "${work_dir}"/bins/vocoder/train.py \
+        --config "$exp_config" \
+        --exp_name "$exp_name" \
+        --log_level info \
+        --checkpoint "$checkpoint" \
+        --resume_type "$resume_type"
+fi
+
+######## Inference/Conversion ###########
+if [ $running_stage -eq 3 ]; then
+    if [ -z "$infer_expt_dir" ]; then
+        echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
+        exit 1
+    fi
+
+    if [ -z "$infer_output_dir" ]; then
+        infer_output_dir="$infer_expt_dir/result"
+    fi
+
+    if [ $infer_mode = "infer_from_dataset" ]; then
+        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
+            --config $exp_config \
+            --infer_mode $infer_mode \
+            --infer_datasets $infer_datasets \
+            --vocoder_dir $infer_expt_dir \
+            --output_dir $infer_output_dir \
+            --log_level debug
+    fi
+
+    if [ $infer_mode = "infer_from_feature" ]; then
+        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
+            --config $exp_config \
+            --infer_mode $infer_mode \
+            --feature_folder $infer_feature_dir \
+            --vocoder_dir $infer_expt_dir \
+            --output_dir $infer_output_dir \
+            --log_level debug
+    fi
+
+    if [ $infer_mode = "infer_from_audio" ]; then
+        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
+            --config $exp_config \
+            --infer_mode $infer_mode \
+            --audio_folder $infer_audio_dir \
+            --vocoder_dir $infer_expt_dir \
+            --output_dir $infer_output_dir \
+            --log_level debug
+    fi
+
+fi
\ No newline at end of file
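
All four vocoder recipes above share the same three-stage `run.sh` interface. As a rough usage sketch (the experiment name and paths are placeholders, not values defined by these files), the TFR-enhanced HiFi-GAN recipe is driven end to end like this:

```bash
# Stage 1: feature extraction (mel-spectrograms and audio), controlled by exp_config.json
sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 1

# Stage 2: training on GPU 0; logs and checkpoints are written under ckpts/vocoder/[YourExptName]
sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 2 --name [YourExptName] --gpu "0"

# Stage 3: analysis-synthesis from a folder of audio files with the trained vocoder
sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
    --infer_mode infer_from_audio \
    --infer_audio_dir [Your path to your audio files] \
    --infer_expt_dir ckpts/vocoder/[YourExptName]
```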