akhaliq3 committed
Commit 2b7bf83
1 Parent(s): d25d456

spaces demo

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. ParallelWaveGAN/.github/FUNDING.yml +1 -0
  2. ParallelWaveGAN/.github/workflows/ci.yaml +97 -0
  3. ParallelWaveGAN/.gitignore +36 -0
  4. ParallelWaveGAN/LICENSE +21 -0
  5. ParallelWaveGAN/egs/README.md +165 -0
  6. ParallelWaveGAN/egs/arctic/voc1/cmd.sh +91 -0
  7. ParallelWaveGAN/egs/arctic/voc1/conf/parallel_wavegan.v1.yaml +122 -0
  8. ParallelWaveGAN/egs/arctic/voc1/conf/slurm.conf +12 -0
  9. ParallelWaveGAN/egs/arctic/voc1/local/data_download.sh +40 -0
  10. ParallelWaveGAN/egs/arctic/voc1/local/data_prep.sh +113 -0
  11. ParallelWaveGAN/egs/arctic/voc1/path.sh +33 -0
  12. ParallelWaveGAN/egs/arctic/voc1/run.sh +167 -0
  13. ParallelWaveGAN/egs/arctic/voc1/utils +1 -0
  14. ParallelWaveGAN/egs/csmsc/voc1/cmd.sh +91 -0
  15. ParallelWaveGAN/egs/csmsc/voc1/conf/hifigan.v1.yaml +180 -0
  16. ParallelWaveGAN/egs/csmsc/voc1/conf/multi_band_melgan.v2.yaml +150 -0
  17. ParallelWaveGAN/egs/csmsc/voc1/conf/parallel_wavegan.v1.yaml +122 -0
  18. ParallelWaveGAN/egs/csmsc/voc1/conf/slurm.conf +12 -0
  19. ParallelWaveGAN/egs/csmsc/voc1/conf/style_melgan.v1.yaml +147 -0
  20. ParallelWaveGAN/egs/csmsc/voc1/local/data_download.sh +32 -0
  21. ParallelWaveGAN/egs/csmsc/voc1/local/data_prep.sh +94 -0
  22. ParallelWaveGAN/egs/csmsc/voc1/path.sh +33 -0
  23. ParallelWaveGAN/egs/csmsc/voc1/run.sh +164 -0
  24. ParallelWaveGAN/egs/csmsc/voc1/utils +1 -0
  25. ParallelWaveGAN/egs/jnas/voc1/cmd.sh +91 -0
  26. ParallelWaveGAN/egs/jnas/voc1/conf/parallel_wavegan.v1.long.yaml +123 -0
  27. ParallelWaveGAN/egs/jnas/voc1/conf/parallel_wavegan.v1.yaml +122 -0
  28. ParallelWaveGAN/egs/jnas/voc1/conf/slurm.conf +12 -0
  29. ParallelWaveGAN/egs/jnas/voc1/conf/train_speakers.txt +261 -0
  30. ParallelWaveGAN/egs/jnas/voc1/local/data_prep.sh +89 -0
  31. ParallelWaveGAN/egs/jnas/voc1/path.sh +33 -0
  32. ParallelWaveGAN/egs/jnas/voc1/run.sh +158 -0
  33. ParallelWaveGAN/egs/jnas/voc1/utils +1 -0
  34. ParallelWaveGAN/egs/jsss/voc1/cmd.sh +91 -0
  35. ParallelWaveGAN/egs/jsss/voc1/conf/parallel_wavegan.v1.yaml +122 -0
  36. ParallelWaveGAN/egs/jsss/voc1/conf/slurm.conf +12 -0
  37. ParallelWaveGAN/egs/jsss/voc1/local/data_download.sh +41 -0
  38. ParallelWaveGAN/egs/jsss/voc1/local/data_prep.sh +180 -0
  39. ParallelWaveGAN/egs/jsss/voc1/path.sh +33 -0
  40. ParallelWaveGAN/egs/jsss/voc1/run.sh +186 -0
  41. ParallelWaveGAN/egs/jsss/voc1/utils +1 -0
  42. ParallelWaveGAN/egs/jsut/voc1/cmd.sh +91 -0
  43. ParallelWaveGAN/egs/jsut/voc1/conf/hifigan.v1.yaml +180 -0
  44. ParallelWaveGAN/egs/jsut/voc1/conf/multi_band_melgan.v2.yaml +150 -0
  45. ParallelWaveGAN/egs/jsut/voc1/conf/parallel_wavegan.v1.yaml +122 -0
  46. ParallelWaveGAN/egs/jsut/voc1/conf/slurm.conf +12 -0
  47. ParallelWaveGAN/egs/jsut/voc1/conf/style_melgan.v1.yaml +147 -0
  48. ParallelWaveGAN/egs/jsut/voc1/local/data_download.sh +39 -0
  49. ParallelWaveGAN/egs/jsut/voc1/local/data_prep.sh +93 -0
  50. ParallelWaveGAN/egs/jsut/voc1/path.sh +33 -0
ParallelWaveGAN/.github/FUNDING.yml ADDED
@@ -0,0 +1 @@
1
+ github: kan-bayashi
ParallelWaveGAN/.github/workflows/ci.yaml ADDED
@@ -0,0 +1,97 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - master
7
+ pull_request:
8
+ branches:
9
+ - master
10
+ schedule:
11
+ - cron: 0 0 * * 1
12
+
13
+ jobs:
14
+ linter_and_test:
15
+ runs-on: ubuntu-20.04
16
+ strategy:
17
+ max-parallel: 5
18
+ matrix:
19
+ python-version: [3.6]
20
+ # 1.6 fails on CPU: https://github.com/kan-bayashi/ParallelWaveGAN/issues/198
21
+ pytorch-version: [1.4, 1.5.1, 1.7.1, 1.8.1, 1.9]
22
+ steps:
23
+ - uses: actions/checkout@master
24
+ - uses: actions/setup-python@v2
25
+ with:
26
+ python-version: ${{ matrix.python-version }}
27
+ architecture: 'x64'
28
+ - uses: actions/cache@v2
29
+ with:
30
+ path: ~/.cache/pip
31
+ key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-${{ hashFiles('**/setup.py') }}
32
+ restore-keys: |
33
+ ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-
34
+ - name: Install dependencies
35
+ run: |
36
+ sudo apt-get install libsndfile-dev
37
+ # make python env
38
+ cd tools; make CUDA_VERSION="" PYTHON=python${{ matrix.python-version }} PYTORCH_VERSION=${{ matrix.pytorch-version }}
39
+ # install shellcheck
40
+ wget https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz
41
+ tar -xvf shellcheck-stable.linux.x86_64.tar.xz
42
+ - name: ShellCheck
43
+ run: |
44
+ export PATH=shellcheck-stable:$PATH
45
+ find egs -name "*.sh" | grep -v path.sh | while read line; do shellcheck -x --shell=bash -P $(dirname $line) ${line}; done
46
+ - name: Black & Flake8
47
+ run: |
48
+ source tools/venv/bin/activate
49
+ black --diff parallel_wavegan
50
+ flake8 parallel_wavegan
51
+ flake8 --extend-ignore=D test
52
+ - name: Pytest
53
+ run: |
54
+ source tools/venv/bin/activate
55
+ pytest test
56
+
57
+ integration:
58
+ runs-on: ubuntu-20.04
59
+ strategy:
60
+ max-parallel: 10
61
+ matrix:
62
+ python-version: [3.7]
63
+ pytorch-version: [1.9]
64
+ config:
65
+ - "parallel_wavegan.v1.debug.yaml"
66
+ - "melgan.v1.debug.yaml"
67
+ - "melgan.v3.debug.yaml"
68
+ - "multi_band_melgan.v1.debug.yaml"
69
+ - "parallel_wavegan.v1.debug.npy.yaml"
70
+ - "parallel_wavegan.v1.debug.diff_fs.yaml"
71
+ - "hifigan.v1.debug.yaml"
72
+ - "style_melgan.v1.debug.yaml"
73
+ steps:
74
+ - uses: actions/checkout@master
75
+ - uses: actions/setup-python@v2
76
+ with:
77
+ python-version: ${{ matrix.python-version }}
78
+ architecture: 'x64'
79
+ - uses: actions/cache@v2
80
+ with:
81
+ path: ~/.cache/pip
82
+ key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-${{ hashFiles('**/setup.py') }}
83
+ restore-keys: |
84
+ ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-
85
+ - name: Install dependencies
86
+ run: |
87
+ sudo apt-get install libsndfile-dev jq
88
+ # make python env
89
+ cd tools; make CUDA_VERSION="" PYTHON=python${{ matrix.python-version }} PYTORCH_VERSION=${{ matrix.pytorch-version }}
90
+ - name: Integration
91
+ run: |
92
+ cd egs/yesno/voc1 && ./run.sh --conf conf/${{ matrix.config }}
93
+ - uses: actions/upload-artifact@v1
94
+ if: failure()
95
+ with:
96
+ name: artifacts-${{ matrix.config }}
97
+ path: egs/yesno/voc1
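
The CI jobs above build a CPU-only environment through the tools Makefile; a minimal local sketch of the same step, assuming the Makefile variables used in the workflow (CUDA_VERSION, PYTHON, PYTORCH_VERSION) behave as they do in CI:

```bash
# Build a CPU-only virtualenv with a pinned PyTorch, mirroring the "Install dependencies" step above.
sudo apt-get install libsndfile-dev
cd tools
make CUDA_VERSION="" PYTHON=python3.7 PYTORCH_VERSION=1.9
```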
ParallelWaveGAN/.gitignore ADDED
@@ -0,0 +1,36 @@
1
+ # general
2
+ *~
3
+ *.pyc
4
+ \#*\#
5
+ .\#*
6
+ *DS_Store
7
+ out.txt
8
+ parallel_wavegan.egg-info/
9
+ doc/_build
10
+ slurm-*.out
11
+ tmp*
12
+ .eggs/
13
+ .hypothesis/
14
+ .idea
15
+ .backup/
16
+ .pytest_cache/
17
+ __pycache__/
18
+ .coverage*
19
+ coverage.xml*
20
+ .vscode*
21
+ .nfs*
22
+ .ipynb_checkpoints
23
+ .d000*
24
+ *.out
25
+ *.err
26
+
27
+ # recipe related
28
+ egs/*/*/data
29
+ egs/*/*/downloads
30
+ egs/*/*/dump
31
+ egs/*/*/exp
32
+ egs/*/*/conf/tuning
33
+
34
+ # tools related
35
+ tools/venv/
36
+ tools/apex/
ParallelWaveGAN/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2020 Tomoki Hayashi <hayashi.tomoki@g.sp.m.is.nagoya-u.ac.jp>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
ParallelWaveGAN/egs/README.md ADDED
@@ -0,0 +1,165 @@
1
+ # Kaldi-style all-in-one recipes
2
+
3
+ This repository provides [Kaldi](https://github.com/kaldi-asr/kaldi)-style recipes, in the same manner as [ESPnet](https://github.com/espnet/espnet).
4
+ Currently, the following recipes are supported.
5
+
6
+ - [LJSpeech](https://keithito.com/LJ-Speech-Dataset/): English female speaker
7
+ - [JSUT](https://sites.google.com/site/shinnosuketakamichi/publication/jsut): Japanese female speaker
8
+ - [JSSS](https://sites.google.com/site/shinnosuketakamichi/research-topics/jsss_corpus): Japanese female speaker
9
+ - [CSMSC](https://www.data-baker.com/open_source.html): Mandarin female speaker
10
+ - [CMU Arctic](http://www.festvox.org/cmu_arctic/): English speakers
11
+ - [JNAS](http://research.nii.ac.jp/src/en/JNAS.html): Japanese multi-speaker
12
+ - [VCTK](https://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html): English multi-speaker
13
+ - [LibriTTS](https://arxiv.org/abs/1904.02882): English multi-speaker
14
+ - [YesNo](https://arxiv.org/abs/1904.02882): English speaker (For debugging)
15
+
16
+
17
+ ## How to run the recipe
18
+
19
+ ```bash
20
+ # Let us move to the recipe directory
21
+ $ cd egs/ljspeech/voc1
22
+
23
+ # Run the recipe from scratch
24
+ $ ./run.sh
25
+
26
+ # You can change config via command line
27
+ $ ./run.sh --conf <your_customized_yaml_config>
28
+
29
+ # You can select the stage to start and stop
30
+ $ ./run.sh --stage 2 --stop_stage 2
31
+
32
+ # If you want to specify the gpu
33
+ $ CUDA_VISIBLE_DEVICES=1 ./run.sh --stage 2
34
+
35
+ # If you want to resume training from 10000 steps checkpoint
36
+ $ ./run.sh --stage 2 --resume <path>/<to>/checkpoint-10000steps.pkl
37
+ ```
38
+
39
+ You can check the command line options in `run.sh`.
40
+
41
+ The integration with job schedulers such as [slurm](https://slurm.schedmd.com/documentation.html) can be done via `cmd.sh` and `conf/slurm.conf`.
42
+ If you want to use it, please check [this page](https://kaldi-asr.org/doc/queue.html).
43
+
44
+ All of the hyperparameters are written in a single yaml format configuration file.
45
+ Please check [this example](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/ljspeech/voc1/conf/parallel_wavegan.v1.yaml) in ljspeech recipe.
46
+
47
+ You can monitor the training progress via tensorboard.
48
+
49
+ ```bash
50
+ $ tensorboard --logdir exp
51
+ ```
52
+
53
+ ![](https://user-images.githubusercontent.com/22779813/68100080-58bbc500-ff09-11e9-9945-c835186fd7c2.png)
54
+
55
+ If you want to accelerate the training, you can try distributed multi-gpu training based on apex.
56
+ You need to install apex for distributed training. Please make sure you have already installed it.
58
+ Then you can run distributed multi-GPU training via the following command:
58
+
59
+ ```bash
60
+ # in the case of the number of gpus = 8
61
+ $ CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" ./run.sh --stage 2 --n_gpus 8
62
+ ```
63
+
64
+ In the case of distributed training, the batch size will be automatically multiplied by the number of gpus.
65
+ Please be careful.
66
+
67
+ ## How to make the recipe for your own dataset
68
+
69
+ Here, I will show how to make the recipe for your own dataset.
70
+
71
+ 1. Setup your dataset to be the following structure.
72
+
73
+ ```bash
74
+ # For single-speaker case
75
+ $ tree /path/to/database
76
+ /path/to/database
77
+ ├── utt_1.wav
78
+ ├── utt_2.wav
79
+ │ ...
80
+ └── utt_N.wav
81
+ # The directory can be nested, but each filename must be unique
82
+
83
+ # For multi-speaker case
84
+ $ tree /path/to/database
85
+ /path/to/database
86
+ ├── spk_1
87
+ │ ├── utt1.wav
88
+ ├── spk_2
89
+ │ ├── utt1.wav
90
+ │ ...
91
+ └── spk_N
92
+ ├── utt1.wav
93
+ ...
94
+ # The directory under each speaker can be nested, but each filename in each speaker directory must be unique
95
+ ```
96
+
97
+ 2. Copy the template directory.
98
+
99
+ ```bash
100
+ cd egs
101
+
102
+ # For single speaker case
103
+ cp -r template_single_spk <your_dataset_name>
104
+
105
+ # For multi speaker case
106
+ cp -r template_multi_spk <your_dataset_name>
107
+
108
+ # Move on your recipe
109
+ cd egs/<your_dataset_name>/voc1
110
+ ```
111
+
112
+ 3. Modify the options in `run.sh`.
113
+ At a minimum, you need to change the following in `run.sh`:
114
+ - `db_root`: Root path of the database.
115
+ - `num_dev`: The number of utterances for development set.
116
+ - `num_eval`: The number of utterances for evaluation set.
117
+
118
+ 4. Modify the hyperparameters in `conf/parallel_wavegan.v1.yaml`.
119
+ At a minimum, you need to change the following in the config:
120
+ - `sampling_rate`: If you specify a sampling rate lower than that of the original audio, it will be downsampled by sox.
121
+
122
+ 5. (Optional) Change command backend in `cmd.sh`.
123
+ If you are not familiar with Kaldi and run in your local environment, you do not need to change it.
124
+ See more info on https://kaldi-asr.org/doc/queue.html.
125
+
126
+ 6. Run your recipe.
127
+
128
+ ```bash
129
+ # Run all stages from the first stage
130
+ ./run.sh
131
+
132
+ # If you want to specify CUDA device
133
+ CUDA_VISIBLE_DEVICES=0 ./run.sh
134
+ ```
135
+
136
+ If you want to try other advanced models, please check the config files in `egs/ljspeech/voc1/conf`.
137
+
138
+ ## Run training using ESPnet2-TTS recipe within 5 minutes
139
+
140
+ Make sure you have already finished the ESPnet2-TTS recipe experiments (at least started the training).
141
+
142
+ ```bash
143
+ cd egs
144
+
145
+ # Please use single spk template for both single and multi spk case
146
+ cp -r template_single_spk <recipe_name>
147
+
148
+ # Move on your recipe
149
+ cd egs/<recipe_name>/voc1
150
+
151
+ # Make symlink of data directory (Better to use absolute path)
152
+ mkdir dump data
153
+ ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw dump/
154
+ ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw/tr_no_dev data/train_nodev
155
+ ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw/dev data/dev
156
+ ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw/eval1 data/eval
157
+
158
+ # Edit config to match TTS model setting
159
+ vim conf/parallel_wavegan.v1.yaml
160
+
161
+ # Run from stage 1
162
+ ./run.sh --stage 1 --conf conf/parallel_wavegan.v1.yaml
163
+ ```
164
+
165
+ That's it!
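
Step 4 of the recipe guide above mentions that audio is downsampled by sox when a lower sampling_rate is set; a minimal sketch of the equivalent standalone command, assuming a 48 kHz source file and a 16 kHz target:

```bash
# Resample a single recording to 16 kHz with sox (the recipe performs the
# equivalent conversion internally when the config sampling_rate is lower
# than that of the source audio).
sox utt_1_48k.wav -r 16000 utt_1_16k.wav
```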
ParallelWaveGAN/egs/arctic/voc1/cmd.sh ADDED
@@ -0,0 +1,91 @@
1
+ # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
2
+ # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
3
+ # e.g.
4
+ # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
5
+ #
6
+ # Options:
7
+ # --time <time>: Limit the maximum time to execute.
8
+ # --mem <mem>: Limit the maximum memory usage.
9
+ # --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
10
+ # --num-threads <nthreads>: Specify the number of CPU cores.
11
+ # --gpu <ngpu>: Specify the number of GPU devices.
12
+ # --config: Change the configuration file from default.
13
+ #
14
+ # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15
+ # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16
+ # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17
+ # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18
+ #
19
+ # run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
20
+ # These options are mapping to specific options for each backend and
21
+ # it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22
+ # If jobs failed, your configuration might be wrong for your environment.
23
+ #
24
+ #
25
+ # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
26
+ # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27
+ # =========================================================~
28
+
29
+
30
+ # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31
+ cmd_backend="local"
32
+
33
+ # Local machine, without any Job scheduling system
34
+ if [ "${cmd_backend}" = local ]; then
35
+
36
+ # The other usage
37
+ export train_cmd="utils/run.pl"
38
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39
+ export cuda_cmd="utils/run.pl"
40
+ # Used for "*_recog.py"
41
+ export decode_cmd="utils/run.pl"
42
+
43
+ # Local machine, without any Job scheduling system
44
+ elif [ "${cmd_backend}" = stdout ]; then
45
+
46
+ # The other usage
47
+ export train_cmd="utils/stdout.pl"
48
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49
+ export cuda_cmd="utils/stdout.pl"
50
+ # Used for "*_recog.py"
51
+ export decode_cmd="utils/stdout.pl"
52
+
53
+ # "qsub" (SGE, Torque, PBS, etc.)
54
+ elif [ "${cmd_backend}" = sge ]; then
55
+ # The default setting is written in conf/queue.conf.
56
+ # You must change "-q g.q" for the "queue" for your environment.
57
+ # To know the "queue" names, type "qhost -q"
58
+ # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59
+
60
+ export train_cmd="utils/queue.pl"
61
+ export cuda_cmd="utils/queue.pl"
62
+ export decode_cmd="utils/queue.pl"
63
+
64
+ # "sbatch" (Slurm)
65
+ elif [ "${cmd_backend}" = slurm ]; then
66
+ # The default setting is written in conf/slurm.conf.
67
+ # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
68
+ # To know the "partition" names, type "sinfo".
69
+ # You can use "--gpu *" by default for slurm and it is interpreted as "--gres gpu:*"
70
+ # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71
+
72
+ export train_cmd="utils/slurm.pl"
73
+ export cuda_cmd="utils/slurm.pl"
74
+ export decode_cmd="utils/slurm.pl"
75
+
76
+ elif [ "${cmd_backend}" = ssh ]; then
77
+ # You have to create ".queue/machines" to specify the host to execute jobs.
78
+ # e.g. .queue/machines
79
+ # host1
80
+ # host2
81
+ # host3
82
+ # Assuming you can log in to them without any password, i.e., you have to set up ssh keys.
83
+
84
+ export train_cmd="utils/ssh.pl"
85
+ export cuda_cmd="utils/ssh.pl"
86
+ export decode_cmd="utils/ssh.pl"
87
+
88
+ else
89
+ echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90
+ return 1
91
+ fi
ParallelWaveGAN/egs/arctic/voc1/conf/parallel_wavegan.v1.yaml ADDED
@@ -0,0 +1,122 @@
1
+ # This is the hyperparameter configuration file for Parallel WaveGAN.
2
+ # Please make sure this is adjusted for the Arctic dataset. If you want to
3
+ # apply to the other dataset, you might need to carefully change some parameters.
4
+ # This configuration requires 12 GB GPU memory and takes ~3 days on TITAN V.
5
+
6
+ ###########################################################
7
+ # FEATURE EXTRACTION SETTING #
8
+ ###########################################################
9
+ sampling_rate: 16000 # Sampling rate.
10
+ fft_size: 1024 # FFT size.
11
+ hop_size: 256 # Hop size.
12
+ win_length: null # Window length.
13
+ # If set to null, it will be the same as fft_size.
14
+ window: "hann" # Window function.
15
+ num_mels: 80 # Number of mel basis.
16
+ fmin: 80 # Minimum freq in mel basis calculation.
17
+ fmax: 7600 # Maximum frequency in mel basis calculation.
18
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
19
+ trim_silence: false # Whether to trim the start and end of silence.
20
+ trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
21
+ trim_frame_size: 2048 # Frame size in trimming.
22
+ trim_hop_size: 512 # Hop size in trimming.
23
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
24
+
25
+ ###########################################################
26
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
27
+ ###########################################################
28
+ generator_params:
29
+ in_channels: 1 # Number of input channels.
30
+ out_channels: 1 # Number of output channels.
31
+ kernel_size: 3 # Kernel size of dilated convolution.
32
+ layers: 30 # Number of residual block layers.
33
+ stacks: 3 # Number of stacks i.e., dilation cycles.
34
+ residual_channels: 64 # Number of channels in residual conv.
35
+ gate_channels: 128 # Number of channels in gated conv.
36
+ skip_channels: 64 # Number of channels in skip conv.
37
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
38
+ # Must be the same as num_mels.
39
+ aux_context_window: 2 # Context window size for auxiliary feature.
40
+ # If set to 2, previous 2 and future 2 frames will be considered.
41
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
42
+ use_weight_norm: true # Whether to use weight norm.
43
+ # If set to true, it will be applied to all of the conv layers.
44
+ upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
45
+ upsample_params: # Upsampling network parameters.
46
+ upsample_scales: [4, 4, 4, 4] # Upsampling scales. Product of these must be the same as hop size.
47
+
48
+ ###########################################################
49
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
50
+ ###########################################################
51
+ discriminator_params:
52
+ in_channels: 1 # Number of input channels.
53
+ out_channels: 1 # Number of output channels.
54
+ kernel_size: 3 # Kernel size of conv layers.
55
+ layers: 10 # Number of conv layers.
56
+ conv_channels: 64 # Number of conv channels.
57
+ bias: true # Whether to use bias parameter in conv.
58
+ use_weight_norm: true # Whether to use weight norm.
59
+ # If set to true, it will be applied to all of the conv layers.
60
+ nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
61
+ nonlinear_activation_params: # Nonlinear function parameters
62
+ negative_slope: 0.2 # Alpha in LeakyReLU.
63
+
64
+ ###########################################################
65
+ # STFT LOSS SETTING #
66
+ ###########################################################
67
+ stft_loss_params:
68
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
69
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
70
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
71
+ window: "hann_window" # Window function for STFT-based loss
72
+
73
+ ###########################################################
74
+ # ADVERSARIAL LOSS SETTING #
75
+ ###########################################################
76
+ lambda_adv: 4.0 # Loss balancing coefficient.
77
+
78
+ ###########################################################
79
+ # DATA LOADER SETTING #
80
+ ###########################################################
81
+ batch_size: 10 # Batch size.
82
+ batch_max_steps: 15360 # Length of each audio in batch. Make sure it is divisible by hop_size.
83
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
84
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
85
+ remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
86
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
87
+
88
+ ###########################################################
89
+ # OPTIMIZER & SCHEDULER SETTING #
90
+ ###########################################################
91
+ generator_optimizer_params:
92
+ lr: 0.0001 # Generator's learning rate.
93
+ eps: 1.0e-6 # Generator's epsilon.
94
+ weight_decay: 0.0 # Generator's weight decay coefficient.
95
+ generator_scheduler_params:
96
+ step_size: 200000 # Generator's scheduler step size.
97
+ gamma: 0.5 # Generator's scheduler gamma.
98
+ # At each step size, lr will be multiplied by this parameter.
99
+ generator_grad_norm: 10 # Generator's gradient norm.
100
+ discriminator_optimizer_params:
101
+ lr: 0.00005 # Discriminator's learning rate.
102
+ eps: 1.0e-6 # Discriminator's epsilon.
103
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
104
+ discriminator_scheduler_params:
105
+ step_size: 200000 # Discriminator's scheduler step size.
106
+ gamma: 0.5 # Discriminator's scheduler gamma.
107
+ # At each step size, lr will be multiplied by this parameter.
108
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
109
+
110
+ ###########################################################
111
+ # INTERVAL SETTING #
112
+ ###########################################################
113
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
114
+ train_max_steps: 400000 # Number of training steps.
115
+ save_interval_steps: 5000 # Interval steps to save checkpoint.
116
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
117
+ log_interval_steps: 100 # Interval steps to record the training log.
118
+
119
+ ###########################################################
120
+ # OTHER SETTING #
121
+ ###########################################################
122
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
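
As a quick consistency check on the generator setting above, the product of upsample_scales has to equal hop_size, as the config comment notes:

```bash
# [4, 4, 4, 4] -> 4 * 4 * 4 * 4 = 256, which matches hop_size: 256
# (256 / 16000 Hz = 16 ms frame shift).
echo $((4 * 4 * 4 * 4))   # prints 256
```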
ParallelWaveGAN/egs/arctic/voc1/conf/slurm.conf ADDED
@@ -0,0 +1,12 @@
1
+ # Default configuration
2
+ command sbatch --export=PATH --ntasks-per-node=1
3
+ option time=* --time $0
4
+ option mem=* --mem-per-cpu $0
5
+ option mem=0 # Do not add anything to qsub_opts
6
+ option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
7
+ option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts
8
+ default gpu=0
9
+ option gpu=0 -p cpu
10
+ option gpu=* -p gpu --gres=gpu:$0
11
+ # note: the --max-jobs-run option is supported as a special case
12
+ # by slurm.pl and you don't have to handle it in the config file.
ParallelWaveGAN/egs/arctic/voc1/local/data_download.sh ADDED
@@ -0,0 +1,40 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ download_dir=$1
7
+ spk=$2
8
+
9
+ available_spks=(
10
+ "slt" "clb" "bdl" "rms" "jmk" "awb" "ksp"
11
+ )
12
+
13
+ # check arguments
14
+ if [ $# != 2 ]; then
15
+ echo "Usage: $0 <download_dir> <spk>"
16
+ echo "Available speakers: ${available_spks[*]}"
17
+ exit 1
18
+ fi
19
+
20
+ set -euo pipefail
21
+
22
+ # check speakers
23
+ if ! echo "${available_spks[*]}" | grep -q "${spk}"; then
24
+ echo "Specified spk (${spk}) is not available or not supported." >&2
25
+ exit 1
26
+ fi
27
+
28
+ # download dataset
29
+ cwd=$(pwd)
30
+ if [ ! -e "${download_dir}/cmu_us_${spk}_arctic" ]; then
31
+ mkdir -p "${download_dir}"
32
+ cd "${download_dir}"
33
+ wget "http://festvox.org/cmu_arctic/cmu_arctic/packed/cmu_us_${spk}_arctic-0.95-release.tar.bz2"
34
+ tar xf "cmu_us_${spk}_arctic-0.95-release.tar.bz2"
35
+ rm "cmu_us_${spk}_arctic-0.95-release.tar.bz2"
36
+ cd "${cwd}"
37
+ echo "Successfully finished download."
38
+ else
39
+ echo "Already exists. Skip download."
40
+ fi
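
A usage sketch for the download helper above, with the same arguments run.sh passes in stage -1 (download_dir=downloads, spk=slt):

```bash
# Fetch and unpack the CMU Arctic "slt" voice into ./downloads;
# re-running it skips the download if the directory already exists.
local/data_download.sh downloads slt
```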
ParallelWaveGAN/egs/arctic/voc1/local/data_prep.sh ADDED
@@ -0,0 +1,113 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ # shellcheck disable=SC1091
7
+ . ./path.sh || exit 1;
8
+
9
+ num_dev=100
10
+ num_eval=100
11
+ train_set="train_nodev"
12
+ dev_set="dev"
13
+ eval_set="eval"
14
+ shuffle=false
15
+
16
+ # shellcheck disable=SC1091
17
+ . utils/parse_options.sh || exit 1;
18
+
19
+ db_root=$1
20
+ spk=$2
21
+ data_dir=$3
22
+
23
+ # check arguments
24
+ if [ $# != 3 ]; then
25
+ echo "Usage: $0 <db_root> <spk> <data_dir>"
26
+ echo "e.g.: $0 downloads/cmu_us_slt_arctic slt data"
27
+ echo ""
28
+ echo "Options:"
29
+ echo " --num_dev: number of development utterances (default=100)."
30
+ echo " --num_eval: number of evaluation utterances (default=100)."
31
+ echo " --train_set: name of train set (default=train_nodev)."
32
+ echo " --dev_set: name of dev set (default=dev)."
33
+ echo " --eval_set: name of eval set (default=eval)."
34
+ echo " --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
35
+ exit 1
36
+ fi
37
+
38
+ set -euo pipefail
39
+
40
+ # check speaker
41
+ available_spks=(
42
+ "slt" "clb" "bdl" "rms" "jmk" "awb" "ksp"
43
+ )
44
+ if ! echo "${available_spks[*]}" | grep -q "${spk}"; then
45
+ echo "Specified speaker ${spk} is not available."
46
+ echo "Available speakers: ${available_spks[*]}"
47
+ exit 1
48
+ fi
49
+
50
+ [ ! -e "${data_dir}/all" ] && mkdir -p "${data_dir}/all"
51
+
52
+ # set filenames
53
+ scp="${data_dir}/all/wav.scp"
54
+ segments="${data_dir}/all/segments"
55
+
56
+ # check file existence
57
+ [ -e "${scp}" ] && rm "${scp}"
58
+ [ -e "${segments}" ] && rm "${segments}"
59
+
60
+ # make scp
61
+ find "${db_root}" -name "*.wav" -follow | sort | while read -r filename; do
62
+ id="${spk}_$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")"
63
+ echo "${id} ${filename}" >> "${scp}"
64
+ done
65
+
66
+ # make segments
67
+ find "${db_root}/lab" -name "*.lab" -follow | sort | while read -r filename; do
68
+ # get start time
69
+ while read -r line; do
70
+ phn=$(echo "${line}" | cut -d " " -f 3)
71
+ if [ "${phn}" != "pau" ]; then
72
+ break
73
+ fi
74
+ start=$(echo "${line}" | cut -d " " -f 1)
75
+ done < <(tail -n +2 "$filename")
76
+ # get end time
77
+ while read -r line; do
78
+ end=$(echo "${line}" | cut -d " " -f 1)
79
+ phn=$(echo "${line}" | cut -d " " -f 3)
80
+ if [ "${phn}" != "pau" ]; then
81
+ break
82
+ fi
83
+ done < <(tail -n +2 "$filename" | tac)
84
+ echo "${spk}_$(basename "${filename}" .lab) ${spk}_$(basename "${filename}" .lab) ${start} ${end}" >> "${segments}"
85
+ done
86
+
87
+ # check
88
+ diff -q <(awk '{print $1}' "${scp}") <(awk '{print $1}' "${segments}") > /dev/null
89
+
90
+ # split
91
+ num_all=$(wc -l < "${scp}")
92
+ num_deveval=$((num_dev + num_eval))
93
+ num_train=$((num_all - num_deveval))
94
+ utils/split_data.sh \
95
+ --num_first "${num_train}" \
96
+ --num_second "${num_deveval}" \
97
+ --shuffle "${shuffle}" \
98
+ "${data_dir}/all" \
99
+ "${data_dir}/${train_set}" \
100
+ "${data_dir}/deveval"
101
+ utils/split_data.sh \
102
+ --num_first "${num_dev}" \
103
+ --num_second "${num_eval}" \
104
+ --shuffle "${shuffle}" \
105
+ "${data_dir}/deveval" \
106
+ "${data_dir}/${dev_set}" \
107
+ "${data_dir}/${eval_set}"
108
+
109
+ # remove tmp directories
110
+ rm -rf "${data_dir}/all"
111
+ rm -rf "${data_dir}/deveval"
112
+
113
+ echo "Successfully prepared data."
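
As a worked example of the split arithmetic at the end of the script, assuming the standard 1132-utterance Arctic prompt set and the defaults num_dev=100 and num_eval=100:

```bash
# 1132 utterances in total -> 932 train / 100 dev / 100 eval
num_all=1132; num_dev=100; num_eval=100
num_deveval=$((num_dev + num_eval))    # 200 utterances held out
num_train=$((num_all - num_deveval))   # 932 utterances for training
echo "${num_train} ${num_dev} ${num_eval}"
```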
ParallelWaveGAN/egs/arctic/voc1/path.sh ADDED
@@ -0,0 +1,33 @@
1
+ # cuda related
2
+ export CUDA_HOME=/usr/local/cuda-10.0
3
+ export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
4
+
5
+ # path related
6
+ export PRJ_ROOT="${PWD}/../../.."
7
+ if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
8
+ # shellcheck disable=SC1090
9
+ . "${PRJ_ROOT}/tools/venv/bin/activate"
10
+ fi
11
+
12
+ # python related
13
+ export OMP_NUM_THREADS=1
14
+ export PYTHONIOENCODING=UTF-8
15
+ export MPL_BACKEND=Agg
16
+
17
+ # check installation
18
+ if ! command -v parallel-wavegan-train > /dev/null; then
19
+ echo "Error: It seems setup is not finished." >&2
20
+ echo "Error: Please setup your environment by following README.md" >&2
21
+ return 1
22
+ fi
23
+ if ! command -v jq > /dev/null; then
24
+ echo "Error: It seems jq is not installed." >&2
25
+ echo "Error: Please install via \`sudo apt-get install jq\`." >&2
26
+ echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
27
+ return 1
28
+ fi
29
+ if ! command -v yq > /dev/null; then
30
+ echo "Error: It seems yq is not installed." >&2
31
+ echo "Error: Please install via \`pip install yq\`." >&2
32
+ return 1
33
+ fi
ParallelWaveGAN/egs/arctic/voc1/run.sh ADDED
@@ -0,0 +1,167 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ . ./cmd.sh || exit 1;
7
+ . ./path.sh || exit 1;
8
+
9
+ # basic settings
10
+ stage=-1 # stage to start
11
+ stop_stage=100 # stage to stop
12
+ verbose=1 # verbosity level (lower is less info)
13
+ n_gpus=1 # number of gpus in training
14
+ n_jobs=16 # number of parallel jobs in feature extraction
15
+
16
+ # NOTE(kan-bayashi): renamed to conf to avoid conflict in parse_options.sh
17
+ conf=conf/parallel_wavegan.v1.yaml
18
+
19
+ # directory path setting
20
+ download_dir=downloads # directory to save downloaded files
21
+ dumpdir=dump # directory to dump features
22
+
23
+ # target speaker setting
24
+ spk=slt # you can select from slt, clb, bdl, rms, awb, jmk, ksp
25
+
26
+ # training related setting
27
+ tag="" # tag for directory to save model
28
+ resume="" # checkpoint path to resume training
29
+ # (e.g. <path>/<to>/checkpoint-10000steps.pkl)
30
+
31
+ # decoding related setting
32
+ checkpoint="" # checkpoint path to be used for decoding
33
+ # if not provided, the latest one will be used
34
+ # (e.g. <path>/<to>/checkpoint-400000steps.pkl)
35
+
36
+ # shellcheck disable=SC1091
37
+ . utils/parse_options.sh || exit 1;
38
+
39
+ train_set="train_nodev_${spk}" # name of training data directory
40
+ dev_set="dev_${spk}" # name of development data directory
41
+ eval_set="eval_${spk}" # name of evaluation data directory
42
+
43
+ set -euo pipefail
44
+
45
+ if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
46
+ echo "Stage -1: Data download"
47
+ local/data_download.sh "${download_dir}" "${spk}"
48
+ fi
49
+
50
+ if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
51
+ echo "Stage 0: Data preparation"
52
+ local/data_prep.sh \
53
+ --train_set "${train_set}" \
54
+ --dev_set "${dev_set}" \
55
+ --eval_set "${eval_set}" \
56
+ "${download_dir}/cmu_us_${spk}_arctic" "${spk}" data
57
+ fi
58
+
59
+ stats_ext=$(grep -q "hdf5" <(yq ".format" "${conf}") && echo "h5" || echo "npy")
60
+ if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
61
+ echo "Stage 1: Feature extraction"
62
+ # extract raw features
63
+ pids=()
64
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
65
+ (
66
+ [ ! -e "${dumpdir}/${name}/raw" ] && mkdir -p "${dumpdir}/${name}/raw"
67
+ echo "Feature extraction start. See the progress via ${dumpdir}/${name}/raw/preprocessing.*.log."
68
+ utils/make_subset_data.sh "data/${name}" "${n_jobs}" "${dumpdir}/${name}/raw"
69
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/raw/preprocessing.JOB.log" \
70
+ parallel-wavegan-preprocess \
71
+ --config "${conf}" \
72
+ --scp "${dumpdir}/${name}/raw/wav.JOB.scp" \
73
+ --segments "${dumpdir}/${name}/raw/segments.JOB" \
74
+ --dumpdir "${dumpdir}/${name}/raw/dump.JOB" \
75
+ --verbose "${verbose}"
76
+ echo "Successfully finished feature extraction of ${name} set."
77
+ ) &
78
+ pids+=($!)
79
+ done
80
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
81
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
82
+ echo "Successfully finished feature extraction."
83
+
84
+ # calculate statistics for normalization
85
+ echo "Statistics computation start. See the progress via ${dumpdir}/${train_set}/compute_statistics.log."
86
+ ${train_cmd} "${dumpdir}/${train_set}/compute_statistics.log" \
87
+ parallel-wavegan-compute-statistics \
88
+ --config "${conf}" \
89
+ --rootdir "${dumpdir}/${train_set}/raw" \
90
+ --dumpdir "${dumpdir}/${train_set}" \
91
+ --verbose "${verbose}"
92
+ echo "Successfully finished calculation of statistics."
93
+
94
+ # normalize and dump them
95
+ pids=()
96
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
97
+ (
98
+ [ ! -e "${dumpdir}/${name}/norm" ] && mkdir -p "${dumpdir}/${name}/norm"
99
+ echo "Normalization start. See the progress via ${dumpdir}/${name}/norm/normalize.*.log."
100
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/norm/normalize.JOB.log" \
101
+ parallel-wavegan-normalize \
102
+ --config "${conf}" \
103
+ --stats "${dumpdir}/${train_set}/stats.${stats_ext}" \
104
+ --rootdir "${dumpdir}/${name}/raw/dump.JOB" \
105
+ --dumpdir "${dumpdir}/${name}/norm/dump.JOB" \
106
+ --verbose "${verbose}"
107
+ echo "Successfully finished normalization of ${name} set."
108
+ ) &
109
+ pids+=($!)
110
+ done
111
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
112
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
113
+ echo "Successfully finished normalization."
114
+ fi
115
+
116
+ if [ -z "${tag}" ]; then
117
+ expdir="exp/${train_set}_arctic_$(basename "${conf}" .yaml)"
118
+ else
119
+ expdir="exp/${train_set}_arctic_${tag}"
120
+ fi
121
+ if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
122
+ echo "Stage 2: Network training"
123
+ [ ! -e "${expdir}" ] && mkdir -p "${expdir}"
124
+ cp "${dumpdir}/${train_set}/stats.${stats_ext}" "${expdir}"
125
+ if [ "${n_gpus}" -gt 1 ]; then
126
+ train="python -m parallel_wavegan.distributed.launch --nproc_per_node ${n_gpus} -c parallel-wavegan-train"
127
+ else
128
+ train="parallel-wavegan-train"
129
+ fi
130
+ echo "Training start. See the progress via ${expdir}/train.log."
131
+ ${cuda_cmd} --gpu "${n_gpus}" "${expdir}/train.log" \
132
+ ${train} \
133
+ --config "${conf}" \
134
+ --train-dumpdir "${dumpdir}/${train_set}/norm" \
135
+ --dev-dumpdir "${dumpdir}/${dev_set}/norm" \
136
+ --outdir "${expdir}" \
137
+ --resume "${resume}" \
138
+ --verbose "${verbose}"
139
+ echo "Successfully finished training."
140
+ fi
141
+
142
+ if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
143
+ echo "Stage 3: Network decoding"
144
+ # shellcheck disable=SC2012
145
+ [ -z "${checkpoint}" ] && checkpoint="$(ls -dt "${expdir}"/*.pkl | head -1 || true)"
146
+ outdir="${expdir}/wav/$(basename "${checkpoint}" .pkl)"
147
+ pids=()
148
+ for name in "${dev_set}" "${eval_set}"; do
149
+ (
150
+ [ ! -e "${outdir}/${name}" ] && mkdir -p "${outdir}/${name}"
151
+ [ "${n_gpus}" -gt 1 ] && n_gpus=1
152
+ echo "Decoding start. See the progress via ${outdir}/${name}/decode.log."
153
+ ${cuda_cmd} --gpu "${n_gpus}" "${outdir}/${name}/decode.log" \
154
+ parallel-wavegan-decode \
155
+ --dumpdir "${dumpdir}/${name}/norm" \
156
+ --checkpoint "${checkpoint}" \
157
+ --outdir "${outdir}/${name}" \
158
+ --verbose "${verbose}"
159
+ echo "Successfully finished decoding of ${name} set."
160
+ ) &
161
+ pids+=($!)
162
+ done
163
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
164
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
165
+ echo "Successfully finished decoding."
166
+ fi
167
+ echo "Finished."
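
Stage 1 above derives the statistics file extension from the config with yq; a minimal sketch of that query, assuming the Python yq wrapper required by path.sh and the arctic config shown earlier:

```bash
# Prints "hdf5" for conf/parallel_wavegan.v1.yaml, so run.sh uses stats.h5
# (it would use stats.npy if format were set to "npy").
yq ".format" conf/parallel_wavegan.v1.yaml
```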
ParallelWaveGAN/egs/arctic/voc1/utils ADDED
@@ -0,0 +1 @@
1
+ ../../../utils
ParallelWaveGAN/egs/csmsc/voc1/cmd.sh ADDED
@@ -0,0 +1,91 @@
1
+ # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
2
+ # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
3
+ # e.g.
4
+ # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
5
+ #
6
+ # Options:
7
+ # --time <time>: Limit the maximum time to execute.
8
+ # --mem <mem>: Limit the maximum memory usage.
9
+ # -–max-jobs-run <njob>: Limit the number parallel jobs. This is ignored for non-array jobs.
10
+ # --num-threads <ngpu>: Specify the number of CPU core.
11
+ # --gpu <ngpu>: Specify the number of GPU devices.
12
+ # --config: Change the configuration file from default.
13
+ #
14
+ # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15
+ # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16
+ # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17
+ # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18
+ #
19
+ # run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
20
+ # These options are mapping to specific options for each backend and
21
+ # it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22
+ # If jobs failed, your configuration might be wrong for your environment.
23
+ #
24
+ #
25
+ # The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl:
26
+ # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27
+ # =========================================================~
28
+
29
+
30
+ # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31
+ cmd_backend="local"
32
+
33
+ # Local machine, without any Job scheduling system
34
+ if [ "${cmd_backend}" = local ]; then
35
+
36
+ # The other usage
37
+ export train_cmd="utils/run.pl"
38
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39
+ export cuda_cmd="utils/run.pl"
40
+ # Used for "*_recog.py"
41
+ export decode_cmd="utils/run.pl"
42
+
43
+ # Local machine, without any Job scheduling system
44
+ elif [ "${cmd_backend}" = stdout ]; then
45
+
46
+ # The other usage
47
+ export train_cmd="utils/stdout.pl"
48
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49
+ export cuda_cmd="utils/stdout.pl"
50
+ # Used for "*_recog.py"
51
+ export decode_cmd="utils/stdout.pl"
52
+
53
+ # "qsub" (SGE, Torque, PBS, etc.)
54
+ elif [ "${cmd_backend}" = sge ]; then
55
+ # The default setting is written in conf/queue.conf.
56
+ # You must change "-q g.q" for the "queue" for your environment.
57
+ # To know the "queue" names, type "qhost -q"
58
+ # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59
+
60
+ export train_cmd="utils/queue.pl"
61
+ export cuda_cmd="utils/queue.pl"
62
+ export decode_cmd="utils/queue.pl"
63
+
64
+ # "sbatch" (Slurm)
65
+ elif [ "${cmd_backend}" = slurm ]; then
66
+ # The default setting is written in conf/slurm.conf.
67
+ # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
68
+ # To know the "partion" names, type "sinfo".
69
+ # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
70
+ # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71
+
72
+ export train_cmd="utils/slurm.pl"
73
+ export cuda_cmd="utils/slurm.pl"
74
+ export decode_cmd="utils/slurm.pl"
75
+
76
+ elif [ "${cmd_backend}" = ssh ]; then
77
+ # You have to create ".queue/machines" to specify the host to execute jobs.
78
+ # e.g. .queue/machines
79
+ # host1
80
+ # host2
81
+ # host3
82
+ # Assuming you can login them without any password, i.e. You have to set ssh keys.
83
+
84
+ export train_cmd="utils/ssh.pl"
85
+ export cuda_cmd="utils/ssh.pl"
86
+ export decode_cmd="utils/ssh.pl"
87
+
88
+ else
89
+ echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90
+ return 1
91
+ fi
ParallelWaveGAN/egs/csmsc/voc1/conf/hifigan.v1.yaml ADDED
@@ -0,0 +1,180 @@
1
+ # This is the configuration file for CSMSC dataset.
2
+ # This configuration is based on HiFiGAN V1, which is
3
+ # an official configuration. But I found that the optimizer
4
+ # setting does not work well with my implementation.
5
+ # So I changed optimizer settings as follows:
6
+ # - AdamW -> Adam
7
+ # - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
8
+ # - Scheduler: ExponentialLR -> MultiStepLR
9
+ # To match the shift size difference, the upsample scales
10
+ # are also modified from the original 256 shift setting.
11
+
12
+ ###########################################################
13
+ # FEATURE EXTRACTION SETTING #
14
+ ###########################################################
15
+ sampling_rate: 24000 # Sampling rate.
16
+ fft_size: 2048 # FFT size.
17
+ hop_size: 300 # Hop size.
18
+ win_length: 1200 # Window length.
19
+ # If set to null, it will be the same as fft_size.
20
+ window: "hann" # Window function.
21
+ num_mels: 80 # Number of mel basis.
22
+ fmin: 80 # Minimum freq in mel basis calculation.
23
+ fmax: 7600 # Maximum frequency in mel basis calculation.
24
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
25
+ trim_silence: false # Whether to trim the start and end of silence.
26
+ trim_threshold_in_db: 20 # Need to tune carefully if the recording is not good.
27
+ trim_frame_size: 1024 # Frame size in trimming.
28
+ trim_hop_size: 256 # Hop size in trimming.
29
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
30
+
31
+ ###########################################################
32
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
33
+ ###########################################################
34
+ generator_type: HiFiGANGenerator
35
+ generator_params:
36
+ in_channels: 80 # Number of input channels.
37
+ out_channels: 1 # Number of output channels.
38
+ channels: 512 # Number of initial channels.
39
+ kernel_size: 7 # Kernel size of initial and final conv layers.
40
+ upsample_scales: [5, 5, 4, 3] # Upsampling scales.
41
+ upsample_kernel_sizes: [10, 10, 8, 6] # Kernel size for upsampling layers.
42
+ resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
43
+ resblock_dilations: # Dilations for residual blocks.
44
+ - [1, 3, 5]
45
+ - [1, 3, 5]
46
+ - [1, 3, 5]
47
+ use_additional_convs: true # Whether to use additional conv layer in residual blocks.
48
+ bias: true # Whether to use bias parameter in conv.
49
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation type.
50
+ nonlinear_activation_params: # Nonlinear activation parameters.
51
+ negative_slope: 0.1
52
+ use_weight_norm: true # Whether to apply weight normalization.
53
+
54
+ ###########################################################
55
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
56
+ ###########################################################
57
+ discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
58
+ discriminator_params:
59
+ scales: 3 # Number of multi-scale discriminator.
60
+ scale_downsample_pooling: "AvgPool1d" # Pooling operation for scale discriminator.
61
+ scale_downsample_pooling_params:
62
+ kernel_size: 4 # Pooling kernel size.
63
+ stride: 2 # Pooling stride.
64
+ padding: 2 # Padding size.
65
+ scale_discriminator_params:
66
+ in_channels: 1 # Number of input channels.
67
+ out_channels: 1 # Number of output channels.
68
+ kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
69
+ channels: 128 # Initial number of channels.
70
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
71
+ max_groups: 16 # Maximum number of groups in downsampling conv layers.
72
+ bias: true
73
+ downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
74
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation.
75
+ nonlinear_activation_params:
76
+ negative_slope: 0.1
77
+ follow_official_norm: true # Whether to follow the official norm setting.
78
+ periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
79
+ period_discriminator_params:
80
+ in_channels: 1 # Number of input channels.
81
+ out_channels: 1 # Number of output channels.
82
+ kernel_sizes: [5, 3] # List of kernel sizes.
83
+ channels: 32 # Initial number of channels.
84
+ downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
85
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
86
+ bias: true # Whether to use bias parameter in conv layer."
87
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation.
88
+ nonlinear_activation_params: # Nonlinear activation paramters.
89
+ negative_slope: 0.1
90
+ use_weight_norm: true # Whether to apply weight normalization.
91
+ use_spectral_norm: false # Whether to apply spectral normalization.
92
+
93
+ ###########################################################
94
+ # STFT LOSS SETTING #
95
+ ###########################################################
96
+ use_stft_loss: false # Whether to use multi-resolution STFT loss.
97
+ use_mel_loss: true # Whether to use Mel-spectrogram loss.
98
+ mel_loss_params:
99
+ fs: 24000
100
+ fft_size: 2048
101
+ hop_size: 300
102
+ win_length: 1200
103
+ window: "hann"
104
+ num_mels: 80
105
+ fmin: 0
106
+ fmax: 12000
107
+ log_base: null
108
+ generator_adv_loss_params:
109
+ average_by_discriminators: false # Whether to average loss by #discriminators.
110
+ discriminator_adv_loss_params:
111
+ average_by_discriminators: false # Whether to average loss by #discriminators.
112
+ use_feat_match_loss: true
113
+ feat_match_loss_params:
114
+ average_by_discriminators: false # Whether to average loss by #discriminators.
115
+ average_by_layers: false # Whether to average loss by #layers in each discriminator.
116
+ include_final_outputs: false # Whether to include final outputs in feat match loss calculation.
117
+
118
+ ###########################################################
119
+ # ADVERSARIAL LOSS SETTING #
120
+ ###########################################################
121
+ lambda_aux: 45.0 # Loss balancing coefficient for STFT loss.
122
+ lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss.
123
+ lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.
124
+
125
+ ###########################################################
126
+ # DATA LOADER SETTING #
127
+ ###########################################################
128
+ batch_size: 16 # Batch size.
129
+ batch_max_steps: 8400 # Length of each audio in batch. Make sure it is divisible by hop_size.
130
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
131
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
132
+ remove_short_samples: false # Whether to remove samples the length of which are less than batch_max_steps.
133
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
134
+
135
+ ###########################################################
136
+ # OPTIMIZER & SCHEDULER SETTING #
137
+ ###########################################################
138
+ generator_optimizer_type: Adam
139
+ generator_optimizer_params:
140
+ lr: 2.0e-4
141
+ betas: [0.5, 0.9]
142
+ weight_decay: 0.0
143
+ generator_scheduler_type: MultiStepLR
144
+ generator_scheduler_params:
145
+ gamma: 0.5
146
+ milestones:
147
+ - 200000
148
+ - 400000
149
+ - 600000
150
+ - 800000
151
+ generator_grad_norm: -1
152
+ discriminator_optimizer_type: Adam
153
+ discriminator_optimizer_params:
154
+ lr: 2.0e-4
155
+ betas: [0.5, 0.9]
156
+ weight_decay: 0.0
157
+ discriminator_scheduler_type: MultiStepLR
158
+ discriminator_scheduler_params:
159
+ gamma: 0.5
160
+ milestones:
161
+ - 200000
162
+ - 400000
163
+ - 600000
164
+ - 800000
165
+ discriminator_grad_norm: -1
166
+
167
+ ###########################################################
168
+ # INTERVAL SETTING #
169
+ ###########################################################
170
+ generator_train_start_steps: 1 # Number of steps to start to train generator.
171
+ discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
172
+ train_max_steps: 2500000 # Number of training steps.
173
+ save_interval_steps: 10000 # Interval steps to save checkpoint.
174
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
175
+ log_interval_steps: 100 # Interval steps to record the training log.
176
+
177
+ ###########################################################
178
+ # OTHER SETTING #
179
+ ###########################################################
180
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
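
Two quick sanity checks on the data loader and generator settings above, following the constraints stated in the config comments (batch_max_steps divisible by hop_size, and the upsample scales modified to match the 300-sample shift):

```bash
# 8400 / 300 = 28 frames per training chunk, so batch_max_steps is divisible by hop_size.
echo $((8400 % 300))        # prints 0
# 5 * 5 * 4 * 3 = 300, so the upsample scales reproduce the 300-sample (12.5 ms) hop at 24 kHz.
echo $((5 * 5 * 4 * 3))     # prints 300
```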
ParallelWaveGAN/egs/csmsc/voc1/conf/multi_band_melgan.v2.yaml ADDED
@@ -0,0 +1,150 @@
1
+ # This is the hyperparameter configuration file for MelGAN.
2
+ # Please make sure this is adjusted for the CSMSC dataset. If you want to
3
+ # apply to the other dataset, you might need to carefully change some parameters.
4
+ # This configuration requires ~ 8GB memory and will finish within 4 days on Titan V.
5
+
6
+ # This configuration is based on full-band MelGAN but the hop size and sampling
7
+ # rate are different from the paper (16kHz vs 24kHz). The number of iterations
8
+ # is not shown in the paper so currently we train 1M iterations (not sure enough
9
+ # to converge). The optimizer setting is based on @dathudeptrai advice.
10
+ # https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906
11
+
12
+ ###########################################################
13
+ # FEATURE EXTRACTION SETTING #
14
+ ###########################################################
15
+ sampling_rate: 24000 # Sampling rate.
16
+ fft_size: 2048 # FFT size.
17
+ hop_size: 300 # Hop size.
18
+ win_length: 1200 # Window length.
19
+ # If set to null, it will be the same as fft_size.
20
+ window: "hann" # Window function.
21
+ num_mels: 80 # Number of mel basis.
22
+ fmin: 80 # Minimum freq in mel basis calculation.
23
+ fmax: 7600 # Maximum frequency in mel basis calculation.
24
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
25
+ trim_silence: false # Whether to trim the start and end of silence.
26
+ trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
27
+ trim_frame_size: 2048 # Frame size in trimming.
28
+ trim_hop_size: 512 # Hop size in trimming.
29
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
30
+
31
+ ###########################################################
32
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
33
+ ###########################################################
34
+ generator_type: "MelGANGenerator" # Generator type.
35
+ generator_params:
36
+ in_channels: 80 # Number of input channels.
37
+ out_channels: 4 # Number of output channels.
38
+ kernel_size: 7 # Kernel size of initial and final conv layers.
39
+ channels: 384 # Initial number of channels for conv layers.
40
+ upsample_scales: [5, 5, 3] # List of Upsampling scales.
41
+ stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack.
42
+ stacks: 4 # Number of stacks in a single residual stack module.
43
+ use_weight_norm: True # Whether to use weight normalization.
44
+ use_causal_conv: False # Whether to use causal convolution.
45
+
46
+ ###########################################################
47
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
48
+ ###########################################################
49
+ discriminator_type: "MelGANMultiScaleDiscriminator" # Discriminator type.
50
+ discriminator_params:
51
+ in_channels: 1 # Number of input channels.
52
+ out_channels: 1 # Number of output channels.
53
+ scales: 3 # Number of multi-scales.
54
+ downsample_pooling: "AvgPool1d" # Pooling type for the input downsampling.
55
+ downsample_pooling_params: # Parameters of the above pooling function.
56
+ kernel_size: 4
57
+ stride: 2
58
+ padding: 1
59
+ count_include_pad: False
60
+ kernel_sizes: [5, 3] # List of kernel size.
61
+ channels: 16 # Number of channels of the initial conv layer.
62
+ max_downsample_channels: 512 # Maximum number of channels of downsampling layers.
63
+ downsample_scales: [4, 4, 4] # List of downsampling scales.
64
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
65
+ nonlinear_activation_params: # Parameters of nonlinear activation function.
66
+ negative_slope: 0.2
67
+ use_weight_norm: True # Whether to use weight norm.
68
+
69
+ ###########################################################
70
+ # STFT LOSS SETTING #
71
+ ###########################################################
72
+ stft_loss_params:
73
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
74
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
75
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
76
+ window: "hann_window" # Window function for STFT-based loss
77
+ use_subband_stft_loss: true
78
+ subband_stft_loss_params:
79
+ fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
80
+ hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss
81
+ win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
82
+ window: "hann_window" # Window function for STFT-based loss
83
+
84
+ ###########################################################
85
+ # ADVERSARIAL LOSS SETTING #
86
+ ###########################################################
87
+ use_feat_match_loss: false # Whether to use feature matching loss.
88
+ lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
89
+
90
+ ###########################################################
91
+ # DATA LOADER SETTING #
92
+ ###########################################################
93
+ batch_size: 64 # Batch size.
94
+ batch_max_steps: 16200 # Length of each audio in batch. Make sure dividable by hop_size.
95
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
96
+ num_workers: 4 # Number of workers in Pytorch DataLoader.
97
+ remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
98
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
99
+
100
+ ###########################################################
101
+ # OPTIMIZER & SCHEDULER SETTING #
102
+ ###########################################################
103
+ generator_optimizer_type: "Adam" # Generator's optimizer type.
104
+ generator_optimizer_params:
105
+ lr: 1.0e-3 # Generator's learning rate.
106
+ eps: 1.0e-7 # Generator's epsilon.
107
+ weight_decay: 0.0 # Generator's weight decay coefficient.
108
+ amsgrad: true
109
+ generator_grad_norm: -1 # Generator's gradient norm.
110
+ generator_scheduler_type: "MultiStepLR" # Generator's scheduler type.
111
+ generator_scheduler_params:
112
+ gamma: 0.5 # Generator's scheduler gamma.
113
+ milestones: # At each milestone, lr will be multiplied by gamma.
114
+ - 100000
115
+ - 200000
116
+ - 300000
117
+ - 400000
118
+ - 500000
119
+ - 600000
120
+ discriminator_optimizer_type: "Adam" # Discriminator's optimizer type.
121
+ discriminator_optimizer_params:
122
+ lr: 1.0e-3 # Discriminator's learning rate.
123
+ eps: 1.0e-7 # Discriminator's epsilon.
124
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
125
+ amsgrad: true
126
+ discriminator_grad_norm: -1 # Discriminator's gradient norm.
127
+ discriminator_scheduler_type: "MultiStepLR" # Discriminator's scheduler type.
128
+ discriminator_scheduler_params:
129
+ gamma: 0.5 # Discriminator's scheduler gamma.
130
+ milestones: # At each milestone, lr will be multiplied by gamma.
131
+ - 100000
132
+ - 200000
133
+ - 300000
134
+ - 400000
135
+ - 500000
136
+ - 600000
137
+
138
+ ###########################################################
139
+ # INTERVAL SETTING #
140
+ ###########################################################
141
+ discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator.
142
+ train_max_steps: 1000000 # Number of training steps.
143
+ save_interval_steps: 50000 # Interval steps to save checkpoint.
144
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
145
+ log_interval_steps: 1000 # Interval steps to record the training log.
146
+
147
+ ###########################################################
148
+ # OTHER SETTING #
149
+ ###########################################################
150
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
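Note (illustrative, not part of the diff): two consistency constraints in the config above are easy to verify by hand. batch_max_steps (16200) must be divisible by hop_size (300), and for the multi-band setup the product of the generator's upsample_scales (5 * 5 * 3 = 75) times out_channels (4 subbands) is expected to reproduce the hop size. A minimal bash sketch, assuming yq (required by path.sh) and the config path shown below:

    # Sketch: sanity-check the multi-band MelGAN config (config path is illustrative).
    conf=conf/multi_band_melgan.v2.yaml
    hop_size=$(yq ".hop_size" "${conf}")                 # 300
    batch_max_steps=$(yq ".batch_max_steps" "${conf}")   # 16200
    [ $((batch_max_steps % hop_size)) -eq 0 ] || echo "batch_max_steps is not a multiple of hop_size" >&2
    [ $((5 * 5 * 3 * 4)) -eq "${hop_size}" ] || echo "upsampling x subbands does not match hop_size" >&2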
ParallelWaveGAN/egs/csmsc/voc1/conf/parallel_wavegan.v1.yaml ADDED
@@ -0,0 +1,122 @@
1
+ # This is the hyperparameter configuration file for Parallel WaveGAN.
2
+ # Please make sure this is adjusted for the CSMSC dataset. If you want to
3
+ # apply it to another dataset, you might need to carefully change some parameters.
4
+ # This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.
5
+
6
+ ###########################################################
7
+ # FEATURE EXTRACTION SETTING #
8
+ ###########################################################
9
+ sampling_rate: 24000 # Sampling rate.
10
+ fft_size: 2048 # FFT size.
11
+ hop_size: 300 # Hop size.
12
+ win_length: 1200 # Window length.
13
+ # If set to null, it will be the same as fft_size.
14
+ window: "hann" # Window function.
15
+ num_mels: 80 # Number of mel basis.
16
+ fmin: 80 # Minimum freq in mel basis calculation.
17
+ fmax: 7600 # Maximum frequency in mel basis calculation.
18
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
19
+ trim_silence: false # Whether to trim the start and end of silence.
20
+ trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
21
+ trim_frame_size: 2048 # Frame size in trimming.
22
+ trim_hop_size: 512 # Hop size in trimming.
23
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
24
+
25
+ ###########################################################
26
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
27
+ ###########################################################
28
+ generator_params:
29
+ in_channels: 1 # Number of input channels.
30
+ out_channels: 1 # Number of output channels.
31
+ kernel_size: 3 # Kernel size of dilated convolution.
32
+ layers: 30 # Number of residual block layers.
33
+ stacks: 3 # Number of stacks i.e., dilation cycles.
34
+ residual_channels: 64 # Number of channels in residual conv.
35
+ gate_channels: 128 # Number of channels in gated conv.
36
+ skip_channels: 64 # Number of channels in skip conv.
37
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
38
+ # Must be the same as num_mels.
39
+ aux_context_window: 2 # Context window size for auxiliary feature.
40
+ # If set to 2, previous 2 and future 2 frames will be considered.
41
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
42
+ use_weight_norm: true # Whether to use weight norm.
43
+ # If set to true, it will be applied to all of the conv layers.
44
+ upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
45
+ upsample_params: # Upsampling network parameters.
46
+ upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must be the same as hop size.
47
+
48
+ ###########################################################
49
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
50
+ ###########################################################
51
+ discriminator_params:
52
+ in_channels: 1 # Number of input channels.
53
+ out_channels: 1 # Number of output channels.
54
+ kernel_size: 3 # Kernel size of conv layers.
55
+ layers: 10 # Number of conv layers.
56
+ conv_channels: 64 # Number of channels in conv layers.
57
+ bias: true # Whether to use bias parameter in conv.
58
+ use_weight_norm: true # Whether to use weight norm.
59
+ # If set to true, it will be applied to all of the conv layers.
60
+ nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
61
+ nonlinear_activation_params: # Nonlinear function parameters
62
+ negative_slope: 0.2 # Alpha in LeakyReLU.
63
+
64
+ ###########################################################
65
+ # STFT LOSS SETTING #
66
+ ###########################################################
67
+ stft_loss_params:
68
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
69
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
70
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
71
+ window: "hann_window" # Window function for STFT-based loss
72
+
73
+ ###########################################################
74
+ # ADVERSARIAL LOSS SETTING #
75
+ ###########################################################
76
+ lambda_adv: 4.0 # Loss balancing coefficient.
77
+
78
+ ###########################################################
79
+ # DATA LOADER SETTING #
80
+ ###########################################################
81
+ batch_size: 6 # Batch size.
82
+ batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size.
83
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
84
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
85
+ remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
86
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
87
+
88
+ ###########################################################
89
+ # OPTIMIZER & SCHEDULER SETTING #
90
+ ###########################################################
91
+ generator_optimizer_params:
92
+ lr: 0.0001 # Generator's learning rate.
93
+ eps: 1.0e-6 # Generator's epsilon.
94
+ weight_decay: 0.0 # Generator's weight decay coefficient.
95
+ generator_scheduler_params:
96
+ step_size: 200000 # Generator's scheduler step size.
97
+ gamma: 0.5 # Generator's scheduler gamma.
98
+ # At each step size, lr will be multiplied by this parameter.
99
+ generator_grad_norm: 10 # Generator's gradient norm.
100
+ discriminator_optimizer_params:
101
+ lr: 0.00005 # Discriminator's learning rate.
102
+ eps: 1.0e-6 # Discriminator's epsilon.
103
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
104
+ discriminator_scheduler_params:
105
+ step_size: 200000 # Discriminator's scheduler step size.
106
+ gamma: 0.5 # Discriminator's scheduler gamma.
107
+ # At each step size, lr will be multiplied by this parameter.
108
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
109
+
110
+ ###########################################################
111
+ # INTERVAL SETTING #
112
+ ###########################################################
113
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
114
+ train_max_steps: 400000 # Number of training steps.
115
+ save_interval_steps: 5000 # Interval steps to save checkpoint.
116
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
117
+ log_interval_steps: 100 # Interval steps to record the training log.
118
+
119
+ ###########################################################
120
+ # OTHER SETTING #
121
+ ###########################################################
122
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
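As a quick illustration of the scheduler settings in this config (a sketch, not part of the recipe): with step_size 200000 and gamma 0.5 the learning rate is halved every 200k steps, so the generator lr goes from 1.0e-4 to 5.0e-5 at 200k steps and 2.5e-5 at 400k steps; the upsample_scales [4, 5, 3, 5] multiply to 300, matching hop_size.

    # Sketch: effective learning rate implied by step_size=200000, gamma=0.5, base lr=0.0001.
    for step in 0 200000 400000; do
        awk -v n=$((step / 200000)) -v s=${step} 'BEGIN{printf "step %d: lr = %g\n", s, 0.0001 * 0.5 ^ n}'
    done
    echo "upsample product: $((4 * 5 * 3 * 5))"   # 300, must equal hop_size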
ParallelWaveGAN/egs/csmsc/voc1/conf/slurm.conf ADDED
@@ -0,0 +1,12 @@
1
+ # Default configuration
2
+ command sbatch --export=PATH --ntasks-per-node=1
3
+ option time=* --time $0
4
+ option mem=* --mem-per-cpu $0
5
+ option mem=0 # Do not add anything to qsub_opts
6
+ option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
7
+ option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts
8
+ default gpu=0
9
+ option gpu=0 -p cpu
10
+ option gpu=* -p gpu --gres=gpu:$0
11
+ # note: the --max-jobs-run option is supported as a special case
12
+ # by slurm.pl and you don't have to handle it in the config file.
ParallelWaveGAN/egs/csmsc/voc1/conf/style_melgan.v1.yaml ADDED
@@ -0,0 +1,147 @@
1
+ # This is the configuration file for the CSMSC dataset.
2
+ # This configuration is based on the StyleMelGAN paper but
3
+ # uses MSE loss instead of hinge loss. I also found that
4
+ # batch_size = 8 works well, so if you want to accelerate
5
+ # the training, you can reduce the batch size (e.g. to 8 or 16).
6
+ # The upsampling scales are modified to fit the shift size
7
+ # of 300 points.
8
+
9
+ ###########################################################
10
+ # FEATURE EXTRACTION SETTING #
11
+ ###########################################################
12
+ sampling_rate: 24000 # Sampling rate.
13
+ fft_size: 2048 # FFT size.
14
+ hop_size: 300 # Hop size.
15
+ win_length: 1200 # Window length.
16
+ # If set to null, it will be the same as fft_size.
17
+ window: "hann" # Window function.
18
+ num_mels: 80 # Number of mel basis.
19
+ fmin: 80 # Minimum freq in mel basis calculation.
20
+ fmax: 7600 # Maximum frequency in mel basis calculation.
21
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
22
+ trim_silence: false # Whether to trim the start and end of silence.
23
+ trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
24
+ trim_frame_size: 1024 # Frame size in trimming.
25
+ trim_hop_size: 256 # Hop size in trimming.
26
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
27
+
28
+ ###########################################################
29
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
30
+ ###########################################################
31
+ generator_type: "StyleMelGANGenerator" # Generator type.
32
+ generator_params:
33
+ in_channels: 128
34
+ aux_channels: 80
35
+ channels: 64
36
+ out_channels: 1
37
+ kernel_size: 9
38
+ dilation: 2
39
+ bias: True
40
+ noise_upsample_scales: [10, 2, 2, 2]
41
+ noise_upsample_activation: "LeakyReLU"
42
+ noise_upsample_activation_params:
43
+ negative_slope: 0.2
44
+ upsample_scales: [5, 1, 5, 1, 3, 1, 2, 2, 1]
45
+ upsample_mode: "nearest"
46
+ gated_function: "softmax"
47
+ use_weight_norm: True
48
+
49
+ ###########################################################
50
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
51
+ ###########################################################
52
+ discriminator_type: "StyleMelGANDiscriminator" # Discriminator type.
53
+ discriminator_params:
54
+ repeats: 4
55
+ window_sizes: [512, 1024, 2048, 4096]
56
+ pqmf_params:
57
+ - [1, None, None, None]
58
+ - [2, 62, 0.26700, 9.0]
59
+ - [4, 62, 0.14200, 9.0]
60
+ - [8, 62, 0.07949, 9.0]
61
+ discriminator_params:
62
+ out_channels: 1
63
+ kernel_sizes: [5, 3]
64
+ channels: 16
65
+ max_downsample_channels: 512
66
+ bias: True
67
+ downsample_scales: [4, 4, 4, 1]
68
+ nonlinear_activation: "LeakyReLU"
69
+ nonlinear_activation_params:
70
+ negative_slope: 0.2
71
+ use_weight_norm: True
72
+
73
+ ###########################################################
74
+ # STFT LOSS SETTING #
75
+ ###########################################################
76
+ stft_loss_params:
77
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
78
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
79
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
80
+ window: "hann_window" # Window function for STFT-based loss
81
+ lambda_aux: 1.0 # Loss balancing coefficient for aux loss.
82
+
83
+ ###########################################################
84
+ # ADVERSARIAL LOSS SETTING #
85
+ ###########################################################
86
+ lambda_adv: 1.0 # Loss balancing coefficient for adv loss.
87
+ generator_adv_loss_params:
88
+ average_by_discriminators: false # Whether to average loss by #discriminators.
89
+ discriminator_adv_loss_params:
90
+ average_by_discriminators: false # Whether to average loss by #discriminators.
91
+
92
+ ###########################################################
93
+ # DATA LOADER SETTING #
94
+ ###########################################################
95
+ batch_size: 32 # Batch size.
96
+ batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by hop_size.
97
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
98
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
99
+ remove_short_samples: false # Whether to remove samples the length of which are less than batch_max_steps.
100
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
101
+
102
+ ###########################################################
103
+ # OPTIMIZER & SCHEDULER SETTING #
104
+ ###########################################################
105
+ generator_optimizer_type: Adam
106
+ generator_optimizer_params:
107
+ lr: 1.0e-4
108
+ betas: [0.5, 0.9]
109
+ weight_decay: 0.0
110
+ generator_scheduler_type: MultiStepLR
111
+ generator_scheduler_params:
112
+ gamma: 0.5
113
+ milestones:
114
+ - 100000
115
+ - 300000
116
+ - 500000
117
+ - 700000
118
+ - 900000
119
+ generator_grad_norm: -1
120
+ discriminator_optimizer_type: Adam
121
+ discriminator_optimizer_params:
122
+ lr: 2.0e-4
123
+ betas: [0.5, 0.9]
124
+ weight_decay: 0.0
125
+ discriminator_scheduler_type: MultiStepLR
126
+ discriminator_scheduler_params:
127
+ gamma: 0.5
128
+ milestones:
129
+ - 200000
130
+ - 400000
131
+ - 600000
132
+ - 800000
133
+ discriminator_grad_norm: -1
134
+
135
+ ###########################################################
136
+ # INTERVAL SETTING #
137
+ ###########################################################
138
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
139
+ train_max_steps: 1500000 # Number of training steps.
140
+ save_interval_steps: 50000 # Interval steps to save checkpoint.
141
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
142
+ log_interval_steps: 100 # Interval steps to record the training log.
143
+
144
+ ###########################################################
145
+ # OTHER SETTING #
146
+ ###########################################################
147
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
ParallelWaveGAN/egs/csmsc/voc1/local/data_download.sh ADDED
@@ -0,0 +1,32 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ download_dir=$1
7
+
8
+ # check arguments
9
+ if [ $# != 1 ]; then
10
+ echo "Usage: $0 <download_dir>"
11
+ exit 1
12
+ fi
13
+
14
+ set -euo pipefail
15
+
16
+ # download dataset
17
+ cwd=$(pwd)
18
+ if [ ! -e "${download_dir}/CSMSC" ]; then
19
+ mkdir -p "${download_dir}"
20
+ cd "${download_dir}"
21
+ wget https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar
22
+ mkdir CSMSC && cd CSMSC && unrar x ../BZNSYP.rar
23
+ # convert newline characters
24
+ find ./PhoneLabeling -name "*.interval" | while read -r line; do
25
+ nkf -Lu --overwrite "${line}"
26
+ done
27
+ rm ../BZNSYP.rar
28
+ cd "${cwd}"
29
+ echo "Successfully finished download."
30
+ else
31
+ echo "Already exists. Skip download."
32
+ fi
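Note (illustrative, not part of the script): the download step above relies on a few external tools; a quick check before running it could look like this:

    # Sketch: verify the external tools used by local/data_download.sh are available.
    for cmd in wget unrar nkf; do
        command -v "${cmd}" > /dev/null || echo "Please install ${cmd} first (e.g. via your package manager)." >&2
    done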
ParallelWaveGAN/egs/csmsc/voc1/local/data_prep.sh ADDED
@@ -0,0 +1,94 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ # shellcheck disable=SC1091
7
+ . ./path.sh || exit 1;
8
+
9
+ fs=24000
10
+ num_dev=100
11
+ num_eval=100
12
+ train_set="train_nodev"
13
+ dev_set="dev"
14
+ eval_set="eval"
15
+ shuffle=false
16
+
17
+ # shellcheck disable=SC1091
18
+ . utils/parse_options.sh || exit 1;
19
+
20
+ db_root=$1
21
+ data_dir=$2
22
+
23
+ # check arguments
24
+ if [ $# != 2 ]; then
25
+ echo "Usage: $0 [Options] <db_root> <data_dir>"
26
+ echo "e.g.: $0 downloads/CSMSC data"
27
+ echo ""
28
+ echo "Options:"
29
+ echo " --fs: target sampling rate (default=24000)."
30
+ echo " --num_dev: number of development uttreances (default=100)."
31
+ echo " --num_eval: number of evaluation uttreances (default=100)."
32
+ echo " --train_set: name of train set (default=train_nodev)."
33
+ echo " --dev_set: name of dev set (default=dev)."
34
+ echo " --eval_set: name of eval set (default=eval)."
35
+ echo " --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
36
+ exit 1
37
+ fi
38
+
39
+ set -euo pipefail
40
+
41
+ [ ! -e "${data_dir}/all" ] && mkdir -p "${data_dir}/all"
42
+
43
+ # set filenames
44
+ scp="${data_dir}/all/wav.scp"
45
+ segments="${data_dir}/all/segments"
46
+
47
+ # check file existence
48
+ [ -e "${scp}" ] && rm "${scp}"
49
+ [ -e "${segments}" ] && rm "${segments}"
50
+
51
+ # make wav.scp
52
+ find "${db_root}/Wave" -name "*.wav" -follow | sort | while read -r filename; do
53
+ id="$(basename "${filename}" .wav)"
54
+ echo "csmsc_${id} cat ${filename} | sox -t wav - -c 1 -b 16 -t wav - rate ${fs} |" >> "${scp}"
55
+ done
56
+
57
+ # make segments
58
+ find "${db_root}/PhoneLabeling" -name "*.interval" -follow | sort | while read -r filename; do
59
+ nkf -Lu --overwrite "${filename}"
60
+ id="$(basename "${filename}" .interval)"
61
+ start_sec=$(tail -n +14 "${filename}" | head -n 1)
62
+ end_sec=$(head -n -2 "${filename}" | tail -n 1)
63
+ [ -z "${start_sec}" ] && echo "Start second is missing (utt_id=${id}). " >&2 && exit 1;
64
+ [ -z "${end_sec}" ] && echo "End second is missing (utt_id=${id})." >&2 && exit 1;
65
+ echo "csmsc_${id} csmsc_${id} ${start_sec} ${end_sec}" >> "${segments}"
66
+ done
67
+
68
+ # check
69
+ diff -q <(awk '{print $1}' "${scp}") <(awk '{print $1}' "${segments}") > /dev/null
70
+
71
+ # split
72
+ num_all=$(wc -l < "${scp}")
73
+ num_deveval=$((num_dev + num_eval))
74
+ num_train=$((num_all - num_deveval))
75
+ utils/split_data.sh \
76
+ --num_first "${num_train}" \
77
+ --num_second "${num_deveval}" \
78
+ --shuffle "${shuffle}" \
79
+ "${data_dir}/all" \
80
+ "${data_dir}/${train_set}" \
81
+ "${data_dir}/deveval"
82
+ utils/split_data.sh \
83
+ --num_first "${num_dev}" \
84
+ --num_second "${num_eval}" \
85
+ --shuffle "${shuffle}" \
86
+ "${data_dir}/deveval" \
87
+ "${data_dir}/${dev_set}" \
88
+ "${data_dir}/${eval_set}"
89
+
90
+ # remove tmp directories
91
+ rm -rf "${data_dir}/all"
92
+ rm -rf "${data_dir}/deveval"
93
+
94
+ echo "Successfully prepared data."
ParallelWaveGAN/egs/csmsc/voc1/path.sh ADDED
@@ -0,0 +1,33 @@
1
+ # cuda related
2
+ export CUDA_HOME=/usr/local/cuda-10.0
3
+ export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
4
+
5
+ # path related
6
+ export PRJ_ROOT="${PWD}/../../.."
7
+ if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
8
+ # shellcheck disable=SC1090
9
+ . "${PRJ_ROOT}/tools/venv/bin/activate"
10
+ fi
11
+
12
+ # python related
13
+ export OMP_NUM_THREADS=1
14
+ export PYTHONIOENCODING=UTF-8
15
+ export MPL_BACKEND=Agg
16
+
17
+ # check installation
18
+ if ! command -v parallel-wavegan-train > /dev/null; then
19
+ echo "Error: It seems setup is not finished." >&2
20
+ echo "Error: Please setup your environment by following README.md" >&2
21
+ return 1
22
+ fi
23
+ if ! command -v jq > /dev/null; then
24
+ echo "Error: It seems jq is not installed." >&2
25
+ echo "Error: Please install via \`sudo apt-get install jq\`." >&2
26
+ echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
27
+ return 1
28
+ fi
29
+ if ! command -v yq > /dev/null; then
30
+ echo "Error: It seems yq is not installed." >&2
31
+ echo "Error: Please install via \`pip install yq\`." >&2
32
+ return 1
33
+ fi
ParallelWaveGAN/egs/csmsc/voc1/run.sh ADDED
@@ -0,0 +1,164 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ . ./cmd.sh || exit 1;
7
+ . ./path.sh || exit 1;
8
+
9
+ # basic settings
10
+ stage=-1 # stage to start
11
+ stop_stage=100 # stage to stop
12
+ verbose=1 # verbosity level (lower is less info)
13
+ n_gpus=1 # number of gpus in training
14
+ n_jobs=16 # number of parallel jobs in feature extraction
15
+
16
+ # NOTE(kan-bayashi): renamed to conf to avoid conflict in parse_options.sh
17
+ conf=conf/parallel_wavegan.v1.yaml
18
+
19
+ # directory path setting
20
+ download_dir=downloads # directory to save downloaded files
21
+ dumpdir=dump # directory to dump features
22
+
23
+ # training related setting
24
+ tag="" # tag for directory to save model
25
+ resume="" # checkpoint path to resume training
26
+ # (e.g. <path>/<to>/checkpoint-10000steps.pkl)
27
+
28
+ # decoding related setting
29
+ checkpoint="" # checkpoint path to be used for decoding
30
+ # if not provided, the latest one will be used
31
+ # (e.g. <path>/<to>/checkpoint-400000steps.pkl)
32
+
33
+ # shellcheck disable=SC1091
34
+ . utils/parse_options.sh || exit 1;
35
+
36
+ train_set="train_nodev" # name of training data directory
37
+ dev_set="dev" # name of development data direcotry
38
+ eval_set="eval" # name of evaluation data direcotry
39
+
40
+ set -euo pipefail
41
+
42
+ if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
43
+ echo "Stage -1: Data download"
44
+ local/data_download.sh "${download_dir}"
45
+ fi
46
+
47
+ if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
48
+ echo "Stage 0: Data preparation"
49
+ local/data_prep.sh \
50
+ --train_set "${train_set}" \
51
+ --dev_set "${dev_set}" \
52
+ --eval_set "${eval_set}" \
53
+ "${download_dir}/CSMSC" data
54
+ fi
55
+
56
+ stats_ext=$(grep -q "hdf5" <(yq ".format" "${conf}") && echo "h5" || echo "npy")
57
+ if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
58
+ echo "Stage 1: Feature extraction"
59
+ # extract raw features
60
+ pids=()
61
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
62
+ (
63
+ [ ! -e "${dumpdir}/${name}/raw" ] && mkdir -p "${dumpdir}/${name}/raw"
64
+ echo "Feature extraction start. See the progress via ${dumpdir}/${name}/raw/preprocessing.*.log."
65
+ utils/make_subset_data.sh "data/${name}" "${n_jobs}" "${dumpdir}/${name}/raw"
66
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/raw/preprocessing.JOB.log" \
67
+ parallel-wavegan-preprocess \
68
+ --config "${conf}" \
69
+ --scp "${dumpdir}/${name}/raw/wav.JOB.scp" \
70
+ --segments "${dumpdir}/${name}/raw/segments.JOB" \
71
+ --dumpdir "${dumpdir}/${name}/raw/dump.JOB" \
72
+ --verbose "${verbose}"
73
+ echo "Successfully finished feature extraction of ${name} set."
74
+ ) &
75
+ pids+=($!)
76
+ done
77
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
78
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
79
+ echo "Successfully finished feature extraction."
80
+
81
+ # calculate statistics for normalization
82
+ echo "Statistics computation start. See the progress via ${dumpdir}/${train_set}/compute_statistics.log."
83
+ ${train_cmd} "${dumpdir}/${train_set}/compute_statistics.log" \
84
+ parallel-wavegan-compute-statistics \
85
+ --config "${conf}" \
86
+ --rootdir "${dumpdir}/${train_set}/raw" \
87
+ --dumpdir "${dumpdir}/${train_set}" \
88
+ --verbose "${verbose}"
89
+ echo "Successfully finished calculation of statistics."
90
+
91
+ # normalize and dump them
92
+ pids=()
93
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
94
+ (
95
+ [ ! -e "${dumpdir}/${name}/norm" ] && mkdir -p "${dumpdir}/${name}/norm"
96
+ echo "Nomalization start. See the progress via ${dumpdir}/${name}/norm/normalize.*.log."
97
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/norm/normalize.JOB.log" \
98
+ parallel-wavegan-normalize \
99
+ --config "${conf}" \
100
+ --stats "${dumpdir}/${train_set}/stats.${stats_ext}" \
101
+ --rootdir "${dumpdir}/${name}/raw/dump.JOB" \
102
+ --dumpdir "${dumpdir}/${name}/norm/dump.JOB" \
103
+ --verbose "${verbose}"
104
+ echo "Successfully finished normalization of ${name} set."
105
+ ) &
106
+ pids+=($!)
107
+ done
108
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
109
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
110
+ echo "Successfully finished normalization."
111
+ fi
112
+
113
+ if [ -z "${tag}" ]; then
114
+ expdir="exp/${train_set}_csmsc_$(basename "${conf}" .yaml)"
115
+ else
116
+ expdir="exp/${train_set}_csmsc_${tag}"
117
+ fi
118
+ if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
119
+ echo "Stage 2: Network training"
120
+ [ ! -e "${expdir}" ] && mkdir -p "${expdir}"
121
+ cp "${dumpdir}/${train_set}/stats.${stats_ext}" "${expdir}"
122
+ if [ "${n_gpus}" -gt 1 ]; then
123
+ train="python -m parallel_wavegan.distributed.launch --nproc_per_node ${n_gpus} -c parallel-wavegan-train"
124
+ else
125
+ train="parallel-wavegan-train"
126
+ fi
127
+ echo "Training start. See the progress via ${expdir}/train.log."
128
+ ${cuda_cmd} --gpu "${n_gpus}" "${expdir}/train.log" \
129
+ ${train} \
130
+ --config "${conf}" \
131
+ --train-dumpdir "${dumpdir}/${train_set}/norm" \
132
+ --dev-dumpdir "${dumpdir}/${dev_set}/norm" \
133
+ --outdir "${expdir}" \
134
+ --resume "${resume}" \
135
+ --verbose "${verbose}"
136
+ echo "Successfully finished training."
137
+ fi
138
+
139
+ if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
140
+ echo "Stage 3: Network decoding"
141
+ # shellcheck disable=SC2012
142
+ [ -z "${checkpoint}" ] && checkpoint="$(ls -dt "${expdir}"/*.pkl | head -1 || true)"
143
+ outdir="${expdir}/wav/$(basename "${checkpoint}" .pkl)"
144
+ pids=()
145
+ for name in "${dev_set}" "${eval_set}"; do
146
+ (
147
+ [ ! -e "${outdir}/${name}" ] && mkdir -p "${outdir}/${name}"
148
+ [ "${n_gpus}" -gt 1 ] && n_gpus=1
149
+ echo "Decoding start. See the progress via ${outdir}/${name}/decode.log."
150
+ ${cuda_cmd} --gpu "${n_gpus}" "${outdir}/${name}/decode.log" \
151
+ parallel-wavegan-decode \
152
+ --dumpdir "${dumpdir}/${name}/norm" \
153
+ --checkpoint "${checkpoint}" \
154
+ --outdir "${outdir}/${name}" \
155
+ --verbose "${verbose}"
156
+ echo "Successfully finished decoding of ${name} set."
157
+ ) &
158
+ pids+=($!)
159
+ done
160
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
161
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
162
+ echo "Successfully finished decoding."
163
+ fi
164
+ echo "Finished."
ParallelWaveGAN/egs/csmsc/voc1/utils ADDED
@@ -0,0 +1 @@
1
+ ../../../utils
ParallelWaveGAN/egs/jnas/voc1/cmd.sh ADDED
@@ -0,0 +1,91 @@
1
+ # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
2
+ # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
3
+ # e.g.
4
+ # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
5
+ #
6
+ # Options:
7
+ # --time <time>: Limit the maximum time to execute.
8
+ # --mem <mem>: Limit the maximum memory usage.
9
+ # --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
10
+ # --num-threads <nthreads>: Specify the number of CPU cores.
11
+ # --gpu <ngpu>: Specify the number of GPU devices.
12
+ # --config: Change the configuration file from default.
13
+ #
14
+ # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15
+ # The left string of "=", i.e. "JOB", is replaced by <N> (the Nth job) in the command and the log file name,
16
+ # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for the 8th job, respectively.
17
+ # Note that the index must start from a positive number, so you can't use "JOB=0:10", for example.
18
+ #
19
+ # run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
21
+ # These options are mapped to backend-specific options, and the mapping
22
+ # is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22
+ # If jobs failed, your configuration might be wrong for your environment.
23
+ #
24
+ #
25
+ # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
27
+ # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
28
+ # =========================================================
28
+
29
+
30
+ # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31
+ cmd_backend="local"
32
+
33
+ # Local machine, without any Job scheduling system
34
+ if [ "${cmd_backend}" = local ]; then
35
+
36
+ # The other usage
37
+ export train_cmd="utils/run.pl"
38
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39
+ export cuda_cmd="utils/run.pl"
40
+ # Used for "*_recog.py"
41
+ export decode_cmd="utils/run.pl"
42
+
43
+ # Local machine, without any Job scheduling system
44
+ elif [ "${cmd_backend}" = stdout ]; then
45
+
46
+ # The other usage
47
+ export train_cmd="utils/stdout.pl"
48
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49
+ export cuda_cmd="utils/stdout.pl"
50
+ # Used for "*_recog.py"
51
+ export decode_cmd="utils/stdout.pl"
52
+
53
+ # "qsub" (SGE, Torque, PBS, etc.)
54
+ elif [ "${cmd_backend}" = sge ]; then
55
+ # The default setting is written in conf/queue.conf.
56
+ # You must change "-q g.q" for the "queue" for your environment.
57
+ # To know the "queue" names, type "qhost -q"
58
+ # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59
+
60
+ export train_cmd="utils/queue.pl"
61
+ export cuda_cmd="utils/queue.pl"
62
+ export decode_cmd="utils/queue.pl"
63
+
64
+ # "sbatch" (Slurm)
65
+ elif [ "${cmd_backend}" = slurm ]; then
66
+ # The default setting is written in conf/slurm.conf.
67
+ # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
68
+ # To know the "partion" names, type "sinfo".
69
+ # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
70
+ # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71
+
72
+ export train_cmd="utils/slurm.pl"
73
+ export cuda_cmd="utils/slurm.pl"
74
+ export decode_cmd="utils/slurm.pl"
75
+
76
+ elif [ "${cmd_backend}" = ssh ]; then
77
+ # You have to create ".queue/machines" to specify the host to execute jobs.
78
+ # e.g. .queue/machines
79
+ # host1
80
+ # host2
81
+ # host3
82
+ # Assuming you can log in to them without a password, i.e. you have to set up ssh keys.
83
+
84
+ export train_cmd="utils/ssh.pl"
85
+ export cuda_cmd="utils/ssh.pl"
86
+ export decode_cmd="utils/ssh.pl"
87
+
88
+ else
89
+ echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90
+ return 1
91
+ fi
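As a concrete, illustrative example of the unified interface described in the header comment: the same command line works regardless of which backend cmd_backend selects, e.g. a small array job (the log path is hypothetical):

    # With cmd_backend="local" this runs via utils/run.pl; with "slurm" it is submitted through sbatch (see conf/slurm.conf).
    ${train_cmd} --mem 4G JOB=1:4 exp/demo/log/echo.JOB.log echo JOB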
ParallelWaveGAN/egs/jnas/voc1/conf/parallel_wavegan.v1.long.yaml ADDED
@@ -0,0 +1,123 @@
1
+ # This is the hyperparameter configuration file for Parallel WaveGAN.
2
+ # Please make sure this is adjusted for the JNAS dataset. If you want to
3
+ # apply it to another dataset, you might need to carefully change some parameters.
4
+ # This configuration trains more steps up to 1000k compared to v1 config.
5
+ # It requires 12 GB GPU memory and takes ~7 days on TITAN V.
6
+
7
+ ###########################################################
8
+ # FEATURE EXTRACTION SETTING #
9
+ ###########################################################
10
+ sampling_rate: 16000 # Sampling rate.
11
+ fft_size: 1024 # FFT size.
12
+ hop_size: 256 # Hop size.
13
+ win_length: null # Window length.
14
+ # If set to null, it will be the same as fft_size.
15
+ window: "hann" # Window function.
16
+ num_mels: 80 # Number of mel basis.
17
+ fmin: 80 # Minimum freq in mel basis calculation.
18
+ fmax: 7600 # Maximum frequency in mel basis calculation.
19
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
20
+ trim_silence: true # Whether to trim the start and end of silence.
21
+ trim_threshold_in_db: 20 # Need to tune carefully if the recording is not good.
22
+ trim_frame_size: 1024 # Frame size in trimming.
23
+ trim_hop_size: 256 # Hop size in trimming.
24
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
25
+
26
+ ###########################################################
27
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
28
+ ###########################################################
29
+ generator_params:
30
+ in_channels: 1 # Number of input channels.
31
+ out_channels: 1 # Number of output channels.
32
+ kernel_size: 3 # Kernel size of dilated convolution.
33
+ layers: 30 # Number of residual block layers.
34
+ stacks: 3 # Number of stacks i.e., dilation cycles.
35
+ residual_channels: 64 # Number of channels in residual conv.
36
+ gate_channels: 128 # Number of channels in gated conv.
37
+ skip_channels: 64 # Number of channels in skip conv.
38
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
39
+ # Must be the same as num_mels.
40
+ aux_context_window: 2 # Context window size for auxiliary feature.
41
+ # If set to 2, previous 2 and future 2 frames will be considered.
42
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
43
+ use_weight_norm: true # Whether to use weight norm.
44
+ # If set to true, it will be applied to all of the conv layers.
45
+ upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
46
+ upsample_params: # Upsampling network parameters.
47
+ upsample_scales: [4, 4, 4, 4] # Upsampling scales. Product of these must be the same as hop size.
48
+
49
+ ###########################################################
50
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
51
+ ###########################################################
52
+ discriminator_params:
53
+ in_channels: 1 # Number of input channels.
54
+ out_channels: 1 # Number of output channels.
55
+ kernel_size: 3 # Kernel size of conv layers.
56
+ layers: 10 # Number of conv layers.
57
+ conv_channels: 64 # Number of channels in conv layers.
58
+ bias: true # Whether to use bias parameter in conv.
59
+ use_weight_norm: true # Whether to use weight norm.
60
+ # If set to true, it will be applied to all of the conv layers.
61
+ nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
62
+ nonlinear_activation_params: # Nonlinear function parameters
63
+ negative_slope: 0.2 # Alpha in LeakyReLU.
64
+
65
+ ###########################################################
66
+ # STFT LOSS SETTING #
67
+ ###########################################################
68
+ stft_loss_params:
69
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
70
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
71
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
72
+ window: "hann_window" # Window function for STFT-based loss
73
+
74
+ ###########################################################
75
+ # ADVERSARIAL LOSS SETTING #
76
+ ###########################################################
77
+ lambda_adv: 4.0 # Loss balancing coefficient.
78
+
79
+ ###########################################################
80
+ # DATA LOADER SETTING #
81
+ ###########################################################
82
+ batch_size: 10 # Batch size.
83
+ batch_max_steps: 15360 # Length of each audio in batch. Make sure dividable by hop_size.
84
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
85
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
86
+ remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
87
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
88
+
89
+ ###########################################################
90
+ # OPTIMIZER & SCHEDULER SETTING #
91
+ ###########################################################
92
+ generator_optimizer_params:
93
+ lr: 0.0001 # Generator's learning rate.
94
+ eps: 1.0e-6 # Generator's epsilon.
95
+ weight_decay: 0.0 # Generator's weight decay coefficient.
96
+ generator_scheduler_params:
97
+ step_size: 200000 # Generator's scheduler step size.
98
+ gamma: 0.5 # Generator's scheduler gamma.
99
+ # At each step size, lr will be multiplied by this parameter.
100
+ generator_grad_norm: 10 # Generator's gradient norm.
101
+ discriminator_optimizer_params:
102
+ lr: 0.00005 # Discriminator's learning rate.
103
+ eps: 1.0e-6 # Discriminator's epsilon.
104
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
105
+ discriminator_scheduler_params:
106
+ step_size: 200000 # Discriminator's scheduler step size.
107
+ gamma: 0.5 # Discriminator's scheduler gamma.
108
+ # At each step size, lr will be multiplied by this parameter.
109
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
110
+
111
+ ###########################################################
112
+ # INTERVAL SETTING #
113
+ ###########################################################
114
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
115
+ train_max_steps: 1000000 # Number of training steps.
116
+ save_interval_steps: 5000 # Interval steps to save checkpoint.
117
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
118
+ log_interval_steps: 100 # Interval steps to record the training log.
119
+
120
+ ###########################################################
121
+ # OTHER SETTING #
122
+ ###########################################################
123
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
ParallelWaveGAN/egs/jnas/voc1/conf/parallel_wavegan.v1.yaml ADDED
@@ -0,0 +1,122 @@
1
+ # This is the hyperparameter configuration file for Parallel WaveGAN.
2
+ # Please make sure this is adjusted for the JNAS dataset. If you want to
3
+ # apply it to another dataset, you might need to carefully change some parameters.
4
+ # This configuration requires 12 GB GPU memory and takes ~3 days on TITAN V.
5
+
6
+ ###########################################################
7
+ # FEATURE EXTRACTION SETTING #
8
+ ###########################################################
9
+ sampling_rate: 16000 # Sampling rate.
10
+ fft_size: 1024 # FFT size.
11
+ hop_size: 256 # Hop size.
12
+ win_length: null # Window length.
13
+ # If set to null, it will be the same as fft_size.
14
+ window: "hann" # Window function.
15
+ num_mels: 80 # Number of mel basis.
16
+ fmin: 80 # Minimum freq in mel basis calculation.
17
+ fmax: 7600 # Maximum frequency in mel basis calculation.
18
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
19
+ trim_silence: true # Whether to trim the start and end of silence.
20
+ trim_threshold_in_db: 20 # Need to tune carefully if the recording is not good.
21
+ trim_frame_size: 1024 # Frame size in trimming.
22
+ trim_hop_size: 256 # Hop size in trimming.
23
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
24
+
25
+ ###########################################################
26
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
27
+ ###########################################################
28
+ generator_params:
29
+ in_channels: 1 # Number of input channels.
30
+ out_channels: 1 # Number of output channels.
31
+ kernel_size: 3 # Kernel size of dilated convolution.
32
+ layers: 30 # Number of residual block layers.
33
+ stacks: 3 # Number of stacks i.e., dilation cycles.
34
+ residual_channels: 64 # Number of channels in residual conv.
35
+ gate_channels: 128 # Number of channels in gated conv.
36
+ skip_channels: 64 # Number of channels in skip conv.
37
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
38
+ # Must be the same as num_mels.
39
+ aux_context_window: 2 # Context window size for auxiliary feature.
40
+ # If set to 2, previous 2 and future 2 frames will be considered.
41
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
42
+ use_weight_norm: true # Whether to use weight norm.
43
+ # If set to true, it will be applied to all of the conv layers.
44
+ upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
45
+ upsample_params: # Upsampling network parameters.
46
+ upsample_scales: [4, 4, 4, 4] # Upsampling scales. Product of these must be the same as hop size.
47
+
48
+ ###########################################################
49
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
50
+ ###########################################################
51
+ discriminator_params:
52
+ in_channels: 1 # Number of input channels.
53
+ out_channels: 1 # Number of output channels.
54
+ kernel_size: 3 # Kernel size of conv layers.
55
+ layers: 10 # Number of conv layers.
56
+ conv_channels: 64 # Number of channels in conv layers.
57
+ bias: true # Whether to use bias parameter in conv.
58
+ use_weight_norm: true # Whether to use weight norm.
59
+ # If set to true, it will be applied to all of the conv layers.
60
+ nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
61
+ nonlinear_activation_params: # Nonlinear function parameters
62
+ negative_slope: 0.2 # Alpha in LeakyReLU.
63
+
64
+ ###########################################################
65
+ # STFT LOSS SETTING #
66
+ ###########################################################
67
+ stft_loss_params:
68
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
69
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
70
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
71
+ window: "hann_window" # Window function for STFT-based loss
72
+
73
+ ###########################################################
74
+ # ADVERSARIAL LOSS SETTING #
75
+ ###########################################################
76
+ lambda_adv: 4.0 # Loss balancing coefficient.
77
+
78
+ ###########################################################
79
+ # DATA LOADER SETTING #
80
+ ###########################################################
81
+ batch_size: 10 # Batch size.
82
+ batch_max_steps: 15360 # Length of each audio in batch. Make sure dividable by hop_size.
83
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
84
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
85
+ remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
86
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
87
+
88
+ ###########################################################
89
+ # OPTIMIZER & SCHEDULER SETTING #
90
+ ###########################################################
91
+ generator_optimizer_params:
92
+ lr: 0.0001 # Generator's learning rate.
93
+ eps: 1.0e-6 # Generator's epsilon.
94
+ weight_decay: 0.0 # Generator's weight decay coefficient.
95
+ generator_scheduler_params:
96
+ step_size: 200000 # Generator's scheduler step size.
97
+ gamma: 0.5 # Generator's scheduler gamma.
98
+ # At each step size, lr will be multiplied by this parameter.
99
+ generator_grad_norm: 10 # Generator's gradient norm.
100
+ discriminator_optimizer_params:
101
+ lr: 0.00005 # Discriminator's learning rate.
102
+ eps: 1.0e-6 # Discriminator's epsilon.
103
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
104
+ discriminator_scheduler_params:
105
+ step_size: 200000 # Discriminator's scheduler step size.
106
+ gamma: 0.5 # Discriminator's scheduler gamma.
107
+ # At each step size, lr will be multiplied by this parameter.
108
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
109
+
110
+ ###########################################################
111
+ # INTERVAL SETTING #
112
+ ###########################################################
113
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
114
+ train_max_steps: 400000 # Number of training steps.
115
+ save_interval_steps: 5000 # Interval steps to save checkpoint.
116
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
117
+ log_interval_steps: 100 # Interval steps to record the training log.
118
+
119
+ ###########################################################
120
+ # OTHER SETTING #
121
+ ###########################################################
122
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
ParallelWaveGAN/egs/jnas/voc1/conf/slurm.conf ADDED
@@ -0,0 +1,12 @@
1
+ # Default configuration
2
+ command sbatch --export=PATH --ntasks-per-node=1
3
+ option time=* --time $0
4
+ option mem=* --mem-per-cpu $0
5
+ option mem=0 # Do not add anything to qsub_opts
6
+ option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
7
+ option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts
8
+ default gpu=0
9
+ option gpu=0 -p cpu
10
+ option gpu=* -p gpu --gres=gpu:$0
11
+ # note: the --max-jobs-run option is supported as a special case
12
+ # by slurm.pl and you don't have to handle it in the config file.
ParallelWaveGAN/egs/jnas/voc1/conf/train_speakers.txt ADDED
@@ -0,0 +1,261 @@
1
+ F001
2
+ F002
3
+ F003
4
+ F004
5
+ F007
6
+ F008
7
+ F009
8
+ F010
9
+ F012
10
+ F013
11
+ F015
12
+ F016
13
+ F018
14
+ F019
15
+ F020
16
+ F022
17
+ F023
18
+ F024
19
+ F025
20
+ F026
21
+ F027
22
+ F028
23
+ F029
24
+ F030
25
+ F031
26
+ F032
27
+ F033
28
+ F034
29
+ F035
30
+ F036
31
+ F037
32
+ F038
33
+ F039
34
+ F040
35
+ F041
36
+ F042
37
+ F043
38
+ F044
39
+ F045
40
+ F046
41
+ F047
42
+ F049
43
+ F050
44
+ F051
45
+ F052
46
+ F053
47
+ F054
48
+ F055
49
+ F056
50
+ F057
51
+ F058
52
+ F059
53
+ F061
54
+ F062
55
+ F063
56
+ F065
57
+ F066
58
+ F067
59
+ F069
60
+ F070
61
+ F071
62
+ F073
63
+ F074
64
+ F076
65
+ F077
66
+ F079
67
+ F081
68
+ F083
69
+ F084
70
+ F085
71
+ F087
72
+ F090
73
+ F091
74
+ F092
75
+ F093
76
+ F094
77
+ F095
78
+ F096
79
+ F097
80
+ F098
81
+ F099
82
+ F100
83
+ F101
84
+ F103
85
+ F104
86
+ F105
87
+ F106
88
+ F107
89
+ F108
90
+ F110
91
+ F111
92
+ F112
93
+ F113
94
+ F114
95
+ F115
96
+ F116
97
+ F117
98
+ F118
99
+ F119
100
+ F120
101
+ F121
102
+ F123
103
+ F124
104
+ F125
105
+ F126
106
+ F127
107
+ F128
108
+ F130
109
+ F131
110
+ F132
111
+ F133
112
+ F134
113
+ F135
114
+ F136
115
+ F137
116
+ F138
117
+ F143A
118
+ F143B
119
+ F144A
120
+ F145A
121
+ F145B
122
+ F146A
123
+ F146B
124
+ F149
125
+ F150
126
+ FP01
127
+ FP02
128
+ FP03
129
+ FP04
130
+ FP05
131
+ M001
132
+ M002
133
+ M003
134
+ M004
135
+ M007
136
+ M008
137
+ M009
138
+ M010
139
+ M011
140
+ M012
141
+ M013
142
+ M015
143
+ M016
144
+ M018
145
+ M019
146
+ M020
147
+ M021
148
+ M022
149
+ M023
150
+ M024
151
+ M025
152
+ M027
153
+ M028
154
+ M029
155
+ M030
156
+ M031
157
+ M032
158
+ M033
159
+ M034
160
+ M035
161
+ M036
162
+ M037
163
+ M038
164
+ M039
165
+ M040
166
+ M041
167
+ M042
168
+ M043
169
+ M044
170
+ M045
171
+ M046
172
+ M047
173
+ M049
174
+ M050
175
+ M051
176
+ M052
177
+ M053
178
+ M054
179
+ M055
180
+ M056
181
+ M057
182
+ M058
183
+ M059
184
+ M061
185
+ M062
186
+ M063
187
+ M065
188
+ M066
189
+ M067
190
+ M069
191
+ M070
192
+ M071
193
+ M073
194
+ M074
195
+ M076
196
+ M077
197
+ M079
198
+ M081
199
+ M083
200
+ M084
201
+ M085
202
+ M087
203
+ M090
204
+ M091
205
+ M092
206
+ M093
207
+ M094
208
+ M095
209
+ M096
210
+ M097
211
+ M098
212
+ M099
213
+ M100
214
+ M101
215
+ M103
216
+ M104
217
+ M105
218
+ M106
219
+ M107
220
+ M108
221
+ M110
222
+ M111
223
+ M112
224
+ M113
225
+ M114
226
+ M116
227
+ M117
228
+ M118
229
+ M119
230
+ M120
231
+ M121
232
+ M123
233
+ M124
234
+ M125
235
+ M126
236
+ M127
237
+ M128
238
+ M130
239
+ M131
240
+ M132
241
+ M133
242
+ M134
243
+ M135
244
+ M136
245
+ M137
246
+ M138
247
+ M139A
248
+ M139B
249
+ M140A
250
+ M140B
251
+ M141A
252
+ M141B
253
+ M142A
254
+ M142B
255
+ M147
256
+ M148
257
+ MP01
258
+ MP02
259
+ MP03
260
+ MP04
261
+ MP05
ParallelWaveGAN/egs/jnas/voc1/local/data_prep.sh ADDED
@@ -0,0 +1,89 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ # shellcheck disable=SC1091
7
+ . ./path.sh || exit 1;
8
+
9
+ num_dev=500
10
+ train_set="train_nodev"
11
+ dev_set="dev"
12
+ eval_set="eval"
13
+ shuffle=false
14
+
15
+ # shellcheck disable=SC1091
16
+ . utils/parse_options.sh || exit 1;
17
+
18
+ # check arguments
19
+ if [ $# != 3 ]; then
20
+ echo "Usage: $0 <db_root> <data_dir> <spk_list>"
21
+ echo "e.g.: $0 /database/JNAS data conf/train_speakers.txt"
22
+ echo ""
23
+ echo "Options:"
24
+ echo " --num_dev: number of development uttreances (default=500)."
25
+ echo " --train_set: name of train set (default=train_nodev)."
26
+ echo " --dev_set: name of dev set (default=dev)."
27
+ echo " --eval_set: name of eval set (default=eval)."
28
+ echo " --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
29
+ exit 1
30
+ fi
31
+
32
+ set -euo pipefail
33
+
34
+ db_root=$1 # database root directory
35
+ data_dir=$2
36
+ spk_list=$3
37
+
38
+ eval_db_root=${db_root}/DOCS/Test_set
39
+ wav_type=HS # DT or HS
40
+
41
+ # make directories
42
+ for name in train "${eval_set}"; do
43
+ [ ! -e "${data_dir}/${name}" ] && mkdir -p "${data_dir}/${name}"
44
+ done
45
+
46
+ # make training & development data
47
+ scp="${data_dir}/train/wav.scp"
48
+
49
+ # check file existence
50
+ [ -e "${scp}" ] && rm "${scp}"
51
+
52
+ # shellcheck disable=SC2013
53
+ for spk in $(cat "${spk_list}"); do
54
+ wavdir=${db_root}/WAVES_${wav_type}/${spk}
55
+ [ ! -e "${wavdir}" ] && echo "There are no such a directory (${wavdir})" && exit 1
56
+ find "${wavdir}" -follow -name "*.wav" | sort | while read -r filename; do
57
+ id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
58
+ echo "${spk}_${id} ${filename}" >> "${scp}"
59
+ done
60
+ done
61
+
62
+ # shuffle
63
+ cp "${scp}" "${scp}.tmp"
64
+ sort -R "${scp}.tmp" > "${scp}"
65
+ rm -r "${scp}.tmp"
66
+
67
+ # split
68
+ utils/split_data.sh \
69
+ --num_second ${num_dev} \
70
+ --shuffle "${shuffle}" \
71
+ "${data_dir}/train" \
72
+ "${data_dir}/${train_set}" \
73
+ "${data_dir}/${dev_set}"
74
+
75
+ # make evaluation data
76
+ scp="${data_dir}/${eval_set}/wav.scp"
77
+
78
+ # check file existence
79
+ [ -e "${scp}" ] && rm "${scp}"
80
+
81
+ for name in JNAS_testset_100 JNAS_testset_500; do
82
+ find "${eval_db_root}/${name}/WAVES" -follow -name "*.wav" | sort | while read -r filename; do
83
+ id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
84
+ dirname=$(basename "$(dirname "${filename}")")
85
+ echo "${name}_${dirname}_${id} ${filename}" >> "${scp}"
86
+ done
87
+ done
88
+
89
+ echo "Successfully prepared data."
ParallelWaveGAN/egs/jnas/voc1/path.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # cuda related
2
+ export CUDA_HOME=/usr/local/cuda-10.0
3
+ export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
4
+
5
+ # path related
6
+ export PRJ_ROOT="${PWD}/../../.."
7
+ if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
8
+ # shellcheck disable=SC1090
9
+ . "${PRJ_ROOT}/tools/venv/bin/activate"
10
+ fi
11
+
12
+ # python related
13
+ export OMP_NUM_THREADS=1
14
+ export PYTHONIOENCODING=UTF-8
15
+ export MPL_BACKEND=Agg
16
+
17
+ # check installation
18
+ if ! command -v parallel-wavegan-train > /dev/null; then
19
+ echo "Error: It seems setup is not finished." >&2
20
+ echo "Error: Please setup your environment by following README.md" >&2
21
+ return 1
22
+ fi
23
+ if ! command -v jq > /dev/null; then
24
+ echo "Error: It seems jq is not installed." >&2
25
+ echo "Error: Please install via \`sudo apt-get install jq\`." >&2
26
+ echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
27
+ return 1
28
+ fi
29
+ if ! command -v yq > /dev/null; then
30
+ echo "Error: It seems yq is not installed." >&2
31
+ echo "Error: Please install via \`pip install yq\`." >&2
32
+ return 1
33
+ fi
ParallelWaveGAN/egs/jnas/voc1/run.sh ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ . ./cmd.sh || exit 1;
7
+ . ./path.sh || exit 1;
8
+
9
+ # basic settings
10
+ stage=0 # stage to start
11
+ stop_stage=100 # stage to stop
12
+ verbose=1 # verbosity level (lower is less info)
13
+ n_gpus=1 # number of gpus in training
14
+ n_jobs=16 # number of parallel jobs in feature extraction
15
+
16
+ # NOTE(kan-bayashi): renamed to conf to avoid conflict in parse_options.sh
17
+ conf=conf/parallel_wavegan.v1.yaml
18
+
19
+ # directory path setting
20
+ db_root=/database/JNAS # database directory
21
+ dumpdir=dump # directory to dump features
22
+
23
+ # training related setting
24
+ tag="" # tag for directory to save model
25
+ resume="" # checkpoint path to resume training
26
+ # (e.g. <path>/<to>/checkpoint-10000steps.pkl)
27
+
28
+ # decoding related setting
29
+ checkpoint="" # checkpoint path to be used for decoding
30
+ # if not provided, the latest one will be used
31
+ # (e.g. <path>/<to>/checkpoint-400000steps.pkl)
32
+
33
+ # shellcheck disable=SC1091
34
+ . utils/parse_options.sh || exit 1;
35
+
36
+ train_set="train_nodev" # name of training data directory
37
+ dev_set="dev" # name of development data direcotry
38
+ eval_set="eval" # name of evaluation data direcotry
39
+
40
+ set -euo pipefail
41
+
42
+ if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
43
+ echo "Stage 0: Data preparation"
44
+ local/data_prep.sh \
45
+ --train_set "${train_set}" \
46
+ --dev_set "${dev_set}" \
47
+ --eval_set "${eval_set}" \
48
+ "${db_root}" data conf/train_speakers.txt
49
+ fi
50
+
51
+ stats_ext=$(grep -q "hdf5" <(yq ".format" "${conf}") && echo "h5" || echo "npy")
52
+ if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
53
+ echo "Stage 1: Feature extraction"
54
+ # extract raw features
55
+ pids=()
56
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
57
+ (
58
+ [ ! -e "${dumpdir}/${name}/raw" ] && mkdir -p "${dumpdir}/${name}/raw"
59
+ echo "Feature extraction start. See the progress via ${dumpdir}/${name}/raw/preprocessing.*.log."
60
+ utils/make_subset_data.sh "data/${name}" "${n_jobs}" "${dumpdir}/${name}/raw"
61
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/raw/preprocessing.JOB.log" \
62
+ parallel-wavegan-preprocess \
63
+ --config "${conf}" \
64
+ --scp "${dumpdir}/${name}/raw/wav.JOB.scp" \
65
+ --dumpdir "${dumpdir}/${name}/raw/dump.JOB" \
66
+ --verbose "${verbose}"
67
+ echo "Successfully finished feature extraction of ${name} set."
68
+ ) &
69
+ pids+=($!)
70
+ done
71
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
72
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
73
+ echo "Successfully finished feature extraction."
74
+
75
+ # calculate statistics for normalization
76
+ echo "Statistics computation start. See the progress via ${dumpdir}/${train_set}/compute_statistics.log."
77
+ ${train_cmd} "${dumpdir}/${train_set}/compute_statistics.log" \
78
+ parallel-wavegan-compute-statistics \
79
+ --config "${conf}" \
80
+ --rootdir "${dumpdir}/${train_set}/raw" \
81
+ --dumpdir "${dumpdir}/${train_set}" \
82
+ --verbose "${verbose}"
83
+ echo "Successfully finished calculation of statistics."
84
+
85
+ # normalize and dump them
86
+ pids=()
87
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
88
+ (
89
+ [ ! -e "${dumpdir}/${name}/norm" ] && mkdir -p "${dumpdir}/${name}/norm"
90
+ echo "Nomalization start. See the progress via ${dumpdir}/${name}/norm/normalize.*.log."
91
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/norm/normalize.JOB.log" \
92
+ parallel-wavegan-normalize \
93
+ --config "${conf}" \
94
+ --stats "${dumpdir}/${train_set}/stats.${stats_ext}" \
95
+ --rootdir "${dumpdir}/${name}/raw/dump.JOB" \
96
+ --dumpdir "${dumpdir}/${name}/norm/dump.JOB" \
97
+ --verbose "${verbose}"
98
+ echo "Successfully finished normalization of ${name} set."
99
+ ) &
100
+ pids+=($!)
101
+ done
102
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
103
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
104
+ echo "Successfully finished normalization."
105
+ fi
106
+
107
+ if [ -z "${tag}" ]; then
108
+ expdir="exp/${train_set}_jnas_$(basename "${conf}" .yaml)"
109
+ else
110
+ expdir="exp/${train_set}_jnas_${tag}"
111
+ fi
112
+ if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
113
+ echo "Stage 2: Network training"
114
+ [ ! -e "${expdir}" ] && mkdir -p "${expdir}"
115
+ cp "${dumpdir}/${train_set}/stats.${stats_ext}" "${expdir}"
116
+ if [ "${n_gpus}" -gt 1 ]; then
117
+ train="python -m parallel_wavegan.distributed.launch --nproc_per_node ${n_gpus} -c parallel-wavegan-train"
118
+ else
119
+ train="parallel-wavegan-train"
120
+ fi
121
+ echo "Training start. See the progress via ${expdir}/train.log."
122
+ ${cuda_cmd} --gpu "${n_gpus}" "${expdir}/train.log" \
123
+ ${train} \
124
+ --config "${conf}" \
125
+ --train-dumpdir "${dumpdir}/${train_set}/norm" \
126
+ --dev-dumpdir "${dumpdir}/${dev_set}/norm" \
127
+ --outdir "${expdir}" \
128
+ --resume "${resume}" \
129
+ --verbose "${verbose}"
130
+ echo "Successfully finished training."
131
+ fi
132
+
133
+ if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
134
+ echo "Stage 3: Network decoding"
135
+ # shellcheck disable=SC2012
136
+ [ -z "${checkpoint}" ] && checkpoint="$(ls -dt "${expdir}"/*.pkl | head -1 || true)"
137
+ outdir="${expdir}/wav/$(basename "${checkpoint}" .pkl)"
138
+ pids=()
139
+ for name in "${dev_set}" "${eval_set}"; do
140
+ (
141
+ [ ! -e "${outdir}/${name}" ] && mkdir -p "${outdir}/${name}"
142
+ [ "${n_gpus}" -gt 1 ] && n_gpus=1
143
+ echo "Decoding start. See the progress via ${outdir}/${name}/decode.log."
144
+ ${cuda_cmd} --gpu "${n_gpus}" "${outdir}/${name}/decode.log" \
145
+ parallel-wavegan-decode \
146
+ --dumpdir "${dumpdir}/${name}/norm" \
147
+ --checkpoint "${checkpoint}" \
148
+ --outdir "${outdir}/${name}" \
149
+ --verbose "${verbose}"
150
+ echo "Successfully finished decoding of ${name} set."
151
+ ) &
152
+ pids+=($!)
153
+ done
154
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
155
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
156
+ echo "Successfully finished decoding."
157
+ fi
158
+ echo "Finished."
ParallelWaveGAN/egs/jnas/voc1/utils ADDED
@@ -0,0 +1 @@
 
 
1
+ ../../../utils
ParallelWaveGAN/egs/jsss/voc1/cmd.sh ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
2
+ # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
3
+ # e.g.
4
+ # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
5
+ #
6
+ # Options:
7
+ # --time <time>: Limit the maximum time to execute.
8
+ # --mem <mem>: Limit the maximum memory usage.
9
+ # --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
10
+ # --num-threads <nthread>: Specify the number of CPU cores.
11
+ # --gpu <ngpu>: Specify the number of GPU devices.
12
+ # --config: Change the configuration file from default.
13
+ #
14
+ # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15
+ # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16
+ # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17
+ # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18
+ #
19
+ # run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
20
+ # These options are mapping to specific options for each backend and
21
+ # it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22
+ # If jobs failed, your configuration might be wrong for your environment.
23
+ #
24
+ #
25
+ # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
26
+ # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27
+ # =========================================================
28
+
29
+
30
+ # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31
+ cmd_backend="local"
32
+
33
+ # Local machine, without any Job scheduling system
34
+ if [ "${cmd_backend}" = local ]; then
35
+
36
+ # The other usage
37
+ export train_cmd="utils/run.pl"
38
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39
+ export cuda_cmd="utils/run.pl"
40
+ # Used for "*_recog.py"
41
+ export decode_cmd="utils/run.pl"
42
+
43
+ # Local machine, without any Job scheduling system
44
+ elif [ "${cmd_backend}" = stdout ]; then
45
+
46
+ # The other usage
47
+ export train_cmd="utils/stdout.pl"
48
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49
+ export cuda_cmd="utils/stdout.pl"
50
+ # Used for "*_recog.py"
51
+ export decode_cmd="utils/stdout.pl"
52
+
53
+ # "qsub" (SGE, Torque, PBS, etc.)
54
+ elif [ "${cmd_backend}" = sge ]; then
55
+ # The default setting is written in conf/queue.conf.
56
+ # You must change "-q g.q" for the "queue" for your environment.
57
+ # To know the "queue" names, type "qhost -q"
58
+ # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59
+
60
+ export train_cmd="utils/queue.pl"
61
+ export cuda_cmd="utils/queue.pl"
62
+ export decode_cmd="utils/queue.pl"
63
+
64
+ # "sbatch" (Slurm)
65
+ elif [ "${cmd_backend}" = slurm ]; then
66
+ # The default setting is written in conf/slurm.conf.
67
+ # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
68
+ # To know the "partion" names, type "sinfo".
69
+ # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
70
+ # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71
+
72
+ export train_cmd="utils/slurm.pl"
73
+ export cuda_cmd="utils/slurm.pl"
74
+ export decode_cmd="utils/slurm.pl"
75
+
76
+ elif [ "${cmd_backend}" = ssh ]; then
77
+ # You have to create ".queue/machines" to specify the host to execute jobs.
78
+ # e.g. .queue/machines
79
+ # host1
80
+ # host2
81
+ # host3
82
+ # Assuming you can log in to them without a password, i.e., you have to set up ssh keys.
83
+
84
+ export train_cmd="utils/ssh.pl"
85
+ export cuda_cmd="utils/ssh.pl"
86
+ export decode_cmd="utils/ssh.pl"
87
+
88
+ else
89
+ echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90
+ return 1
91
+ fi
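To make the unified interface described above concrete: the same submission line works with every backend, and only the scheduler behind it changes. A minimal, illustrative example (the log directory is made up):

    # runs 4 array jobs locally with utils/run.pl when cmd_backend="local";
    # with cmd_backend="slurm" the identical line is mapped to sbatch via conf/slurm.conf
    ${train_cmd} JOB=1:4 exp/demo/echo.JOB.log echo JOB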
ParallelWaveGAN/egs/jsss/voc1/conf/parallel_wavegan.v1.yaml ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is the hyperparameter configuration file for Parallel WaveGAN.
2
+ # Please make sure this is adjusted for the JSSS dataset. If you want to
3
+ # apply it to another dataset, you might need to carefully change some parameters.
4
+ # This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.
5
+
6
+ ###########################################################
7
+ # FEATURE EXTRACTION SETTING #
8
+ ###########################################################
9
+ sampling_rate: 24000 # Sampling rate.
10
+ fft_size: 2048 # FFT size.
11
+ hop_size: 300 # Hop size.
12
+ win_length: 1200 # Window length.
13
+ # If set to null, it will be the same as fft_size.
14
+ window: "hann" # Window function.
15
+ num_mels: 80 # Number of mel basis.
16
+ fmin: 80 # Minimum freq in mel basis calculation.
17
+ fmax: 7600 # Maximum frequency in mel basis calculation.
18
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
19
+ trim_silence: false # Whether to trim the start and end of silence.
20
+ trim_threshold_in_db: 40 # Need to tune carefully if the recording is not good.
21
+ trim_frame_size: 2048 # Frame size in trimming.
22
+ trim_hop_size: 512 # Hop size in trimming.
23
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
24
+
25
+ ###########################################################
26
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
27
+ ###########################################################
28
+ generator_params:
29
+ in_channels: 1 # Number of input channels.
30
+ out_channels: 1 # Number of output channels.
31
+ kernel_size: 3 # Kernel size of dilated convolution.
32
+ layers: 30 # Number of residual block layers.
33
+ stacks: 3 # Number of stacks i.e., dilation cycles.
34
+ residual_channels: 64 # Number of channels in residual conv.
35
+ gate_channels: 128 # Number of channels in gated conv.
36
+ skip_channels: 64 # Number of channels in skip conv.
37
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
38
+ # Must be the same as num_mels.
39
+ aux_context_window: 2 # Context window size for auxiliary feature.
40
+ # If set to 2, previous 2 and future 2 frames will be considered.
41
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
42
+ use_weight_norm: true # Whether to use weight norm.
43
+ # If set to true, it will be applied to all of the conv layers.
44
+ upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
45
+ upsample_params: # Upsampling network parameters.
46
+ upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must equal the hop size.
47
+
48
+ ###########################################################
49
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
50
+ ###########################################################
51
+ discriminator_params:
52
+ in_channels: 1 # Number of input channels.
53
+ out_channels: 1 # Number of output channels.
54
+ kernel_size: 3 # Kernel size of conv layers.
55
+ layers: 10 # Number of conv layers.
56
+ conv_channels: 64 # Number of channels in conv layers.
57
+ bias: true # Whether to use bias parameter in conv.
58
+ use_weight_norm: true # Whether to use weight norm.
59
+ # If set to true, it will be applied to all of the conv layers.
60
+ nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
61
+ nonlinear_activation_params: # Nonlinear function parameters
62
+ negative_slope: 0.2 # Alpha in LeakyReLU.
63
+
64
+ ###########################################################
65
+ # STFT LOSS SETTING #
66
+ ###########################################################
67
+ stft_loss_params:
68
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
69
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
70
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
71
+ window: "hann_window" # Window function for STFT-based loss
72
+
73
+ ###########################################################
74
+ # ADVERSARIAL LOSS SETTING #
75
+ ###########################################################
76
+ lambda_adv: 4.0 # Loss balancing coefficient.
77
+
78
+ ###########################################################
79
+ # DATA LOADER SETTING #
80
+ ###########################################################
81
+ batch_size: 6 # Batch size.
82
+ batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size.
83
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
84
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
85
+ remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
86
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
87
+
88
+ ###########################################################
89
+ # OPTIMIZER & SCHEDULER SETTING #
90
+ ###########################################################
91
+ generator_optimizer_params:
92
+ lr: 0.0001 # Generator's learning rate.
93
+ eps: 1.0e-6 # Generator's epsilon.
94
+ weight_decay: 0.0 # Generator's weight decay coefficient.
95
+ generator_scheduler_params:
96
+ step_size: 200000 # Generator's scheduler step size.
97
+ gamma: 0.5 # Generator's scheduler gamma.
98
+ # At each step size, lr will be multiplied by this parameter.
99
+ generator_grad_norm: 10 # Generator's gradient norm.
100
+ discriminator_optimizer_params:
101
+ lr: 0.00005 # Discriminator's learning rate.
102
+ eps: 1.0e-6 # Discriminator's epsilon.
103
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
104
+ discriminator_scheduler_params:
105
+ step_size: 200000 # Discriminator's scheduler step size.
106
+ gamma: 0.5 # Discriminator's scheduler gamma.
107
+ # At each step size, lr will be multiplied by this parameter.
108
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
109
+
110
+ ###########################################################
111
+ # INTERVAL SETTING #
112
+ ###########################################################
113
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
114
+ train_max_steps: 400000 # Number of training steps.
115
+ save_interval_steps: 5000 # Interval steps to save checkpoint.
116
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
117
+ log_interval_steps: 100 # Interval steps to record the training log.
118
+
119
+ ###########################################################
120
+ # OTHER SETTING #
121
+ ###########################################################
122
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
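Two internal constraints of this configuration can be checked mechanically: aux_channels must equal num_mels, and the product of upsample_scales must equal hop_size (4 * 5 * 3 * 5 = 300 here). A small sanity-check sketch, assuming yq is installed as path.sh requires:

    conf=conf/parallel_wavegan.v1.yaml
    echo "hop_size:         $(yq ".hop_size" "${conf}")"
    echo "upsample product: $((4 * 5 * 3 * 5))"
    echo "num_mels:         $(yq ".num_mels" "${conf}")"
    echo "aux_channels:     $(yq ".generator_params.aux_channels" "${conf}")"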
ParallelWaveGAN/egs/jsss/voc1/conf/slurm.conf ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Default configuration
2
+ command sbatch --export=PATH --ntasks-per-node=1
3
+ option time=* --time $0
4
+ option mem=* --mem-per-cpu $0
5
+ option mem=0 # Do not add anything to qsub_opts
6
+ option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
7
+ option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts
8
+ default gpu=0
9
+ option gpu=0 -p cpu
10
+ option gpu=* -p gpu --gres=gpu:$0
11
+ # note: the --max-jobs-run option is supported as a special case
12
+ # by slurm.pl and you don't have to handle it in the config file.
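In rough terms, each option line maps one of the unified flags from cmd.sh onto sbatch arguments, so a GPU training job expands approximately as sketched below (illustrative, not an exact command line):

    # ${cuda_cmd} --gpu 1 --mem 16G exp/train.log parallel-wavegan-train ...
    # is submitted by utils/slurm.pl roughly as
    #   sbatch --export=PATH --ntasks-per-node=1 -p gpu --gres=gpu:1 --mem-per-cpu 16G ...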
ParallelWaveGAN/egs/jsss/voc1/local/data_download.sh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Copyright 2020 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ # Download JSSS Corpus
7
+
8
+ # shellcheck disable=SC1091
9
+ . ./path.sh || exit 1
10
+
11
+ download_dir=$1
12
+
13
+ # check arguments
14
+ if [ $# != 1 ]; then
15
+ echo "Usage: $0 <download_dir>"
16
+ exit 1
17
+ fi
18
+
19
+ set -euo pipefail
20
+
21
+ url="https://drive.google.com/a/g.sp.m.is.nagoya-u.ac.jp/uc?id=1NyiZCXkYTdYBNtD1B-IMAYCVa-0SQsKX"
22
+ if [ ! -e "${download_dir}/jsss_ver1" ]; then
23
+ utils/download_from_google_drive.sh "${url}" "${download_dir}" zip
24
+ echo "Successfully downloaded JSSS corpus."
25
+ else
26
+ echo "Already exists. Skipped."
27
+ fi
28
+
29
+ cwd=$(pwd)
30
+ if [ ! -e "${download_dir}/JSSSLabel" ]; then
31
+ echo "Downloading full-context labels for jsut v1.1..."
32
+ cd "${download_dir}"
33
+ git clone https://github.com/kan-bayashi/JSSSLabel
34
+ for name in long-form short-form simplification summarization; do
35
+ cp -vr JSSSLabel/${name} jsss_ver1
36
+ done
37
+ cd "${cwd}"
38
+ echo "Successfully downloaded JSSS label."
39
+ else
40
+ echo "Already exists. Skipped."
41
+ fi
ParallelWaveGAN/egs/jsss/voc1/local/data_prep.sh ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Copyright 2020 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ # Prepare kaldi-style data directory for JSSS corpus
7
+
8
+ fs=24000
9
+ num_dev=50
10
+ num_eval=50
11
+ train_set="train_nodev"
12
+ dev_set="dev"
13
+ eval_set="eval"
14
+ shuffle=false
15
+
16
+ # shellcheck disable=SC1091
17
+ . utils/parse_options.sh || exit 1;
18
+
19
+ db=$1
20
+ data_dir_root=$2
21
+
22
+ # check arguments
23
+ if [ $# != 2 ]; then
24
+ echo "Usage: $0 [Options] <db> <data_dir>"
25
+ echo "e.g.: $0 downloads/jsss_ver1 data"
26
+ echo ""
27
+ echo "Options:"
28
+ echo " --fs: target sampling rate (default=24000)."
29
+ echo " --num_dev: number of development uttreances (default=50)."
30
+ echo " --num_eval: number of evaluation uttreances (default=50)."
31
+ echo " --train_set: name of train set (default=train_nodev)."
32
+ echo " --dev_set: name of dev set (default=dev)."
33
+ echo " --eval_set: name of eval set (default=eval)."
34
+ echo " --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
35
+ exit 1
36
+ fi
37
+
38
+ set -euo pipefail
39
+
40
+ ######################################
41
+ # process data without segments #
42
+ ######################################
43
+ dsets_without_segments="
44
+ short-form/basic5000
45
+ short-form/onomatopee300
46
+ short-form/voiceactress100
47
+ simplification
48
+ "
49
+ for dset in ${dsets_without_segments}; do
50
+ # check directory existence
51
+ _data_dir=${data_dir_root}/$(basename "${dset}")
52
+ [ ! -e "${_data_dir}" ] && mkdir -p "${_data_dir}"
53
+
54
+ # set filenames
55
+ scp=${_data_dir}/wav.scp
56
+ segments=${_data_dir}/segments
57
+
58
+ # check file existence
59
+ [ -e "${scp}" ] && rm "${scp}"
60
+ [ -e "${segments}" ] && rm "${segments}"
61
+
62
+ # make wav.scp and segments
63
+ find "${db}/${dset}/wav24kHz16bit" -name "*.wav" | sort | while read -r filename; do
64
+ utt_id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
65
+ lab_filename="${db}/${dset}/lab/$(basename "${filename}" .wav).lab"
66
+ if [ ! -e "${lab_filename}" ]; then
67
+ echo "${lab_filename} does not exist. Skipped."
68
+ continue
69
+ fi
70
+ start_sec=$(head -n 1 "${lab_filename}" | cut -d " " -f 2)
71
+ end_sec=$(tail -n 1 "${lab_filename}" | cut -d " " -f 1)
72
+ echo "${utt_id} ${utt_id} ${start_sec} ${end_sec}" >> "${segments}"
73
+ if [ "${fs}" -eq 24000 ]; then
74
+ # default sampling rate
75
+ echo "${utt_id} ${filename}" >> "${scp}"
76
+ else
77
+ echo "${utt_id} sox ${filename} -t wav -r $fs - |" >> "${scp}"
78
+ fi
79
+ done
80
+ echo "Successfully prepared ${dset}."
81
+ done
82
+
83
+ ######################################
84
+ # process data with segments #
85
+ ######################################
86
+ dsets_with_segments="
87
+ long-form/katsura-masakazu
88
+ long-form/udon
89
+ long-form/washington-dc
90
+ summarization
91
+ "
92
+ for dset in ${dsets_with_segments}; do
93
+ # check directory existence
94
+ _data_dir=${data_dir_root}/$(basename "${dset}")
95
+ [ ! -e "${_data_dir}" ] && mkdir -p "${_data_dir}"
96
+
97
+ # set filenames
98
+ scp=${_data_dir}/wav.scp
99
+ segments=${_data_dir}/segments
100
+
101
+ # check file existence
102
+ [ -e "${scp}" ] && rm "${scp}"
103
+ [ -e "${segments}" ] && rm "${segments}"
104
+
105
+ # make wav.scp
106
+ find "${db}/${dset}/wav24kHz16bit" -name "*.wav" | sort | while read -r filename; do
107
+ wav_id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
108
+ if [ "${fs}" -eq 24000 ]; then
109
+ # default sampling rate
110
+ echo "${wav_id} ${filename}" >> "${scp}"
111
+ else
112
+ echo "${wav_id} sox ${filename} -t wav -r $fs - |" >> "${scp}"
113
+ fi
114
+ done
115
+
116
+ # make segments
117
+ find "${db}/${dset}/transcript_utf8" -name "*.txt" | sort | while read -r filename; do
118
+ wav_id=$(basename "${filename}" .txt)
119
+ while read -r line; do
120
+ start_sec=$(echo "${line}" | cut -f 1)
121
+ end_sec=$(echo "${line}" | cut -f 2)
122
+ utt_id=${wav_id}
123
+ utt_id+="_$(printf %010d "$(echo "${start_sec}" | tr -d "." | sed -e "s/^[0]*//g")")"
124
+ utt_id+="_$(printf %010d "$(echo "${end_sec}" | tr -d "." | sed -e "s/^[0]*//g")")"
125
+
126
+ # modify segment information with force alignment results
127
+ lab_filename=${db}/${dset}/lab/${utt_id}.lab
128
+ if [ ! -e "${lab_filename}" ]; then
129
+ echo "${lab_filename} does not exist. Skipped."
130
+ continue
131
+ fi
132
+ start_sec_offset=$(head -n 1 "${lab_filename}" | cut -d " " -f 2)
133
+ end_sec_offset=$(tail -n 1 "${lab_filename}" | cut -d " " -f 1)
134
+ start_sec=$(python -c "print(${start_sec} + ${start_sec_offset})")
135
+ end_sec=$(python -c "print(${start_sec} + ${end_sec_offset} - ${start_sec_offset})")
136
+ echo "${utt_id} ${wav_id} ${start_sec} ${end_sec}" >> "${segments}"
137
+ done < "${filename}"
138
+ done
139
+
140
+ # fix
141
+ echo "Successfully prepared ${dset}."
142
+ done
143
+
144
+ ######################################
145
+ # combine and split data #
146
+ ######################################
147
+ # combine all data
148
+ combined_data_dirs=""
149
+ for dset in ${dsets_without_segments} ${dsets_with_segments}; do
150
+ combined_data_dirs+="${data_dir_root}/$(basename "${dset}") "
151
+ done
152
+ # shellcheck disable=SC2086
153
+ utils/combine_data.sh "${data_dir_root}/all" ${combined_data_dirs}
154
+ # shellcheck disable=SC2086
155
+ rm -rf ${combined_data_dirs}
156
+
157
+ # split
158
+ num_all=$(wc -l < "${data_dir_root}/all/segments")
159
+ num_deveval=$((num_dev + num_eval))
160
+ num_train=$((num_all - num_deveval))
161
+ utils/split_data.sh \
162
+ --num_first "${num_deveval}" \
163
+ --num_second "${num_train}" \
164
+ --shuffle "${shuffle}" \
165
+ "${data_dir_root}/all" \
166
+ "${data_dir_root}/deveval" \
167
+ "${data_dir_root}/${train_set}"
168
+ utils/split_data.sh \
169
+ --num_first "${num_eval}" \
170
+ --num_second "${num_dev}" \
171
+ --shuffle "${shuffle}" \
172
+ "${data_dir_root}/deveval" \
173
+ "${data_dir_root}/${eval_set}" \
174
+ "${data_dir_root}/${dev_set}"
175
+
176
+ # remove tmp directories
177
+ rm -rf "${data_dir_root}/all"
178
+ rm -rf "${data_dir_root}/deveval"
179
+
180
+ echo "Successfully prepared data."
ParallelWaveGAN/egs/jsss/voc1/path.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # cuda related
2
+ export CUDA_HOME=/usr/local/cuda-10.0
3
+ export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
4
+
5
+ # path related
6
+ export PRJ_ROOT="${PWD}/../../.."
7
+ if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
8
+ # shellcheck disable=SC1090
9
+ . "${PRJ_ROOT}/tools/venv/bin/activate"
10
+ fi
11
+
12
+ # python related
13
+ export OMP_NUM_THREADS=1
14
+ export PYTHONIOENCODING=UTF-8
15
+ export MPL_BACKEND=Agg
16
+
17
+ # check installation
18
+ if ! command -v parallel-wavegan-train > /dev/null; then
19
+ echo "Error: It seems setup is not finished." >&2
20
+ echo "Error: Please setup your environment by following README.md" >&2
21
+ return 1
22
+ fi
23
+ if ! command -v jq > /dev/null; then
24
+ echo "Error: It seems jq is not installed." >&2
25
+ echo "Error: Please install via \`sudo apt-get install jq\`." >&2
26
+ echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
27
+ return 1
28
+ fi
29
+ if ! command -v yq > /dev/null; then
30
+ echo "Error: It seems yq is not installed." >&2
31
+ echo "Error: Please install via \`pip install yq\`." >&2
32
+ return 1
33
+ fi
ParallelWaveGAN/egs/jsss/voc1/run.sh ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Copyright 2020 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ . ./cmd.sh || exit 1;
7
+ . ./path.sh || exit 1;
8
+
9
+ # basic settings
10
+ stage=-1 # stage to start
11
+ stop_stage=100 # stage to stop
12
+ verbose=1 # verbosity level (lower is less info)
13
+ n_gpus=1 # number of gpus in training
14
+ n_jobs=4 # number of parallel jobs in feature extraction
15
+
16
+ # NOTE(kan-bayashi): renamed to conf to avoid conflict in parse_options.sh
17
+ conf=conf/parallel_wavegan.v1.yaml
18
+
19
+ # directory path setting
20
+ download_dir=downloads # directory to save downloaded files
21
+ dumpdir=dump # directory to dump features
22
+
23
+ # subset setting
24
+ shuffle=false # whether to shuffle the data to create subset
25
+ num_dev=50 # the number of development data
26
+ num_eval=50 # the number of evaluation data
27
+ # (if set to 0, the same dev set is used as eval set)
28
+
29
+ # training related setting
30
+ tag="" # tag for directory to save model
31
+ resume="" # checkpoint path to resume training
32
+ # (e.g. <path>/<to>/checkpoint-10000steps.pkl)
33
+ pretrain="" # checkpoint path to load pretrained parameters
34
+ # (e.g. ../../jsut/<path>/<to>/checkpoint-400000steps.pkl)
35
+
36
+ # decoding related setting
37
+ checkpoint="" # checkpoint path to be used for decoding
38
+ # if not provided, the latest one will be used
39
+ # (e.g. <path>/<to>/checkpoint-400000steps.pkl)
40
+
41
+ # shellcheck disable=SC1091
42
+ . utils/parse_options.sh || exit 1;
43
+
44
+ train_set="train_nodev" # name of training data directory
45
+ dev_set="dev" # name of development data direcotry
46
+ eval_set="eval" # name of evaluation data direcotry
47
+
48
+ set -euo pipefail
49
+
50
+ if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
51
+ echo "Stage -1: Data download"
52
+ local/data_download.sh "${download_dir}"
53
+ fi
54
+
55
+ if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
56
+ echo "Stage 0: Data preparation"
57
+ local/data_prep.sh \
58
+ --fs "$(yq ".sampling_rate" "${conf}")" \
59
+ --num_dev "${num_dev}" \
60
+ --num_eval "${num_eval}" \
61
+ --train_set "${train_set}" \
62
+ --dev_set "${dev_set}" \
63
+ --eval_set "${eval_set}" \
64
+ --shuffle "${shuffle}" \
65
+ "${download_dir}/jsss_ver1" data
66
+ fi
67
+
68
+ stats_ext=$(grep -q "hdf5" <(yq ".format" "${conf}") && echo "h5" || echo "npy")
69
+ if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
70
+ echo "Stage 1: Feature extraction"
71
+ # extract raw features
72
+ pids=()
73
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
74
+ (
75
+ [ ! -e "${dumpdir}/${name}/raw" ] && mkdir -p "${dumpdir}/${name}/raw"
76
+ echo "Feature extraction start. See the progress via ${dumpdir}/${name}/raw/preprocessing.*.log."
77
+ utils/make_subset_data.sh "data/${name}" "${n_jobs}" "${dumpdir}/${name}/raw"
78
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/raw/preprocessing.JOB.log" \
79
+ parallel-wavegan-preprocess \
80
+ --config "${conf}" \
81
+ --scp "${dumpdir}/${name}/raw/wav.JOB.scp" \
82
+ --dumpdir "${dumpdir}/${name}/raw/dump.JOB" \
83
+ --verbose "${verbose}"
84
+ echo "Successfully finished feature extraction of ${name} set."
85
+ ) &
86
+ pids+=($!)
87
+ done
88
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
89
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
90
+ echo "Successfully finished feature extraction."
91
+
92
+ # calculate statistics for normalization
93
+ if [ -z "${pretrain}" ]; then
94
+ # calculate statistics for normalization
95
+ echo "Statistics computation start. See the progress via ${dumpdir}/${train_set}/compute_statistics.log."
96
+ ${train_cmd} "${dumpdir}/${train_set}/compute_statistics.log" \
97
+ parallel-wavegan-compute-statistics \
98
+ --config "${conf}" \
99
+ --rootdir "${dumpdir}/${train_set}/raw" \
100
+ --dumpdir "${dumpdir}/${train_set}" \
101
+ --verbose "${verbose}"
102
+ echo "Successfully finished calculation of statistics."
103
+ else
104
+ echo "Use statistics of pretrained model. Skip statistics computation."
105
+ cp "$(dirname "${pretrain}")/stats.${stats_ext}" "${dumpdir}/${train_set}"
106
+ fi
107
+
108
+ # normalize and dump them
109
+ pids=()
110
+ for name in "${train_set}" "${dev_set}" "${eval_set}"; do
111
+ (
112
+ [ ! -e "${dumpdir}/${name}/norm" ] && mkdir -p "${dumpdir}/${name}/norm"
113
+ echo "Nomalization start. See the progress via ${dumpdir}/${name}/norm/normalize.*.log."
114
+ ${train_cmd} JOB=1:${n_jobs} "${dumpdir}/${name}/norm/normalize.JOB.log" \
115
+ parallel-wavegan-normalize \
116
+ --config "${conf}" \
117
+ --stats "${dumpdir}/${train_set}/stats.${stats_ext}" \
118
+ --rootdir "${dumpdir}/${name}/raw/dump.JOB" \
119
+ --dumpdir "${dumpdir}/${name}/norm/dump.JOB" \
120
+ --verbose "${verbose}"
121
+ echo "Successfully finished normalization of ${name} set."
122
+ ) &
123
+ pids+=($!)
124
+ done
125
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
126
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
127
+ echo "Successfully finished normalization."
128
+ fi
129
+
130
+ if [ -z "${tag}" ]; then
131
+ expdir="exp/${train_set}_jsss_$(basename "${conf}" .yaml)"
132
+ if [ -n "${pretrain}" ]; then
133
+ pretrain_tag=$(basename "$(dirname "${pretrain}")")
134
+ expdir+="_${pretrain_tag}"
135
+ fi
136
+ else
137
+ expdir="exp/${train_set}_jsss_${tag}"
138
+ fi
139
+ if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
140
+ echo "Stage 2: Network training"
141
+ [ ! -e "${expdir}" ] && mkdir -p "${expdir}"
142
+ cp "${dumpdir}/${train_set}/stats.${stats_ext}" "${expdir}"
143
+ if [ "${n_gpus}" -gt 1 ]; then
144
+ train="python -m parallel_wavegan.distributed.launch --nproc_per_node ${n_gpus} -c parallel-wavegan-train"
145
+ else
146
+ train="parallel-wavegan-train"
147
+ fi
148
+ echo "Training start. See the progress via ${expdir}/train.log."
149
+ ${cuda_cmd} --gpu "${n_gpus}" "${expdir}/train.log" \
150
+ ${train} \
151
+ --config "${conf}" \
152
+ --train-dumpdir "${dumpdir}/${train_set}/norm" \
153
+ --dev-dumpdir "${dumpdir}/${dev_set}/norm" \
154
+ --outdir "${expdir}" \
155
+ --resume "${resume}" \
156
+ --pretrain "${pretrain}" \
157
+ --verbose "${verbose}"
158
+ echo "Successfully finished training."
159
+ fi
160
+
161
+ if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
162
+ echo "Stage 3: Network decoding"
163
+ # shellcheck disable=SC2012
164
+ [ -z "${checkpoint}" ] && checkpoint="$(ls -dt "${expdir}"/*.pkl | head -1 || true)"
165
+ outdir="${expdir}/wav/$(basename "${checkpoint}" .pkl)"
166
+ pids=()
167
+ for name in "${dev_set}" "${eval_set}"; do
168
+ (
169
+ [ ! -e "${outdir}/${name}" ] && mkdir -p "${outdir}/${name}"
170
+ [ "${n_gpus}" -gt 1 ] && n_gpus=1
171
+ echo "Decoding start. See the progress via ${outdir}/${name}/decode.log."
172
+ ${cuda_cmd} --gpu "${n_gpus}" "${outdir}/${name}/decode.log" \
173
+ parallel-wavegan-decode \
174
+ --dumpdir "${dumpdir}/${name}/norm" \
175
+ --checkpoint "${checkpoint}" \
176
+ --outdir "${outdir}/${name}" \
177
+ --verbose "${verbose}"
178
+ echo "Successfully finished decoding of ${name} set."
179
+ ) &
180
+ pids+=($!)
181
+ done
182
+ i=0; for pid in "${pids[@]}"; do wait "${pid}" || ((++i)); done
183
+ [ "${i}" -gt 0 ] && echo "$0: ${i} background jobs are failed." && exit 1;
184
+ echo "Successfully finished decoding."
185
+ fi
186
+ echo "Finished."
ParallelWaveGAN/egs/jsss/voc1/utils ADDED
@@ -0,0 +1 @@
 
 
1
+ ../../../utils
ParallelWaveGAN/egs/jsut/voc1/cmd.sh ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
2
+ # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
3
+ # e.g.
4
+ # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
5
+ #
6
+ # Options:
7
+ # --time <time>: Limit the maximum time to execute.
8
+ # --mem <mem>: Limit the maximum memory usage.
9
+ # --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
10
+ # --num-threads <nthread>: Specify the number of CPU cores.
11
+ # --gpu <ngpu>: Specify the number of GPU devices.
12
+ # --config: Change the configuration file from default.
13
+ #
14
+ # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15
+ # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16
+ # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17
+ # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18
+ #
19
+ # run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
20
+ # These options are mapping to specific options for each backend and
21
+ # it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22
+ # If jobs failed, your configuration might be wrong for your environment.
23
+ #
24
+ #
25
+ # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
26
+ # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27
+ # =========================================================
28
+
29
+
30
+ # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31
+ cmd_backend="local"
32
+
33
+ # Local machine, without any Job scheduling system
34
+ if [ "${cmd_backend}" = local ]; then
35
+
36
+ # The other usage
37
+ export train_cmd="utils/run.pl"
38
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39
+ export cuda_cmd="utils/run.pl"
40
+ # Used for "*_recog.py"
41
+ export decode_cmd="utils/run.pl"
42
+
43
+ # Local machine, without any Job scheduling system
44
+ elif [ "${cmd_backend}" = stdout ]; then
45
+
46
+ # The other usage
47
+ export train_cmd="utils/stdout.pl"
48
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49
+ export cuda_cmd="utils/stdout.pl"
50
+ # Used for "*_recog.py"
51
+ export decode_cmd="utils/stdout.pl"
52
+
53
+ # "qsub" (SGE, Torque, PBS, etc.)
54
+ elif [ "${cmd_backend}" = sge ]; then
55
+ # The default setting is written in conf/queue.conf.
56
+ # You must change "-q g.q" for the "queue" for your environment.
57
+ # To know the "queue" names, type "qhost -q"
58
+ # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59
+
60
+ export train_cmd="utils/queue.pl"
61
+ export cuda_cmd="utils/queue.pl"
62
+ export decode_cmd="utils/queue.pl"
63
+
64
+ # "sbatch" (Slurm)
65
+ elif [ "${cmd_backend}" = slurm ]; then
66
+ # The default setting is written in conf/slurm.conf.
67
+ # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
68
+ # To know the "partion" names, type "sinfo".
69
+ # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
70
+ # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71
+
72
+ export train_cmd="utils/slurm.pl"
73
+ export cuda_cmd="utils/slurm.pl"
74
+ export decode_cmd="utils/slurm.pl"
75
+
76
+ elif [ "${cmd_backend}" = ssh ]; then
77
+ # You have to create ".queue/machines" to specify the host to execute jobs.
78
+ # e.g. .queue/machines
79
+ # host1
80
+ # host2
81
+ # host3
82
+ # Assuming you can log in to them without a password, i.e., you have to set up ssh keys.
83
+
84
+ export train_cmd="utils/ssh.pl"
85
+ export cuda_cmd="utils/ssh.pl"
86
+ export decode_cmd="utils/ssh.pl"
87
+
88
+ else
89
+ echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90
+ return 1
91
+ fi
ParallelWaveGAN/egs/jsut/voc1/conf/hifigan.v1.yaml ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is the configuration file for the JSUT dataset.
2
+ # This configuration is based on HiFiGAN V1, which is
3
+ # an official configuration. But I found that the optimizer
4
+ # setting does not work well with my implementation.
5
+ # So I changed optimizer settings as follows:
6
+ # - AdamW -> Adam
7
+ # - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
8
+ # - Scheduler: ExponentialLR -> MultiStepLR
9
+ # To match the shift size difference, the upsample scales
10
+ # are also modified from the original 256 shift setting.
11
+
12
+ ###########################################################
13
+ # FEATURE EXTRACTION SETTING #
14
+ ###########################################################
15
+ sampling_rate: 24000 # Sampling rate.
16
+ fft_size: 2048 # FFT size.
17
+ hop_size: 300 # Hop size.
18
+ win_length: 1200 # Window length.
19
+ # If set to null, it will be the same as fft_size.
20
+ window: "hann" # Window function.
21
+ num_mels: 80 # Number of mel basis.
22
+ fmin: 80 # Minimum freq in mel basis calculation.
23
+ fmax: 7600 # Maximum frequency in mel basis calculation.
24
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
25
+ trim_silence: false # Whether to trim the start and end of silence.
26
+ trim_threshold_in_db: 20 # Need to tune carefully if the recording is not good.
27
+ trim_frame_size: 1024 # Frame size in trimming.
28
+ trim_hop_size: 256 # Hop size in trimming.
29
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
30
+
31
+ ###########################################################
32
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
33
+ ###########################################################
34
+ generator_type: HiFiGANGenerator
35
+ generator_params:
36
+ in_channels: 80 # Number of input channels.
37
+ out_channels: 1 # Number of output channels.
38
+ channels: 512 # Number of initial channels.
39
+ kernel_size: 7 # Kernel size of initial and final conv layers.
40
+ upsample_scales: [5, 5, 4, 3] # Upsampling scales.
41
+ upsample_kernel_sizes: [10, 10, 8, 6] # Kernel size for upsampling layers.
42
+ resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
43
+ resblock_dilations: # Dilations for residual blocks.
44
+ - [1, 3, 5]
45
+ - [1, 3, 5]
46
+ - [1, 3, 5]
47
+ use_additional_convs: true # Whether to use additional conv layer in residual blocks.
48
+ bias: true # Whether to use bias parameter in conv.
49
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation type.
50
+ nonlinear_activation_params: # Nonlinear activation parameters.
51
+ negative_slope: 0.1
52
+ use_weight_norm: true # Whether to apply weight normalization.
53
+
54
+ ###########################################################
55
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
56
+ ###########################################################
57
+ discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
58
+ discriminator_params:
59
+ scales: 3 # Number of multi-scale discriminator.
60
+ scale_downsample_pooling: "AvgPool1d" # Pooling operation for scale discriminator.
61
+ scale_downsample_pooling_params:
62
+ kernel_size: 4 # Pooling kernel size.
63
+ stride: 2 # Pooling stride.
64
+ padding: 2 # Padding size.
65
+ scale_discriminator_params:
66
+ in_channels: 1 # Number of input channels.
67
+ out_channels: 1 # Number of output channels.
68
+ kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
69
+ channels: 128 # Initial number of channels.
70
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
71
+ max_groups: 16 # Maximum number of groups in downsampling conv layers.
72
+ bias: true
73
+ downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
74
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation.
75
+ nonlinear_activation_params:
76
+ negative_slope: 0.1
77
+ follow_official_norm: true # Whether to follow the official norm setting.
78
+ periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
79
+ period_discriminator_params:
80
+ in_channels: 1 # Number of input channels.
81
+ out_channels: 1 # Number of output channels.
82
+ kernel_sizes: [5, 3] # List of kernel sizes.
83
+ channels: 32 # Initial number of channels.
84
+ downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
85
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
86
+ bias: true # Whether to use bias parameter in conv layer.
87
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation.
88
+ nonlinear_activation_params: # Nonlinear activation parameters.
89
+ negative_slope: 0.1
90
+ use_weight_norm: true # Whether to apply weight normalization.
91
+ use_spectral_norm: false # Whether to apply spectral normalization.
92
+
93
+ ###########################################################
94
+ # STFT LOSS SETTING #
95
+ ###########################################################
96
+ use_stft_loss: false # Whether to use multi-resolution STFT loss.
97
+ use_mel_loss: true # Whether to use Mel-spectrogram loss.
98
+ mel_loss_params:
99
+ fs: 24000
100
+ fft_size: 2048
101
+ hop_size: 300
102
+ win_length: 1200
103
+ window: "hann"
104
+ num_mels: 80
105
+ fmin: 0
106
+ fmax: 12000
107
+ log_base: null
108
+ generator_adv_loss_params:
109
+ average_by_discriminators: false # Whether to average loss by #discriminators.
110
+ discriminator_adv_loss_params:
111
+ average_by_discriminators: false # Whether to average loss by #discriminators.
112
+ use_feat_match_loss: true
113
+ feat_match_loss_params:
114
+ average_by_discriminators: false # Whether to average loss by #discriminators.
115
+ average_by_layers: false # Whether to average loss by #layers in each discriminator.
116
+ include_final_outputs: false # Whether to include final outputs in feat match loss calculation.
117
+
118
+ ###########################################################
119
+ # ADVERSARIAL LOSS SETTING #
120
+ ###########################################################
121
+ lambda_aux: 45.0 # Loss balancing coefficient for STFT loss.
122
+ lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss.
123
+ lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.
124
+
125
+ ###########################################################
126
+ # DATA LOADER SETTING #
127
+ ###########################################################
128
+ batch_size: 16 # Batch size.
129
+ batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size.
130
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
131
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
132
+ remove_short_samples: false # Whether to remove samples the length of which are less than batch_max_steps.
133
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
134
+
135
+ ###########################################################
136
+ # OPTIMIZER & SCHEDULER SETTING #
137
+ ###########################################################
138
+ generator_optimizer_type: Adam
139
+ generator_optimizer_params:
140
+ lr: 2.0e-4
141
+ betas: [0.5, 0.9]
142
+ weight_decay: 0.0
143
+ generator_scheduler_type: MultiStepLR
144
+ generator_scheduler_params:
145
+ gamma: 0.5
146
+ milestones:
147
+ - 200000
148
+ - 400000
149
+ - 600000
150
+ - 800000
151
+ generator_grad_norm: -1
152
+ discriminator_optimizer_type: Adam
153
+ discriminator_optimizer_params:
154
+ lr: 2.0e-4
155
+ betas: [0.5, 0.9]
156
+ weight_decay: 0.0
157
+ discriminator_scheduler_type: MultiStepLR
158
+ discriminator_scheduler_params:
159
+ gamma: 0.5
160
+ milestones:
161
+ - 200000
162
+ - 400000
163
+ - 600000
164
+ - 800000
165
+ discriminator_grad_norm: -1
166
+
167
+ ###########################################################
168
+ # INTERVAL SETTING #
169
+ ###########################################################
170
+ generator_train_start_steps: 1 # Number of steps to start to train generator.
171
+ discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
172
+ train_max_steps: 2500000 # Number of training steps.
173
+ save_interval_steps: 10000 # Interval steps to save checkpoint.
174
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
175
+ log_interval_steps: 100 # Interval steps to record the training log.
176
+
177
+ ###########################################################
178
+ # OTHER SETTING #
179
+ ###########################################################
180
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
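Any configuration under conf/ can be selected at run time; assuming the JSUT run.sh exposes the same --conf and --tag options as the other recipes in this commit, a typical invocation would look like:

    # train HiFi-GAN instead of the default Parallel WaveGAN (illustrative)
    ./run.sh --conf conf/hifigan.v1.yaml --tag hifigan.v1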
ParallelWaveGAN/egs/jsut/voc1/conf/multi_band_melgan.v2.yaml ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is the hyperparameter configuration file for MelGAN.
2
+ # Please make sure this is adjusted for the JSUT dataset. If you want to
3
+ # apply it to another dataset, you might need to carefully change some parameters.
4
+ # This configuration requires ~ 8GB memory and will finish within 4 days on Titan V.
5
+
6
+ # This configuration is based on full-band MelGAN, but the hop size and sampling
7
+ # rate are different from the paper (16kHz vs 24kHz). The number of iterations
8
+ # is not shown in the paper, so we currently train for 1M iterations (not sure whether
9
+ # that is enough to converge). The optimizer setting is based on @dathudeptrai's advice.
10
+ # https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906
11
+
12
+ ###########################################################
13
+ # FEATURE EXTRACTION SETTING #
14
+ ###########################################################
15
+ sampling_rate: 24000 # Sampling rate.
16
+ fft_size: 2048 # FFT size.
17
+ hop_size: 300 # Hop size.
18
+ win_length: 1200 # Window length.
19
+ # If set to null, it will be the same as fft_size.
20
+ window: "hann" # Window function.
21
+ num_mels: 80 # Number of mel basis.
22
+ fmin: 80 # Minimum freq in mel basis calculation.
23
+ fmax: 7600 # Maximum frequency in mel basis calculation.
24
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
25
+ trim_silence: false # Whether to trim the start and end of silence.
26
+ trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
27
+ trim_frame_size: 2048 # Frame size in trimming.
28
+ trim_hop_size: 512 # Hop size in trimming.
29
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
30
+
31
+ ###########################################################
32
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
33
+ ###########################################################
34
+ generator_type: "MelGANGenerator" # Generator type.
35
+ generator_params:
36
+ in_channels: 80 # Number of input channels.
37
+ out_channels: 4 # Number of output channels.
38
+ kernel_size: 7 # Kernel size of initial and final conv layers.
39
+ channels: 384 # Initial number of channels for conv layers.
40
+ upsample_scales: [5, 5, 3] # List of Upsampling scales.
41
+ stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack.
42
+ stacks: 4 # Number of stacks in a single residual stack module.
43
+ use_weight_norm: True # Whether to use weight normalization.
44
+ use_causal_conv: False # Whether to use causal convolution.
45
+
46
+ ###########################################################
47
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
48
+ ###########################################################
49
+ discriminator_type: "MelGANMultiScaleDiscriminator" # Discriminator type.
50
+ discriminator_params:
51
+ in_channels: 1 # Number of input channels.
52
+ out_channels: 1 # Number of output channels.
53
+ scales: 3 # Number of multi-scales.
54
+ downsample_pooling: "AvgPool1d" # Pooling type for the input downsampling.
55
+ downsample_pooling_params: # Parameters of the above pooling function.
56
+ kernel_size: 4
57
+ stride: 2
58
+ padding: 1
59
+ count_include_pad: False
60
+ kernel_sizes: [5, 3] # List of kernel size.
61
+ channels: 16 # Number of channels of the initial conv layer.
62
+ max_downsample_channels: 512 # Maximum number of channels of downsampling layers.
63
+ downsample_scales: [4, 4, 4] # List of downsampling scales.
64
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
65
+ nonlinear_activation_params: # Parameters of nonlinear activation function.
66
+ negative_slope: 0.2
67
+ use_weight_norm: True # Whether to use weight norm.
68
+
69
+ ###########################################################
70
+ # STFT LOSS SETTING #
71
+ ###########################################################
72
+ stft_loss_params:
73
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
74
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
75
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
76
+ window: "hann_window" # Window function for STFT-based loss
77
+ use_subband_stft_loss: true
78
+ subband_stft_loss_params:
79
+ fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
80
+ hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss
81
+ win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
82
+ window: "hann_window" # Window function for STFT-based loss
83
+
84
+ ###########################################################
85
+ # ADVERSARIAL LOSS SETTING #
86
+ ###########################################################
87
+ use_feat_match_loss: false # Whether to use feature matching loss.
88
+ lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
89
+
90
+ ###########################################################
91
+ # DATA LOADER SETTING #
92
+ ###########################################################
93
+ batch_size: 64 # Batch size.
94
+ batch_max_steps: 16200 # Length of each audio in batch. Make sure dividable by hop_size.
95
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
96
+ num_workers: 4 # Number of workers in Pytorch DataLoader.
97
+ remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
98
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
99
+
100
+ ###########################################################
101
+ # OPTIMIZER & SCHEDULER SETTING #
102
+ ###########################################################
103
+ generator_optimizer_type: "Adam" # Generator's optimizer type.
104
+ generator_optimizer_params:
105
+ lr: 1.0e-3 # Generator's learning rate.
106
+ eps: 1.0e-7 # Generator's epsilon.
107
+ weight_decay: 0.0 # Generator's weight decay coefficient.
108
+ amsgrad: true
109
+ generator_grad_norm: -1 # Generator's gradient norm.
110
+ generator_scheduler_type: "MultiStepLR" # Generator's scheduler type.
111
+ generator_scheduler_params:
112
+ gamma: 0.5 # Generator's scheduler gamma.
113
+ milestones: # At each milestone, lr will be multiplied by gamma.
114
+ - 100000
115
+ - 200000
116
+ - 300000
117
+ - 400000
118
+ - 500000
119
+ - 600000
120
+ discriminator_optimizer_type: "Adam" # Discriminator's optimizer type.
121
+ discriminator_optimizer_params:
122
+ lr: 1.0e-3 # Discriminator's learning rate.
123
+ eps: 1.0e-7 # Discriminator's epsilon.
124
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
125
+ amsgrad: true
126
+ discriminator_grad_norm: -1 # Discriminator's gradient norm.
127
+ discriminator_scheduler_type: "MultiStepLR" # Discriminator's scheduler type.
128
+ discriminator_scheduler_params:
129
+ gamma: 0.5 # Discriminator's scheduler gamma.
130
+ milestones: # At each milestone, lr will be multiplied by gamma.
131
+ - 100000
132
+ - 200000
133
+ - 300000
134
+ - 400000
135
+ - 500000
136
+ - 600000
137
+
138
+ ###########################################################
139
+ # INTERVAL SETTING #
140
+ ###########################################################
141
+ discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator.
142
+ train_max_steps: 1000000 # Number of training steps.
143
+ save_interval_steps: 50000 # Interval steps to save checkpoint.
144
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
145
+ log_interval_steps: 1000 # Interval steps to record the training log.
146
+
147
+ ###########################################################
148
+ # OTHER SETTING #
149
+ ###########################################################
150
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
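The stft_loss_params block above configures a multi-resolution STFT auxiliary loss, and use_subband_stft_loss applies the same kind of loss to the PQMF sub-band signals using the smaller FFT/hop/window sizes under subband_stft_loss_params. As a rough sketch of what such a loss computes (illustrative only; the function and helper names below are not the package's API), assuming PyTorch:

import torch
import torch.nn.functional as F

def stft_magnitude(x, fft_size, hop_size, win_length):
    """Return magnitude spectrogram of shape (batch, frames, fft_size // 2 + 1)."""
    window = torch.hann_window(win_length, device=x.device)
    spec = torch.stft(x, fft_size, hop_size, win_length, window, return_complex=True)
    return torch.clamp(spec.abs(), min=1e-7).transpose(1, 2)

def multi_resolution_stft_loss(y_hat, y,
                               fft_sizes=(1024, 2048, 512),
                               hop_sizes=(120, 240, 50),
                               win_lengths=(600, 1200, 240)):
    """Spectral convergence + log STFT magnitude loss, averaged over resolutions."""
    sc_loss, mag_loss = 0.0, 0.0
    for fft_size, hop_size, win_length in zip(fft_sizes, hop_sizes, win_lengths):
        mag_hat = stft_magnitude(y_hat, fft_size, hop_size, win_length)
        mag = stft_magnitude(y, fft_size, hop_size, win_length)
        sc_loss += torch.norm(mag - mag_hat, p="fro") / torch.norm(mag, p="fro")
        mag_loss += F.l1_loss(torch.log(mag_hat), torch.log(mag))
    return sc_loss / len(fft_sizes), mag_loss / len(fft_sizes)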
ParallelWaveGAN/egs/jsut/voc1/conf/parallel_wavegan.v1.yaml ADDED
@@ -0,0 +1,122 @@
1
+ # This is the hyperparameter configuration file for Parallel WaveGAN.
2
+ # Please make sure this is adjusted for the JSUT dataset. If you want to
3
+ # apply it to other datasets, you might need to carefully change some parameters.
4
+ # This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.
5
+
6
+ ###########################################################
7
+ # FEATURE EXTRACTION SETTING #
8
+ ###########################################################
9
+ sampling_rate: 24000 # Sampling rate.
10
+ fft_size: 2048 # FFT size.
11
+ hop_size: 300 # Hop size.
12
+ win_length: 1200 # Window length.
13
+ # If set to null, it will be the same as fft_size.
14
+ window: "hann" # Window function.
15
+ num_mels: 80 # Number of mel basis.
16
+ fmin: 80 # Minimum freq in mel basis calculation.
17
+ fmax: 7600 # Maximum frequency in mel basis calculation.
18
+ global_gain_scale: 1.0 # Will be multiplied with the whole waveform.
19
+ trim_silence: false # Whether to trim the start and end of silence.
20
+ trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
21
+ trim_frame_size: 2048 # Frame size in trimming.
22
+ trim_hop_size: 512 # Hop size in trimming.
23
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
24
+
25
+ ###########################################################
26
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
27
+ ###########################################################
28
+ generator_params:
29
+ in_channels: 1 # Number of input channels.
30
+ out_channels: 1 # Number of output channels.
31
+ kernel_size: 3 # Kernel size of dilated convolution.
32
+ layers: 30 # Number of residual block layers.
33
+ stacks: 3 # Number of stacks i.e., dilation cycles.
34
+ residual_channels: 64 # Number of channels in residual conv.
35
+ gate_channels: 128 # Number of channels in gated conv.
36
+ skip_channels: 64 # Number of channels in skip conv.
37
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
38
+ # Must be the same as num_mels.
39
+ aux_context_window: 2 # Context window size for auxiliary feature.
40
+ # If set to 2, previous 2 and future 2 frames will be considered.
41
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
42
+ use_weight_norm: true # Whether to use weight norm.
43
+ # If set to true, it will be applied to all of the conv layers.
44
+ upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
45
+ upsample_params: # Upsampling network parameters.
46
+ upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must be the same as hop size.
47
+
48
+ ###########################################################
49
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
50
+ ###########################################################
51
+ discriminator_params:
52
+ in_channels: 1 # Number of input channels.
53
+ out_channels: 1 # Number of output channels.
54
+ kernel_size: 3 # Kernel size of conv layers.
55
+ layers: 10 # Number of conv layers.
56
+ conv_channels: 64 # Number of channels in conv layers.
57
+ bias: true # Whether to use bias parameter in conv.
58
+ use_weight_norm: true # Whether to use weight norm.
59
+ # If set to true, it will be applied to all of the conv layers.
60
+ nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
61
+ nonlinear_activation_params: # Nonlinear function parameters
62
+ negative_slope: 0.2 # Alpha in LeakyReLU.
63
+
64
+ ###########################################################
65
+ # STFT LOSS SETTING #
66
+ ###########################################################
67
+ stft_loss_params:
68
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
69
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
70
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
71
+ window: "hann_window" # Window function for STFT-based loss
72
+
73
+ ###########################################################
74
+ # ADVERSARIAL LOSS SETTING #
75
+ ###########################################################
76
+ lambda_adv: 4.0 # Loss balancing coefficient.
77
+
78
+ ###########################################################
79
+ # DATA LOADER SETTING #
80
+ ###########################################################
81
+ batch_size: 6 # Batch size.
82
+ batch_max_steps: 25500 # Length of each audio in batch. Make sure it is divisible by hop_size.
83
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
84
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
85
+ remove_short_samples: true # Whether to remove samples whose length is less than batch_max_steps.
86
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
87
+
88
+ ###########################################################
89
+ # OPTIMIZER & SCHEDULER SETTING #
90
+ ###########################################################
91
+ generator_optimizer_params:
92
+ lr: 0.0001 # Generator's learning rate.
93
+ eps: 1.0e-6 # Generator's epsilon.
94
+ weight_decay: 0.0 # Generator's weight decay coefficient.
95
+ generator_scheduler_params:
96
+ step_size: 200000 # Generator's scheduler step size.
97
+ gamma: 0.5 # Generator's scheduler gamma.
98
+ # At each step size, lr will be multiplied by this parameter.
99
+ generator_grad_norm: 10 # Generator's gradient norm.
100
+ discriminator_optimizer_params:
101
+ lr: 0.00005 # Discriminator's learning rate.
102
+ eps: 1.0e-6 # Discriminator's epsilon.
103
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
104
+ discriminator_scheduler_params:
105
+ step_size: 200000 # Discriminator's scheduler step size.
106
+ gamma: 0.5 # Discriminator's scheduler gamma.
107
+ # At each step size, lr will be multiplied by this parameter.
108
+ discriminator_grad_norm: 1 # Discriminator's gradient norm.
109
+
110
+ ###########################################################
111
+ # INTERVAL SETTING #
112
+ ###########################################################
113
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
114
+ train_max_steps: 400000 # Number of training steps.
115
+ save_interval_steps: 5000 # Interval steps to save checkpoint.
116
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
117
+ log_interval_steps: 100 # Interval steps to record the training log.
118
+
119
+ ###########################################################
120
+ # OTHER SETTING #
121
+ ###########################################################
122
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
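Several comments in this configuration encode consistency constraints: the product of upsample_scales must equal hop_size (4 * 5 * 3 * 5 = 300), batch_max_steps must be divisible by hop_size (25500 / 300 = 85 frames per segment), and aux_channels must equal num_mels. A minimal sanity-check sketch, assuming the config has been saved as a plain YAML file at an illustrative path (not part of the recipe):

import math
import yaml  # provided by PyYAML

with open("conf/parallel_wavegan.v1.yaml") as f:
    cfg = yaml.safe_load(f)

hop_size = cfg["hop_size"]
scales = cfg["generator_params"]["upsample_params"]["upsample_scales"]

# Product of the upsampling scales must reproduce the hop size (4 * 5 * 3 * 5 = 300).
assert math.prod(scales) == hop_size

# Each training segment must contain a whole number of frames (25500 / 300 = 85).
assert cfg["batch_max_steps"] % hop_size == 0

# The auxiliary generator input is the mel spectrogram, so channel counts must match.
assert cfg["generator_params"]["aux_channels"] == cfg["num_mels"]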
ParallelWaveGAN/egs/jsut/voc1/conf/slurm.conf ADDED
@@ -0,0 +1,12 @@
1
+ # Default configuration
2
+ command sbatch --export=PATH --ntasks-per-node=1
3
+ option time=* --time $0
4
+ option mem=* --mem-per-cpu $0
5
+ option mem=0 # Do not add anything to qsub_opts
6
+ option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
7
+ option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts
8
+ default gpu=0
9
+ option gpu=0 -p cpu
10
+ option gpu=* -p gpu --gres=gpu:$0
11
+ # note: the --max-jobs-run option is supported as a special case
12
+ # by slurm.pl and you don't have to handle it in the config file.
ParallelWaveGAN/egs/jsut/voc1/conf/style_melgan.v1.yaml ADDED
@@ -0,0 +1,147 @@
1
+ # This is the configuration file for JSUT dataset.
2
+ # This configuration is based on StyleMelGAN paper but
3
+ # uses MSE loss instead of hinge loss. I also found that
4
+ # batch_size = 8 works well, so if you want to accelerate
5
+ # training, you can reduce the batch size (e.g., 8 or 16).
6
+ # The upsampling scales are modified to fit the shift size
7
+ # of 300 points.
8
+
9
+ ###########################################################
10
+ # FEATURE EXTRACTION SETTING #
11
+ ###########################################################
12
+ sampling_rate: 24000 # Sampling rate.
13
+ fft_size: 2048 # FFT size.
14
+ hop_size: 300 # Hop size.
15
+ win_length: 1200 # Window length.
16
+ # If set to null, it will be the same as fft_size.
17
+ window: "hann" # Window function.
18
+ num_mels: 80 # Number of mel basis.
19
+ fmin: 80 # Minimum freq in mel basis calculation.
20
+ fmax: 7600 # Maximum frequency in mel basis calculation.
21
+ global_gain_scale: 1.0 # Will be multiplied with the whole waveform.
22
+ trim_silence: false # Whether to trim the start and end of silence.
23
+ trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
24
+ trim_frame_size: 1024 # Frame size in trimming.
25
+ trim_hop_size: 256 # Hop size in trimming.
26
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
27
+
28
+ ###########################################################
29
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
30
+ ###########################################################
31
+ generator_type: "StyleMelGANGenerator" # Generator type.
32
+ generator_params:
33
+ in_channels: 128
34
+ aux_channels: 80
35
+ channels: 64
36
+ out_channels: 1
37
+ kernel_size: 9
38
+ dilation: 2
39
+ bias: True
40
+ noise_upsample_scales: [10, 2, 2, 2]
41
+ noise_upsample_activation: "LeakyReLU"
42
+ noise_upsample_activation_params:
43
+ negative_slope: 0.2
44
+ upsample_scales: [5, 1, 5, 1, 3, 1, 2, 2, 1]
45
+ upsample_mode: "nearest"
46
+ gated_function: "softmax"
47
+ use_weight_norm: True
48
+
49
+ ###########################################################
50
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
51
+ ###########################################################
52
+ discriminator_type: "StyleMelGANDiscriminator" # Discriminator type.
53
+ discriminator_params:
54
+ repeats: 4
55
+ window_sizes: [512, 1024, 2048, 4096]
56
+ pqmf_params:
57
+ - [1, None, None, None]
58
+ - [2, 62, 0.26700, 9.0]
59
+ - [4, 62, 0.14200, 9.0]
60
+ - [8, 62, 0.07949, 9.0]
61
+ discriminator_params:
62
+ out_channels: 1
63
+ kernel_sizes: [5, 3]
64
+ channels: 16
65
+ max_downsample_channels: 512
66
+ bias: True
67
+ downsample_scales: [4, 4, 4, 1]
68
+ nonlinear_activation: "LeakyReLU"
69
+ nonlinear_activation_params:
70
+ negative_slope: 0.2
71
+ use_weight_norm: True
72
+
73
+ ###########################################################
74
+ # STFT LOSS SETTING #
75
+ ###########################################################
76
+ stft_loss_params:
77
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
78
+ hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
79
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
80
+ window: "hann_window" # Window function for STFT-based loss
81
+ lambda_aux: 1.0 # Loss balancing coefficient for aux loss.
82
+
83
+ ###########################################################
84
+ # ADVERSARIAL LOSS SETTING #
85
+ ###########################################################
86
+ lambda_adv: 1.0 # Loss balancing coefficient for adv loss.
87
+ generator_adv_loss_params:
88
+ average_by_discriminators: false # Whether to average loss by #discriminators.
89
+ discriminator_adv_loss_params:
90
+ average_by_discriminators: false # Whether to average loss by #discriminators.
91
+
92
+ ###########################################################
93
+ # DATA LOADER SETTING #
94
+ ###########################################################
95
+ batch_size: 32 # Batch size.
96
+ batch_max_steps: 24000 # Length of each audio in batch. Make sure it is divisible by hop_size.
97
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
98
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
99
+ remove_short_samples: false # Whether to remove samples whose length is less than batch_max_steps.
100
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
101
+
102
+ ###########################################################
103
+ # OPTIMIZER & SCHEDULER SETTING #
104
+ ###########################################################
105
+ generator_optimizer_type: Adam
106
+ generator_optimizer_params:
107
+ lr: 1.0e-4
108
+ betas: [0.5, 0.9]
109
+ weight_decay: 0.0
110
+ generator_scheduler_type: MultiStepLR
111
+ generator_scheduler_params:
112
+ gamma: 0.5
113
+ milestones:
114
+ - 100000
115
+ - 300000
116
+ - 500000
117
+ - 700000
118
+ - 900000
119
+ generator_grad_norm: -1
120
+ discriminator_optimizer_type: Adam
121
+ discriminator_optimizer_params:
122
+ lr: 2.0e-4
123
+ betas: [0.5, 0.9]
124
+ weight_decay: 0.0
125
+ discriminator_scheduler_type: MultiStepLR
126
+ discriminator_scheduler_params:
127
+ gamma: 0.5
128
+ milestones:
129
+ - 200000
130
+ - 400000
131
+ - 600000
132
+ - 800000
133
+ discriminator_grad_norm: -1
134
+
135
+ ###########################################################
136
+ # INTERVAL SETTING #
137
+ ###########################################################
138
+ discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
139
+ train_max_steps: 1500000 # Number of training steps.
140
+ save_interval_steps: 50000 # Interval steps to save checkpoint.
141
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
142
+ log_interval_steps: 100 # Interval steps to record the training log.
143
+
144
+ ###########################################################
145
+ # OTHER SETTING #
146
+ ###########################################################
147
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
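The header comment above notes that the upsampling scales were adjusted to fit the 300-point shift. As a hedged arithmetic check (illustrative only, not part of the recipe): the product of upsample_scales reproduces hop_size, and the product of all upsampling factors (80 * 300 = 24000) appears to be why batch_max_steps is set to 24000.

import math

hop_size = 300
upsample_scales = [5, 1, 5, 1, 3, 1, 2, 2, 1]   # mel-to-waveform upsampling
noise_upsample_scales = [10, 2, 2, 2]           # initial noise upsampling

assert math.prod(upsample_scales) == hop_size                   # 5*5*3*2*2 = 300
total_upsampling = math.prod(noise_upsample_scales) * hop_size  # 80 * 300 = 24000
assert total_upsampling == 24000                                # matches batch_max_steps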
ParallelWaveGAN/egs/jsut/voc1/local/data_download.sh ADDED
@@ -0,0 +1,39 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ download_dir=$1
7
+
8
+ # check arguments
9
+ if [ $# != 1 ]; then
10
+ echo "Usage: $0 <download_dir>"
11
+ exit 1
12
+ fi
13
+
14
+ set -euo pipefail
15
+
16
+ cwd=$(pwd)
17
+ if [ ! -e "${download_dir}/jsut_ver1.1" ]; then
18
+ mkdir -p "${download_dir}"
19
+ cd "${download_dir}" || exit 1;
20
+ wget http://ss-takashi.sakura.ne.jp/corpus/jsut_ver1.1.zip
21
+ unzip -o ./*.zip
22
+ rm ./*.zip
23
+ cd "${cwd}" || exit 1;
24
+ echo "Successfully downloaded data."
25
+ else
26
+ echo "Already exists. Skipped."
27
+ fi
28
+
29
+ if [ ! -e "${download_dir}/jsut_lab" ]; then
30
+ cd "${download_dir}" || exit 1;
31
+ git clone https://github.com/r9y9/jsut-lab
32
+ for name in loanword128 repeat500 voiceactress100 basic5000 onomatopee300 travel1000 countersuffix26 precedent130 utparaphrase512; do
33
+ cp -vr "jsut-lab/${name}" jsut_ver1.1/
34
+ done
35
+ cd - || exit 1;
36
+ echo "Successfully downloaded context label."
37
+ else
38
+ echo "Already exists. Skipped."
39
+ fi
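As a hedged usage example (the "downloads" directory name is only the convention used elsewhere in these recipes, not required): running "local/data_download.sh downloads" fetches and unpacks jsut_ver1.1.zip into downloads/ and then clones the jsut-lab repository to copy the context-label directories next to the audio; both steps are skipped if the targets already exist.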
ParallelWaveGAN/egs/jsut/voc1/local/data_prep.sh ADDED
@@ -0,0 +1,93 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ # shellcheck disable=SC1091
7
+ . ./path.sh || exit 1;
8
+
9
+ fs=24000
10
+ num_dev=250
11
+ num_eval=250
12
+ train_set="train_nodev"
13
+ dev_set="dev"
14
+ eval_set="eval"
15
+ shuffle=false
16
+
17
+ # shellcheck disable=SC1091
18
+ . utils/parse_options.sh || exit 1;
19
+
20
+ db_root=$1
21
+ data_dir=$2
22
+
23
+ # check arguments
24
+ if [ $# != 2 ]; then
25
+ echo "Usage: $0 [Options] <db_root> <data_dir>"
26
+ echo "e.g.: $0 downloads/jsut_ver1.1 data"
27
+ echo ""
28
+ echo "Options:"
29
+ echo " --fs: target sampling rate (default=24000)."
30
+ echo " --num_dev: number of development utterances (default=250)."
31
+ echo " --num_eval: number of evaluation utterances (default=250)."
32
+ echo " --train_set: name of train set (default=train_nodev)."
33
+ echo " --dev_set: name of dev set (default=dev)."
34
+ echo " --eval_set: name of eval set (default=eval)."
35
+ echo " --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
36
+ exit 1
37
+ fi
38
+
39
+ set -euo pipefail
40
+
41
+ [ ! -e "${data_dir}/all" ] && mkdir -p "${data_dir}/all"
42
+
43
+ # set filenames
44
+ scp="${data_dir}/all/wav.scp"
45
+ segments="${data_dir}/all/segments"
46
+
47
+ # check file existence
48
+ [ -e "${scp}" ] && rm "${scp}"
49
+ [ -e "${segments}" ] && rm "${segments}"
50
+
51
+ # make scp
52
+ find "${db_root}" -follow -name "*.wav" | sort | while read -r filename; do
53
+ id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
54
+ echo "${id} cat ${filename} | sox -t wav - -c 1 -b 16 -t wav - rate ${fs} |" >> "${scp}"
55
+ done
56
+
57
+ # make segments
58
+ find "${db_root}" -name "*.lab" -follow | sort | while read -r filename; do
59
+ id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
60
+ start_nsec=$(head -n 1 "${filename}" | cut -d " " -f 2)
61
+ end_nsec=$(tail -n 1 "${filename}" | cut -d " " -f 1)
62
+ start_sec=$(echo "${start_nsec}*0.0000001" | bc | sed "s/^\./0./")
63
+ end_sec=$(echo "${end_nsec}*0.0000001" | bc | sed "s/^\./0./")
64
+ echo "${id} ${id} ${start_sec} ${end_sec}" >> "${segments}"
65
+ done
66
+
67
+ # check
68
+ diff -q <(awk '{print $1}' "${scp}") <(awk '{print $1}' "${segments}") > /dev/null
69
+
70
+ # split
71
+ num_all=$(wc -l < "${scp}")
72
+ num_deveval=$((num_dev + num_eval))
73
+ num_train=$((num_all - num_deveval))
74
+ utils/split_data.sh \
75
+ --num_first "${num_train}" \
76
+ --num_second "${num_deveval}" \
77
+ --shuffle "${shuffle}" \
78
+ "${data_dir}/all" \
79
+ "${data_dir}/${train_set}" \
80
+ "${data_dir}/deveval"
81
+ utils/split_data.sh \
82
+ --num_first "${num_dev}" \
83
+ --num_second "${num_eval}" \
84
+ --shuffle "${shuffle}" \
85
+ "${data_dir}/deveval" \
86
+ "${data_dir}/${dev_set}" \
87
+ "${data_dir}/${eval_set}"
88
+
89
+ # remove tmp directories
90
+ rm -rf "${data_dir}/all"
91
+ rm -rf "${data_dir}/deveval"
92
+
93
+ echo "Successfully prepared data."
ParallelWaveGAN/egs/jsut/voc1/path.sh ADDED
@@ -0,0 +1,33 @@
 
1
+ # cuda related
2
+ export CUDA_HOME=/usr/local/cuda-10.0
3
+ export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
4
+
5
+ # path related
6
+ export PRJ_ROOT="${PWD}/../../.."
7
+ if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
8
+ # shellcheck disable=SC1090
9
+ . "${PRJ_ROOT}/tools/venv/bin/activate"
10
+ fi
11
+
12
+ # python related
13
+ export OMP_NUM_THREADS=1
14
+ export PYTHONIOENCODING=UTF-8
15
+ export MPL_BACKEND=Agg
16
+
17
+ # check installation
18
+ if ! command -v parallel-wavegan-train > /dev/null; then
19
+ echo "Error: It seems setup is not finished." >&2
20
+ echo "Error: Please setup your environment by following README.md" >&2
21
+ return 1
22
+ fi
23
+ if ! command -v jq > /dev/null; then
24
+ echo "Error: It seems jq is not installed." >&2
25
+ echo "Error: Please install via \`sudo apt-get install jq\`." >&2
26
+ echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
27
+ return 1
28
+ fi
29
+ if ! command -v yq > /dev/null; then
30
+ echo "Error: It seems yq is not installed." >&2
31
+ echo "Error: Please install via \`pip install yq\`." >&2
32
+ return 1
33
+ fi