diff --git a/ParallelWaveGAN/.github/FUNDING.yml b/ParallelWaveGAN/.github/FUNDING.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d780f47ca5115bd8d2ce1dff4657f6453fe82b12
--- /dev/null
+++ b/ParallelWaveGAN/.github/FUNDING.yml
@@ -0,0 +1 @@
+github: kan-bayashi
diff --git a/ParallelWaveGAN/.github/workflows/ci.yaml b/ParallelWaveGAN/.github/workflows/ci.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1a333bdc461bf698e32cad7d1a1cc43b9943c90f
--- /dev/null
+++ b/ParallelWaveGAN/.github/workflows/ci.yaml
@@ -0,0 +1,97 @@
+name: CI
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+  schedule:
+    - cron: 0 0 * * 1
+
+jobs:
+  linter_and_test:
+    runs-on: ubuntu-20.04
+    strategy:
+      max-parallel: 5
+      matrix:
+        python-version: [3.6]
+        # 1.6 fails on cpu: https://github.com/kan-bayashi/ParallelWaveGAN/issues/198
+        pytorch-version: [1.4, 1.5.1, 1.7.1, 1.8.1, 1.9]
+    steps:
+      - uses: actions/checkout@master
+      - uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: 'x64'
+      - uses: actions/cache@v2
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-${{ hashFiles('**/setup.py') }}
+          restore-keys: |
+            ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-
+      - name: Install dependencies
+        run: |
+          sudo apt-get install libsndfile-dev
+          # make python env
+          cd tools; make CUDA_VERSION="" PYTHON=python${{ matrix.python-version }} PYTORCH_VERSION=${{ matrix.pytorch-version }}
+          # install shell check
+          wget https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz
+          tar -xvf shellcheck-stable.linux.x86_64.tar.xz
+      - name: ShellCheck
+        run: |
+          export PATH=shellcheck-stable:$PATH
+          find egs -name "*.sh" | grep -v path.sh | while read line; do shellcheck -x --shell=bash -P $(dirname $line) ${line}; done
+      - name: Black & Flake8
+        run: |
+          source tools/venv/bin/activate
+          black --diff parallel_wavegan
+          flake8 parallel_wavegan
+          flake8 --extend-ignore=D test
+      - name: Pytest
+        run: |
+          source tools/venv/bin/activate
+          pytest test
+
+  integration:
+    runs-on: ubuntu-20.04
+    strategy:
+      max-parallel: 10
+      matrix:
+        python-version: [3.7]
+        pytorch-version: [1.9]
+        config:
+          - "parallel_wavegan.v1.debug.yaml"
+          - "melgan.v1.debug.yaml"
+          - "melgan.v3.debug.yaml"
+          - "multi_band_melgan.v1.debug.yaml"
+          - "parallel_wavegan.v1.debug.npy.yaml"
+          - "parallel_wavegan.v1.debug.diff_fs.yaml"
+          - "hifigan.v1.debug.yaml"
+          - "style_melgan.v1.debug.yaml"
+    steps:
+      - uses: actions/checkout@master
+      - uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: 'x64'
+      - uses: actions/cache@v2
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-${{ hashFiles('**/setup.py') }}
+          restore-keys: |
+            ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-
+      - name: Install dependencies
+        run: |
+          sudo apt-get install libsndfile-dev jq
+          # make python env
+          cd tools; make CUDA_VERSION="" PYTHON=python${{ matrix.python-version }} PYTORCH_VERSION=${{ matrix.pytorch-version }}
+      - name: Integration
+        run: |
+          cd egs/yesno/voc1 && ./run.sh --conf conf/${{ matrix.config }}
+      - uses: actions/upload-artifact@v1
+        if: failure()
+        with:
+          name: artifacts-${{ matrix.config }}
+          path: egs/yesno/voc1
diff --git a/ParallelWaveGAN/.gitignore b/ParallelWaveGAN/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..80f35e4c3ec6653bc3cde6b0c89b9c215730c844
--- /dev/null
+++ b/ParallelWaveGAN/.gitignore
@@ -0,0 +1,36 @@
+# general
+*~
+*.pyc
+\#*\#
+.\#*
+*DS_Store
+out.txt
+parallel_wavegan.egg-info/
+doc/_build
+slurm-*.out
+tmp*
+.eggs/
+.hypothesis/
+.idea
+.backup/
+.pytest_cache/
+__pycache__/
+.coverage*
+coverage.xml*
+.vscode*
+.nfs*
+.ipynb_checkpoints
+.d000*
+*.out
+*.err
+
+# recipe related
+egs/*/*/data
+egs/*/*/downloads
+egs/*/*/dump
+egs/*/*/exp
+egs/*/*/conf/tuning
+
+# tools related
+tools/venv/
+tools/apex/
diff --git a/ParallelWaveGAN/LICENSE b/ParallelWaveGAN/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..1ac590bf4864dbf3bf32f59709dc8ea87e8cfb02
--- /dev/null
+++ b/ParallelWaveGAN/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2020 Tomoki Hayashi
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/ParallelWaveGAN/egs/README.md b/ParallelWaveGAN/egs/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..197444c11e73febf0da40a5db49f9908dfe598f3
--- /dev/null
+++ b/ParallelWaveGAN/egs/README.md
@@ -0,0 +1,165 @@
+# Kaldi-style all-in-one recipes
+
+This repository provides [Kaldi](https://github.com/kaldi-asr/kaldi)-style recipes, in the same manner as [ESPnet](https://github.com/espnet/espnet).
+Currently, the following recipes are supported.
+
+- [LJSpeech](https://keithito.com/LJ-Speech-Dataset/): English female speaker
+- [JSUT](https://sites.google.com/site/shinnosuketakamichi/publication/jsut): Japanese female speaker
+- [JSSS](https://sites.google.com/site/shinnosuketakamichi/research-topics/jsss_corpus): Japanese female speaker
+- [CSMSC](https://www.data-baker.com/open_source.html): Mandarin female speaker
+- [CMU Arctic](http://www.festvox.org/cmu_arctic/): English speakers
+- [JNAS](http://research.nii.ac.jp/src/en/JNAS.html): Japanese multi-speaker
+- [VCTK](https://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html): English multi-speaker
+- [LibriTTS](https://arxiv.org/abs/1904.02882): English multi-speaker
+- [YesNo](https://arxiv.org/abs/1904.02882): English speaker (For debugging)
+
+
+## How to run the recipe
+
+```bash
+# Move to the recipe directory
+$ cd egs/ljspeech/voc1
+
+# Run the recipe from scratch
+$ ./run.sh
+
+# You can change the config via the command line
+$ ./run.sh --conf <your_customized_config>.yaml
+
+# You can select the stage to start and stop
+$ ./run.sh --stage 2 --stop_stage 2
+
+# If you want to specify the GPU
+$ CUDA_VISIBLE_DEVICES=1 ./run.sh --stage 2
+
+# If you want to resume training from a 10000-step checkpoint
+$ ./run.sh --stage 2 --resume <path>/<to>/checkpoint-10000steps.pkl
+```
+
+You can check the command line options in `run.sh`.
+
+The integration with job schedulers such as [slurm](https://slurm.schedmd.com/documentation.html) can be done via `cmd.sh` and `conf/slurm.conf`.
+If you want to use them, please check [this page](https://kaldi-asr.org/doc/queue.html).
+
+All of the hyperparameters are written in a single yaml-format configuration file.
+Please check [this example](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/ljspeech/voc1/conf/parallel_wavegan.v1.yaml) in the ljspeech recipe.
+
+You can monitor the training progress via tensorboard.
+
+```bash
+$ tensorboard --logdir exp
+```
+
+![](https://user-images.githubusercontent.com/22779813/68100080-58bbc500-ff09-11e9-9945-c835186fd7c2.png)
+
+If you want to accelerate the training, you can try distributed multi-gpu training based on apex.
+You need apex for distributed training, so please make sure it is already installed.
+Then you can run distributed multi-gpu training via the following command:
+
+```bash
+# in the case of the number of gpus = 8
+$ CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" ./run.sh --stage 2 --n_gpus 8
+```
+
+In the case of distributed training, the batch size will be automatically multiplied by the number of gpus.
+Please be careful.
+
+## How to make the recipe for your own dataset
+
+Here, I will show how to make the recipe for your own dataset.
+
+1. Set up your dataset to have the following structure.
+
+    ```bash
+    # For single-speaker case
+    $ tree /path/to/database
+    /path/to/database
+    ├── utt_1.wav
+    ├── utt_2.wav
+    │   ...
+    └── utt_N.wav
+    # The directory can be nested, but each filename must be unique
+
+    # For multi-speaker case
+    $ tree /path/to/database
+    /path/to/database
+    ├── spk_1
+    │   ├── utt1.wav
+    ├── spk_2
+    │   ├── utt1.wav
+    │   ...
+    └── spk_N
+        ├── utt1.wav
+        ...
+    # The directory under each speaker can be nested, but each filename in each speaker directory must be unique
+    ```
+
+2. Copy the template directory.
+
+    ```bash
+    cd egs
+
+    # For single-speaker case
+    cp -r template_single_spk <new_recipe_name>
+
+    # For multi-speaker case
+    cp -r template_multi_spk <new_recipe_name>
+
+    # Move to your recipe directory
+    cd egs/<new_recipe_name>/voc1
+    ```
+
+3. Modify the options in `run.sh`.
+    What you need to change at least in `run.sh` is as follows:
+    - `db_root`: Root path of the database.
+    - `num_dev`: The number of utterances for the development set.
+    - `num_eval`: The number of utterances for the evaluation set.
+
+4. Modify the hyperparameters in `conf/parallel_wavegan.v1.yaml`.
+    What you need to change at least in the config is as follows:
+    - `sampling_rate`: If you specify a sampling rate lower than that of the original audio, the audio will be downsampled by sox.
+
+5. (Optional) Change the command backend in `cmd.sh`.
+    If you are not familiar with Kaldi and run in your local environment, you do not need to change it.
+    See https://kaldi-asr.org/doc/queue.html for more information.
+
+6. Run your recipe.
+
+    ```bash
+    # Run all stages from the first stage
+    ./run.sh
+
+    # If you want to specify the CUDA device
+    CUDA_VISIBLE_DEVICES=0 ./run.sh
+    ```
+
+If you want to try the other advanced models, please check the config files in `egs/ljspeech/voc1/conf`.
+
+## Run training using an ESPnet2-TTS recipe within 5 minutes
+
+Make sure you have already finished the ESPnet2-TTS recipe experiments (at least started the training).
+
+```bash
+cd egs
+
+# Please use the single-speaker template for both single- and multi-speaker cases
+cp -r template_single_spk <new_recipe_name>
+
+# Move to your recipe directory
+cd egs/<new_recipe_name>/voc1
+
+# Make symlinks to the data directories (better to use absolute paths)
+mkdir dump data
+ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw dump/
+ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw/tr_no_dev data/train_nodev
+ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw/dev data/dev
+ln -s /path/to/espnet/egs2/<recipe_name>/tts1/dump/raw/eval1 data/eval
+
+# Edit the config to match the TTS model settings
+vim conf/parallel_wavegan.v1.yaml
+
+# Run from stage 1
+./run.sh --stage 1 --conf conf/parallel_wavegan.v1.yaml
+```
+
+That's it!
diff --git a/ParallelWaveGAN/egs/arctic/voc1/cmd.sh b/ParallelWaveGAN/egs/arctic/voc1/cmd.sh
new file mode 100644
index 0000000000000000000000000000000000000000..19f342102fc4f3389157c48f1196b16b68eb1cf1
--- /dev/null
+++ b/ParallelWaveGAN/egs/arctic/voc1/cmd.sh
@@ -0,0 +1,91 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#   --time