Initial commit
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitignore +160 -0
- app.py +180 -0
- config.yaml +266 -0
- espnet/__init__.py +8 -0
- espnet/asr/__init__.py +1 -0
- espnet/asr/asr_mix_utils.py +187 -0
- espnet/asr/asr_utils.py +1024 -0
- espnet/asr/chainer_backend/__init__.py +1 -0
- espnet/asr/chainer_backend/asr.py +575 -0
- espnet/asr/pytorch_backend/__init__.py +1 -0
- espnet/asr/pytorch_backend/asr.py +1500 -0
- espnet/asr/pytorch_backend/asr_init.py +282 -0
- espnet/asr/pytorch_backend/asr_mix.py +654 -0
- espnet/asr/pytorch_backend/recog.py +152 -0
- espnet/bin/__init__.py +1 -0
- espnet/bin/asr_align.py +348 -0
- espnet/bin/asr_enhance.py +191 -0
- espnet/bin/asr_recog.py +363 -0
- espnet/bin/asr_train.py +644 -0
- espnet/bin/lm_train.py +288 -0
- espnet/bin/mt_train.py +480 -0
- espnet/bin/mt_trans.py +186 -0
- espnet/bin/st_train.py +550 -0
- espnet/bin/st_trans.py +183 -0
- espnet/bin/tts_decode.py +180 -0
- espnet/bin/tts_train.py +359 -0
- espnet/bin/vc_decode.py +174 -0
- espnet/bin/vc_train.py +368 -0
- espnet/lm/__init__.py +1 -0
- espnet/lm/chainer_backend/__init__.py +1 -0
- espnet/lm/chainer_backend/extlm.py +199 -0
- espnet/lm/chainer_backend/lm.py +484 -0
- espnet/lm/lm_utils.py +293 -0
- espnet/lm/pytorch_backend/__init__.py +1 -0
- espnet/lm/pytorch_backend/extlm.py +218 -0
- espnet/lm/pytorch_backend/lm.py +410 -0
- espnet/mt/__init__.py +1 -0
- espnet/mt/mt_utils.py +83 -0
- espnet/mt/pytorch_backend/__init__.py +1 -0
- espnet/mt/pytorch_backend/mt.py +600 -0
- espnet/nets/__init__.py +1 -0
- espnet/nets/asr_interface.py +172 -0
- espnet/nets/batch_beam_search.py +348 -0
- espnet/nets/batch_beam_search_online_sim.py +270 -0
- espnet/nets/beam_search.py +512 -0
- espnet/nets/beam_search_transducer.py +629 -0
- espnet/nets/chainer_backend/__init__.py +1 -0
- espnet/nets/chainer_backend/asr_interface.py +29 -0
- espnet/nets/chainer_backend/ctc.py +184 -0
- espnet/nets/chainer_backend/deterministic_embed_id.py +253 -0
.gitignore
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# poetry
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
102 |
+
#poetry.lock
|
103 |
+
|
104 |
+
# pdm
|
105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
106 |
+
#pdm.lock
|
107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
108 |
+
# in version control.
|
109 |
+
# https://pdm.fming.dev/#use-with-ide
|
110 |
+
.pdm.toml
|
111 |
+
|
112 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
113 |
+
__pypackages__/
|
114 |
+
|
115 |
+
# Celery stuff
|
116 |
+
celerybeat-schedule
|
117 |
+
celerybeat.pid
|
118 |
+
|
119 |
+
# SageMath parsed files
|
120 |
+
*.sage.py
|
121 |
+
|
122 |
+
# Environments
|
123 |
+
.env
|
124 |
+
.venv
|
125 |
+
env/
|
126 |
+
venv/
|
127 |
+
ENV/
|
128 |
+
env.bak/
|
129 |
+
venv.bak/
|
130 |
+
|
131 |
+
# Spyder project settings
|
132 |
+
.spyderproject
|
133 |
+
.spyproject
|
134 |
+
|
135 |
+
# Rope project settings
|
136 |
+
.ropeproject
|
137 |
+
|
138 |
+
# mkdocs documentation
|
139 |
+
/site
|
140 |
+
|
141 |
+
# mypy
|
142 |
+
.mypy_cache/
|
143 |
+
.dmypy.json
|
144 |
+
dmypy.json
|
145 |
+
|
146 |
+
# Pyre type checker
|
147 |
+
.pyre/
|
148 |
+
|
149 |
+
# pytype static type analyzer
|
150 |
+
.pytype/
|
151 |
+
|
152 |
+
# Cython debug symbols
|
153 |
+
cython_debug/
|
154 |
+
|
155 |
+
# PyCharm
|
156 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
157 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
158 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
159 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
160 |
+
#.idea/
|
app.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from espnet2.bin.tts_inference import Text2Speech
|
2 |
+
import torch
|
3 |
+
from parallel_wavegan.utils import download_pretrained_model, load_model
|
4 |
+
from phonemizer import phonemize
|
5 |
+
from phonemizer.separator import Separator
|
6 |
+
import gradio as gr
|
7 |
+
|
8 |
+
s = Separator(word=None, phone=" ")
|
9 |
+
config_path = "config.yaml"
|
10 |
+
model_path = "model.pth"
|
11 |
+
|
12 |
+
vocoder_tag = "ljspeech_parallel_wavegan.v3"
|
13 |
+
|
14 |
+
vocoder = load_model(download_pretrained_model(vocoder_tag)).to("cpu").eval()
|
15 |
+
vocoder.remove_weight_norm()
|
16 |
+
|
17 |
+
global_styles = {
|
18 |
+
"Style 1": torch.load("style1.pt"),
|
19 |
+
"Style 2": torch.load("style2.pt"),
|
20 |
+
"Style 3": torch.load("style3.pt"),
|
21 |
+
"Style 4": torch.load("style4.pt"),
|
22 |
+
"Style 5": torch.load("style5.pt"),
|
23 |
+
"Style 6": torch.load("style6.pt"),
|
24 |
+
}
|
25 |
+
|
26 |
+
|
27 |
+
def inference(text, global_style, alpha, prev_fg_inds, input_fg_inds):
|
28 |
+
with torch.no_grad():
|
29 |
+
text2speech = Text2Speech(
|
30 |
+
config_path,
|
31 |
+
model_path,
|
32 |
+
device="cpu",
|
33 |
+
# Only for Tacotron 2
|
34 |
+
threshold=0.5,
|
35 |
+
minlenratio=0.0,
|
36 |
+
maxlenratio=10.0,
|
37 |
+
use_att_constraint=False,
|
38 |
+
backward_window=1,
|
39 |
+
forward_window=3,
|
40 |
+
# Only for FastSpeech & FastSpeech2
|
41 |
+
speed_control_alpha=alpha,
|
42 |
+
)
|
43 |
+
text2speech.spc2wav = None # Disable griffin-lim
|
44 |
+
|
45 |
+
style_emb = torch.flatten(global_styles[global_style])
|
46 |
+
|
47 |
+
phoneme_string = phonemize(
|
48 |
+
text, language="mb-us1", backend="espeak-mbrola", separator=s
|
49 |
+
)
|
50 |
+
phonemes = phoneme_string.split(" ")
|
51 |
+
|
52 |
+
max_edit_index = -1
|
53 |
+
for i in range(len(input_fg_inds) - 1, -1, -1):
|
54 |
+
if input_fg_inds[i] != "":
|
55 |
+
max_edit_index = i
|
56 |
+
break
|
57 |
+
|
58 |
+
if max_edit_index == -1:
|
59 |
+
_, c, _, _, _, _, _, output_fg_inds = text2speech(
|
60 |
+
phoneme_string, ref_embs=style_emb
|
61 |
+
)
|
62 |
+
|
63 |
+
else:
|
64 |
+
input_fg_inds_int_list = []
|
65 |
+
for i in range(max_edit_index + 1):
|
66 |
+
if input_fg_inds[i] != "":
|
67 |
+
input_fg_inds_int_list.append(int(input_fg_inds[i]))
|
68 |
+
else:
|
69 |
+
input_fg_inds_int_list.append(prev_fg_inds[i][1])
|
70 |
+
input_fg_inds = input_fg_inds_int_list
|
71 |
+
|
72 |
+
prev_fg_inds_list = [[[row[1], row[2], row[3]] for row in prev_fg_inds]]
|
73 |
+
prev_fg_inds = torch.tensor(prev_fg_inds_list, dtype=torch.int64)
|
74 |
+
|
75 |
+
fg_inds = torch.tensor(input_fg_inds_int_list).unsqueeze(0)
|
76 |
+
_, c, _, _, _, _, _, part_output_fg_inds = text2speech(
|
77 |
+
phoneme_string, ref_embs=style_emb, fg_inds=fg_inds
|
78 |
+
)
|
79 |
+
|
80 |
+
prev_fg_inds[0, max_edit_index + 1 :, :] = part_output_fg_inds[0]
|
81 |
+
output_fg_inds = prev_fg_inds
|
82 |
+
|
83 |
+
output_fg_inds_list = output_fg_inds.tolist()[0]
|
84 |
+
padded_phonemes = ["", *phonemes]
|
85 |
+
dataframe_values = [
|
86 |
+
[phoneme, *fgs]
|
87 |
+
for phoneme, fgs in zip(padded_phonemes, output_fg_inds_list)
|
88 |
+
]
|
89 |
+
selected_inds = [
|
90 |
+
[input_fg_inds[i]] if i < len(input_fg_inds) else [""]
|
91 |
+
for i in range(len(padded_phonemes))
|
92 |
+
]
|
93 |
+
wav = vocoder.inference(c)
|
94 |
+
|
95 |
+
return [
|
96 |
+
(22050, wav.view(-1).cpu().numpy()),
|
97 |
+
dataframe_values,
|
98 |
+
selected_inds,
|
99 |
+
]
|
100 |
+
|
101 |
+
|
102 |
+
demo = gr.Blocks()
|
103 |
+
|
104 |
+
with demo:
|
105 |
+
gr.Markdown(
|
106 |
+
"""
|
107 |
+
|
108 |
+
# ConEx Demo
|
109 |
+
|
110 |
+
This demo shows the capabilities of ConEx, a model for **Con**trollable **Ex**pressive speech synthesis.
|
111 |
+
ConEx allows you to generate speech in a certain speaking style, and gives you the ability to edit the prosody* of the generated speech at a fine level.
|
112 |
+
We proposed ConEx in our paper titled ["Interactive Multi-Level Prosody Control for Expressive Speech Synthesis"](https://jessa.github.io/assets/pdf/cornille2022icassp.pdf), published in proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP) 2022.
|
113 |
+
|
114 |
+
To convert text to speech: input some text, choose the desired speaking style, set the duration factor (higher = slower speech), and press "Generate speech".
|
115 |
+
|
116 |
+
**prosody refers to speech characteristics such as intonation, stress, rhythm*
|
117 |
+
"""
|
118 |
+
)
|
119 |
+
|
120 |
+
with gr.Row():
|
121 |
+
text_input = gr.Textbox(
|
122 |
+
label="Input text",
|
123 |
+
lines=4,
|
124 |
+
placeholder="E.g. I didn't say he stole the money",
|
125 |
+
)
|
126 |
+
|
127 |
+
with gr.Column():
|
128 |
+
global_style_dropdown = gr.Dropdown(
|
129 |
+
["Style 1", "Style 2", "Style 3", "Style 4", "Style 5", "Style 6"],
|
130 |
+
value="Style 1",
|
131 |
+
label="Global speaking style",
|
132 |
+
)
|
133 |
+
alpha_slider = gr.Slider(
|
134 |
+
0.1, 2, value=1, step=0.1, label="Alpha (duration factor)"
|
135 |
+
)
|
136 |
+
|
137 |
+
audio = gr.Audio()
|
138 |
+
with gr.Row():
|
139 |
+
button = gr.Button("Generate Speech")
|
140 |
+
|
141 |
+
gr.Markdown(
|
142 |
+
"""
|
143 |
+
|
144 |
+
### Fine-grained prosody editor
|
145 |
+
Once you've generated some speech, the following table will show the id of the prosody embedding used for each phoneme.
|
146 |
+
A prosody embedding determines the prosody of the phoneme.
|
147 |
+
The table not only shows the prosody embeddings that are used by default (the top predictions), but also two more likely prosody embeddings.
|
148 |
+
|
149 |
+
In order to change the prosody of a phoneme, write a new prosody embedding id in the "Chosen prosody embeddings" column and press "Generate speech" again.
|
150 |
+
You can use any number from 0-31, but the 2nd and 3rd predictions are more likely to give a fitting prosody.
|
151 |
+
Based on your edit, new prosody embeddings will be generated for the phonemes after the edit.
|
152 |
+
Thus, you can iteratively change the prosody by starting from the beginning of the utterance and working your through the utterance, making edits as you see fit.
|
153 |
+
The prosody embeddings before your edit will remain the same as before, and will be copied to the "Chosen prosody embeddings" column.
|
154 |
+
"""
|
155 |
+
)
|
156 |
+
|
157 |
+
with gr.Row():
|
158 |
+
phoneme_preds_df = gr.Dataframe(
|
159 |
+
headers=["Phoneme", "🥇 Top pred", "🥈 2nd pred", "🥉 3rd pred"],
|
160 |
+
type="array",
|
161 |
+
col_count=(4, "static"),
|
162 |
+
)
|
163 |
+
phoneme_edits_df = gr.Dataframe(
|
164 |
+
headers=["Chosen prosody embeddings"], type="array", col_count=(1, "static")
|
165 |
+
)
|
166 |
+
|
167 |
+
button.click(
|
168 |
+
inference,
|
169 |
+
inputs=[
|
170 |
+
text_input,
|
171 |
+
global_style_dropdown,
|
172 |
+
alpha_slider,
|
173 |
+
phoneme_preds_df,
|
174 |
+
phoneme_edits_df,
|
175 |
+
],
|
176 |
+
outputs=[audio, phoneme_preds_df, phoneme_edits_df],
|
177 |
+
)
|
178 |
+
|
179 |
+
|
180 |
+
demo.launch()
|
config.yaml
ADDED
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
config: conf/ar_prior_train.yaml
|
2 |
+
print_config: false
|
3 |
+
log_level: INFO
|
4 |
+
dry_run: false
|
5 |
+
iterator_type: sequence
|
6 |
+
output_dir: exp/tts_finetune_ar_prior
|
7 |
+
ngpu: 1
|
8 |
+
seed: 0
|
9 |
+
num_workers: 1
|
10 |
+
num_att_plot: 3
|
11 |
+
dist_backend: nccl
|
12 |
+
dist_init_method: env://
|
13 |
+
dist_world_size: null
|
14 |
+
dist_rank: null
|
15 |
+
local_rank: 0
|
16 |
+
dist_master_addr: null
|
17 |
+
dist_master_port: null
|
18 |
+
dist_launcher: null
|
19 |
+
multiprocessing_distributed: false
|
20 |
+
unused_parameters: false
|
21 |
+
sharded_ddp: false
|
22 |
+
cudnn_enabled: true
|
23 |
+
cudnn_benchmark: false
|
24 |
+
cudnn_deterministic: true
|
25 |
+
collect_stats: false
|
26 |
+
write_collected_feats: false
|
27 |
+
max_epoch: 500
|
28 |
+
patience: null
|
29 |
+
val_scheduler_criterion:
|
30 |
+
- valid
|
31 |
+
- loss
|
32 |
+
early_stopping_criterion:
|
33 |
+
- valid
|
34 |
+
- loss
|
35 |
+
- min
|
36 |
+
best_model_criterion:
|
37 |
+
- - valid
|
38 |
+
- loss
|
39 |
+
- min
|
40 |
+
- - train
|
41 |
+
- loss
|
42 |
+
- min
|
43 |
+
keep_nbest_models: 5
|
44 |
+
grad_clip: 1.0
|
45 |
+
grad_clip_type: 2.0
|
46 |
+
grad_noise: false
|
47 |
+
accum_grad: 8
|
48 |
+
no_forward_run: false
|
49 |
+
resume: true
|
50 |
+
train_dtype: float32
|
51 |
+
use_amp: false
|
52 |
+
log_interval: null
|
53 |
+
use_tensorboard: true
|
54 |
+
use_wandb: false
|
55 |
+
wandb_project: null
|
56 |
+
wandb_id: null
|
57 |
+
detect_anomaly: false
|
58 |
+
pretrain_path: null
|
59 |
+
init_param:
|
60 |
+
- /data/leuven/339/vsc33942/espnet-mirror/egs2/acapela_blizzard/tts1/exp/tts_train_raw_phn_none/valid.loss.best.pth:::tts.prosody_encoder.ar_prior
|
61 |
+
freeze_param:
|
62 |
+
- encoder.,prosody_encoder.ref_encoder.,prosody_encoder.fg_encoder.,prosody_encoder.global_encoder.,prosody_encoder.global_projection.,prosody_encoder.vq_layer.,prosody_encoder.qfg_projection,duration_predictor.,length_regulator,decoder.,feat_out,postnet
|
63 |
+
num_iters_per_epoch: 50
|
64 |
+
batch_size: 20
|
65 |
+
valid_batch_size: null
|
66 |
+
batch_bins: 3000000
|
67 |
+
valid_batch_bins: null
|
68 |
+
train_shape_file:
|
69 |
+
- exp/tts_stats_raw_phn_none/train/text_shape.phn
|
70 |
+
- exp/tts_stats_raw_phn_none/train/speech_shape
|
71 |
+
valid_shape_file:
|
72 |
+
- exp/tts_stats_raw_phn_none/valid/text_shape.phn
|
73 |
+
- exp/tts_stats_raw_phn_none/valid/speech_shape
|
74 |
+
batch_type: numel
|
75 |
+
valid_batch_type: null
|
76 |
+
fold_length:
|
77 |
+
- 150
|
78 |
+
- 204800
|
79 |
+
sort_in_batch: descending
|
80 |
+
sort_batch: descending
|
81 |
+
multiple_iterator: false
|
82 |
+
chunk_length: 500
|
83 |
+
chunk_shift_ratio: 0.5
|
84 |
+
num_cache_chunks: 1024
|
85 |
+
train_data_path_and_name_and_type:
|
86 |
+
- - dump/raw/tr_no_dev/text
|
87 |
+
- text
|
88 |
+
- text
|
89 |
+
- - data/durations/tr_no_dev/durations
|
90 |
+
- durations
|
91 |
+
- text_int
|
92 |
+
- - dump/raw/tr_no_dev/wav.scp
|
93 |
+
- speech
|
94 |
+
- sound
|
95 |
+
valid_data_path_and_name_and_type:
|
96 |
+
- - dump/raw/dev/text
|
97 |
+
- text
|
98 |
+
- text
|
99 |
+
- - data/durations/dev/durations
|
100 |
+
- durations
|
101 |
+
- text_int
|
102 |
+
- - dump/raw/dev/wav.scp
|
103 |
+
- speech
|
104 |
+
- sound
|
105 |
+
allow_variable_data_keys: false
|
106 |
+
max_cache_size: 0.0
|
107 |
+
max_cache_fd: 32
|
108 |
+
valid_max_cache_size: null
|
109 |
+
optim: adam
|
110 |
+
optim_conf:
|
111 |
+
lr: 1.0
|
112 |
+
scheduler: noamlr
|
113 |
+
scheduler_conf:
|
114 |
+
model_size: 384
|
115 |
+
warmup_steps: 4000
|
116 |
+
token_list:
|
117 |
+
- <blank>
|
118 |
+
- <unk>
|
119 |
+
- n
|
120 |
+
- '@'
|
121 |
+
- t
|
122 |
+
- _
|
123 |
+
- s
|
124 |
+
- I
|
125 |
+
- r
|
126 |
+
- d
|
127 |
+
- l
|
128 |
+
- m
|
129 |
+
- i
|
130 |
+
- '{'
|
131 |
+
- z
|
132 |
+
- D
|
133 |
+
- w
|
134 |
+
- r=
|
135 |
+
- f
|
136 |
+
- v
|
137 |
+
- E1
|
138 |
+
- b
|
139 |
+
- t_h
|
140 |
+
- h
|
141 |
+
- V
|
142 |
+
- u
|
143 |
+
- k
|
144 |
+
- I1
|
145 |
+
- '{1'
|
146 |
+
- k_h
|
147 |
+
- N
|
148 |
+
- EI1
|
149 |
+
- V1
|
150 |
+
- O1
|
151 |
+
- AI
|
152 |
+
- H
|
153 |
+
- S
|
154 |
+
- p_h
|
155 |
+
- '@U1'
|
156 |
+
- i1
|
157 |
+
- g
|
158 |
+
- AI1
|
159 |
+
- j
|
160 |
+
- O
|
161 |
+
- p
|
162 |
+
- u1
|
163 |
+
- r=1
|
164 |
+
- tS
|
165 |
+
- Or
|
166 |
+
- '4'
|
167 |
+
- A
|
168 |
+
- Or1
|
169 |
+
- E
|
170 |
+
- dZ
|
171 |
+
- T
|
172 |
+
- aU1
|
173 |
+
- U
|
174 |
+
- Er1
|
175 |
+
- '@U'
|
176 |
+
- U1
|
177 |
+
- Ar1
|
178 |
+
- Er
|
179 |
+
- aU
|
180 |
+
- EI
|
181 |
+
- ir1
|
182 |
+
- l=
|
183 |
+
- OI1
|
184 |
+
- Ar
|
185 |
+
- Ur1
|
186 |
+
- n=
|
187 |
+
- A1
|
188 |
+
- Z
|
189 |
+
- '?'
|
190 |
+
- ir
|
191 |
+
- Ur
|
192 |
+
- OI
|
193 |
+
- <sos/eos>
|
194 |
+
odim: null
|
195 |
+
model_conf: {}
|
196 |
+
use_preprocessor: true
|
197 |
+
token_type: phn
|
198 |
+
bpemodel: null
|
199 |
+
non_linguistic_symbols: null
|
200 |
+
cleaner: null
|
201 |
+
g2p: null
|
202 |
+
feats_extract: fbank
|
203 |
+
feats_extract_conf:
|
204 |
+
fs: 22050
|
205 |
+
fmin: 80
|
206 |
+
fmax: 7600
|
207 |
+
n_mels: 80
|
208 |
+
hop_length: 256
|
209 |
+
n_fft: 1024
|
210 |
+
win_length: null
|
211 |
+
normalize: global_mvn
|
212 |
+
normalize_conf:
|
213 |
+
stats_file: feats_stats.npz
|
214 |
+
tts: fastespeech
|
215 |
+
tts_conf:
|
216 |
+
adim: 128
|
217 |
+
aheads: 2
|
218 |
+
elayers: 4
|
219 |
+
eunits: 1536
|
220 |
+
dlayers: 4
|
221 |
+
dunits: 1536
|
222 |
+
positionwise_layer_type: conv1d
|
223 |
+
positionwise_conv_kernel_size: 3
|
224 |
+
duration_predictor_layers: 2
|
225 |
+
duration_predictor_chans: 128
|
226 |
+
duration_predictor_kernel_size: 3
|
227 |
+
duration_predictor_dropout_rate: 0.2
|
228 |
+
postnet_layers: 5
|
229 |
+
postnet_filts: 5
|
230 |
+
postnet_chans: 256
|
231 |
+
use_masking: true
|
232 |
+
use_scaled_pos_enc: true
|
233 |
+
encoder_normalize_before: true
|
234 |
+
decoder_normalize_before: true
|
235 |
+
reduction_factor: 1
|
236 |
+
init_type: xavier_uniform
|
237 |
+
init_enc_alpha: 1.0
|
238 |
+
init_dec_alpha: 1.0
|
239 |
+
transformer_enc_dropout_rate: 0.2
|
240 |
+
transformer_enc_positional_dropout_rate: 0.2
|
241 |
+
transformer_enc_attn_dropout_rate: 0.2
|
242 |
+
transformer_dec_dropout_rate: 0.2
|
243 |
+
transformer_dec_positional_dropout_rate: 0.2
|
244 |
+
transformer_dec_attn_dropout_rate: 0.2
|
245 |
+
ref_enc_conv_layers: 2
|
246 |
+
ref_enc_conv_kernel_size: 3
|
247 |
+
ref_enc_conv_stride: 2
|
248 |
+
ref_enc_gru_layers: 1
|
249 |
+
ref_enc_gru_units: 32
|
250 |
+
ref_emb_integration_type: add
|
251 |
+
prosody_num_embs: 32
|
252 |
+
prosody_hidden_dim: 3
|
253 |
+
prosody_emb_integration_type: add
|
254 |
+
pitch_extract: null
|
255 |
+
pitch_extract_conf: {}
|
256 |
+
pitch_normalize: null
|
257 |
+
pitch_normalize_conf: {}
|
258 |
+
energy_extract: null
|
259 |
+
energy_extract_conf: {}
|
260 |
+
energy_normalize: null
|
261 |
+
energy_normalize_conf: {}
|
262 |
+
required:
|
263 |
+
- output_dir
|
264 |
+
- token_list
|
265 |
+
version: 0.9.9
|
266 |
+
distributed: false
|
espnet/__init__.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Initialize espnet package."""
|
2 |
+
|
3 |
+
import os
|
4 |
+
|
5 |
+
dirname = os.path.dirname(__file__)
|
6 |
+
version_file = os.path.join(dirname, "version.txt")
|
7 |
+
with open(version_file, "r") as f:
|
8 |
+
__version__ = f.read().strip()
|
espnet/asr/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
"""Initialize sub package."""
|
espnet/asr/asr_mix_utils.py
ADDED
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
"""
|
4 |
+
This script is used to provide utility functions designed for multi-speaker ASR.
|
5 |
+
|
6 |
+
Copyright 2017 Johns Hopkins University (Shinji Watanabe)
|
7 |
+
Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
8 |
+
|
9 |
+
Most functions can be directly used as in asr_utils.py:
|
10 |
+
CompareValueTrigger, restore_snapshot, adadelta_eps_decay, chainer_load,
|
11 |
+
torch_snapshot, torch_save, torch_resume, AttributeDict, get_model_conf.
|
12 |
+
|
13 |
+
"""
|
14 |
+
|
15 |
+
import copy
|
16 |
+
import logging
|
17 |
+
import os
|
18 |
+
|
19 |
+
from chainer.training import extension
|
20 |
+
|
21 |
+
import matplotlib
|
22 |
+
|
23 |
+
from espnet.asr.asr_utils import parse_hypothesis
|
24 |
+
|
25 |
+
|
26 |
+
matplotlib.use("Agg")
|
27 |
+
|
28 |
+
|
29 |
+
# * -------------------- chainer extension related -------------------- *
|
30 |
+
class PlotAttentionReport(extension.Extension):
|
31 |
+
"""Plot attention reporter.
|
32 |
+
|
33 |
+
Args:
|
34 |
+
att_vis_fn (espnet.nets.*_backend.e2e_asr.calculate_all_attentions):
|
35 |
+
Function of attention visualization.
|
36 |
+
data (list[tuple(str, dict[str, dict[str, Any]])]): List json utt key items.
|
37 |
+
outdir (str): Directory to save figures.
|
38 |
+
converter (espnet.asr.*_backend.asr.CustomConverter):
|
39 |
+
CustomConverter object. Function to convert data.
|
40 |
+
device (torch.device): The destination device to send tensor.
|
41 |
+
reverse (bool): If True, input and output length are reversed.
|
42 |
+
|
43 |
+
"""
|
44 |
+
|
45 |
+
def __init__(self, att_vis_fn, data, outdir, converter, device, reverse=False):
|
46 |
+
"""Initialize PlotAttentionReport."""
|
47 |
+
self.att_vis_fn = att_vis_fn
|
48 |
+
self.data = copy.deepcopy(data)
|
49 |
+
self.outdir = outdir
|
50 |
+
self.converter = converter
|
51 |
+
self.device = device
|
52 |
+
self.reverse = reverse
|
53 |
+
if not os.path.exists(self.outdir):
|
54 |
+
os.makedirs(self.outdir)
|
55 |
+
|
56 |
+
def __call__(self, trainer):
|
57 |
+
"""Plot and save imaged matrix of att_ws."""
|
58 |
+
att_ws_sd = self.get_attention_weights()
|
59 |
+
for ns, att_ws in enumerate(att_ws_sd):
|
60 |
+
for idx, att_w in enumerate(att_ws):
|
61 |
+
filename = "%s/%s.ep.{.updater.epoch}.output%d.png" % (
|
62 |
+
self.outdir,
|
63 |
+
self.data[idx][0],
|
64 |
+
ns + 1,
|
65 |
+
)
|
66 |
+
att_w = self.get_attention_weight(idx, att_w, ns)
|
67 |
+
self._plot_and_save_attention(att_w, filename.format(trainer))
|
68 |
+
|
69 |
+
def log_attentions(self, logger, step):
|
70 |
+
"""Add image files of attention matrix to tensorboard."""
|
71 |
+
att_ws_sd = self.get_attention_weights()
|
72 |
+
for ns, att_ws in enumerate(att_ws_sd):
|
73 |
+
for idx, att_w in enumerate(att_ws):
|
74 |
+
att_w = self.get_attention_weight(idx, att_w, ns)
|
75 |
+
plot = self.draw_attention_plot(att_w)
|
76 |
+
logger.add_figure("%s" % (self.data[idx][0]), plot.gcf(), step)
|
77 |
+
plot.clf()
|
78 |
+
|
79 |
+
def get_attention_weights(self):
|
80 |
+
"""Return attention weights.
|
81 |
+
|
82 |
+
Returns:
|
83 |
+
arr_ws_sd (numpy.ndarray): attention weights. It's shape would be
|
84 |
+
differ from bachend.dtype=float
|
85 |
+
* pytorch-> 1) multi-head case => (B, H, Lmax, Tmax). 2)
|
86 |
+
other case => (B, Lmax, Tmax).
|
87 |
+
* chainer-> attention weights (B, Lmax, Tmax).
|
88 |
+
|
89 |
+
"""
|
90 |
+
batch = self.converter([self.converter.transform(self.data)], self.device)
|
91 |
+
att_ws_sd = self.att_vis_fn(*batch)
|
92 |
+
return att_ws_sd
|
93 |
+
|
94 |
+
def get_attention_weight(self, idx, att_w, spkr_idx):
|
95 |
+
"""Transform attention weight in regard to self.reverse."""
|
96 |
+
if self.reverse:
|
97 |
+
dec_len = int(self.data[idx][1]["input"][0]["shape"][0])
|
98 |
+
enc_len = int(self.data[idx][1]["output"][spkr_idx]["shape"][0])
|
99 |
+
else:
|
100 |
+
dec_len = int(self.data[idx][1]["output"][spkr_idx]["shape"][0])
|
101 |
+
enc_len = int(self.data[idx][1]["input"][0]["shape"][0])
|
102 |
+
if len(att_w.shape) == 3:
|
103 |
+
att_w = att_w[:, :dec_len, :enc_len]
|
104 |
+
else:
|
105 |
+
att_w = att_w[:dec_len, :enc_len]
|
106 |
+
return att_w
|
107 |
+
|
108 |
+
def draw_attention_plot(self, att_w):
|
109 |
+
"""Visualize attention weights matrix.
|
110 |
+
|
111 |
+
Args:
|
112 |
+
att_w(Tensor): Attention weight matrix.
|
113 |
+
|
114 |
+
Returns:
|
115 |
+
matplotlib.pyplot: pyplot object with attention matrix image.
|
116 |
+
|
117 |
+
"""
|
118 |
+
import matplotlib.pyplot as plt
|
119 |
+
|
120 |
+
if len(att_w.shape) == 3:
|
121 |
+
for h, aw in enumerate(att_w, 1):
|
122 |
+
plt.subplot(1, len(att_w), h)
|
123 |
+
plt.imshow(aw, aspect="auto")
|
124 |
+
plt.xlabel("Encoder Index")
|
125 |
+
plt.ylabel("Decoder Index")
|
126 |
+
else:
|
127 |
+
plt.imshow(att_w, aspect="auto")
|
128 |
+
plt.xlabel("Encoder Index")
|
129 |
+
plt.ylabel("Decoder Index")
|
130 |
+
plt.tight_layout()
|
131 |
+
return plt
|
132 |
+
|
133 |
+
def _plot_and_save_attention(self, att_w, filename):
|
134 |
+
plt = self.draw_attention_plot(att_w)
|
135 |
+
plt.savefig(filename)
|
136 |
+
plt.close()
|
137 |
+
|
138 |
+
|
139 |
+
def add_results_to_json(js, nbest_hyps_sd, char_list):
|
140 |
+
"""Add N-best results to json.
|
141 |
+
|
142 |
+
Args:
|
143 |
+
js (dict[str, Any]): Groundtruth utterance dict.
|
144 |
+
nbest_hyps_sd (list[dict[str, Any]]):
|
145 |
+
List of hypothesis for multi_speakers (# Utts x # Spkrs).
|
146 |
+
char_list (list[str]): List of characters.
|
147 |
+
|
148 |
+
Returns:
|
149 |
+
dict[str, Any]: N-best results added utterance dict.
|
150 |
+
|
151 |
+
"""
|
152 |
+
# copy old json info
|
153 |
+
new_js = dict()
|
154 |
+
new_js["utt2spk"] = js["utt2spk"]
|
155 |
+
num_spkrs = len(nbest_hyps_sd)
|
156 |
+
new_js["output"] = []
|
157 |
+
|
158 |
+
for ns in range(num_spkrs):
|
159 |
+
tmp_js = []
|
160 |
+
nbest_hyps = nbest_hyps_sd[ns]
|
161 |
+
|
162 |
+
for n, hyp in enumerate(nbest_hyps, 1):
|
163 |
+
# parse hypothesis
|
164 |
+
rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp, char_list)
|
165 |
+
|
166 |
+
# copy ground-truth
|
167 |
+
out_dic = dict(js["output"][ns].items())
|
168 |
+
|
169 |
+
# update name
|
170 |
+
out_dic["name"] += "[%d]" % n
|
171 |
+
|
172 |
+
# add recognition results
|
173 |
+
out_dic["rec_text"] = rec_text
|
174 |
+
out_dic["rec_token"] = rec_token
|
175 |
+
out_dic["rec_tokenid"] = rec_tokenid
|
176 |
+
out_dic["score"] = score
|
177 |
+
|
178 |
+
# add to list of N-best result dicts
|
179 |
+
tmp_js.append(out_dic)
|
180 |
+
|
181 |
+
# show 1-best result
|
182 |
+
if n == 1:
|
183 |
+
logging.info("groundtruth: %s" % out_dic["text"])
|
184 |
+
logging.info("prediction : %s" % out_dic["rec_text"])
|
185 |
+
|
186 |
+
new_js["output"].append(tmp_js)
|
187 |
+
return new_js
|
espnet/asr/asr_utils.py
ADDED
@@ -0,0 +1,1024 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
|
2 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
3 |
+
|
4 |
+
import argparse
|
5 |
+
import copy
|
6 |
+
import json
|
7 |
+
import logging
|
8 |
+
import os
|
9 |
+
import shutil
|
10 |
+
import tempfile
|
11 |
+
|
12 |
+
import numpy as np
|
13 |
+
import torch
|
14 |
+
|
15 |
+
|
16 |
+
# * -------------------- training iterator related -------------------- *
|
17 |
+
|
18 |
+
|
19 |
+
class CompareValueTrigger(object):
|
20 |
+
"""Trigger invoked when key value getting bigger or lower than before.
|
21 |
+
|
22 |
+
Args:
|
23 |
+
key (str) : Key of value.
|
24 |
+
compare_fn ((float, float) -> bool) : Function to compare the values.
|
25 |
+
trigger (tuple(int, str)) : Trigger that decide the comparison interval.
|
26 |
+
|
27 |
+
"""
|
28 |
+
|
29 |
+
def __init__(self, key, compare_fn, trigger=(1, "epoch")):
|
30 |
+
from chainer import training
|
31 |
+
|
32 |
+
self._key = key
|
33 |
+
self._best_value = None
|
34 |
+
self._interval_trigger = training.util.get_trigger(trigger)
|
35 |
+
self._init_summary()
|
36 |
+
self._compare_fn = compare_fn
|
37 |
+
|
38 |
+
def __call__(self, trainer):
|
39 |
+
"""Get value related to the key and compare with current value."""
|
40 |
+
observation = trainer.observation
|
41 |
+
summary = self._summary
|
42 |
+
key = self._key
|
43 |
+
if key in observation:
|
44 |
+
summary.add({key: observation[key]})
|
45 |
+
|
46 |
+
if not self._interval_trigger(trainer):
|
47 |
+
return False
|
48 |
+
|
49 |
+
stats = summary.compute_mean()
|
50 |
+
value = float(stats[key]) # copy to CPU
|
51 |
+
self._init_summary()
|
52 |
+
|
53 |
+
if self._best_value is None:
|
54 |
+
# initialize best value
|
55 |
+
self._best_value = value
|
56 |
+
return False
|
57 |
+
elif self._compare_fn(self._best_value, value):
|
58 |
+
return True
|
59 |
+
else:
|
60 |
+
self._best_value = value
|
61 |
+
return False
|
62 |
+
|
63 |
+
def _init_summary(self):
|
64 |
+
import chainer
|
65 |
+
|
66 |
+
self._summary = chainer.reporter.DictSummary()
|
67 |
+
|
68 |
+
|
69 |
+
try:
|
70 |
+
from chainer.training import extension
|
71 |
+
except ImportError:
|
72 |
+
PlotAttentionReport = None
|
73 |
+
else:
|
74 |
+
|
75 |
+
class PlotAttentionReport(extension.Extension):
|
76 |
+
"""Plot attention reporter.
|
77 |
+
|
78 |
+
Args:
|
79 |
+
att_vis_fn (espnet.nets.*_backend.e2e_asr.E2E.calculate_all_attentions):
|
80 |
+
Function of attention visualization.
|
81 |
+
data (list[tuple(str, dict[str, list[Any]])]): List json utt key items.
|
82 |
+
outdir (str): Directory to save figures.
|
83 |
+
converter (espnet.asr.*_backend.asr.CustomConverter):
|
84 |
+
Function to convert data.
|
85 |
+
device (int | torch.device): Device.
|
86 |
+
reverse (bool): If True, input and output length are reversed.
|
87 |
+
ikey (str): Key to access input
|
88 |
+
(for ASR/ST ikey="input", for MT ikey="output".)
|
89 |
+
iaxis (int): Dimension to access input
|
90 |
+
(for ASR/ST iaxis=0, for MT iaxis=1.)
|
91 |
+
okey (str): Key to access output
|
92 |
+
(for ASR/ST okey="input", MT okay="output".)
|
93 |
+
oaxis (int): Dimension to access output
|
94 |
+
(for ASR/ST oaxis=0, for MT oaxis=0.)
|
95 |
+
subsampling_factor (int): subsampling factor in encoder
|
96 |
+
|
97 |
+
"""
|
98 |
+
|
99 |
+
def __init__(
|
100 |
+
self,
|
101 |
+
att_vis_fn,
|
102 |
+
data,
|
103 |
+
outdir,
|
104 |
+
converter,
|
105 |
+
transform,
|
106 |
+
device,
|
107 |
+
reverse=False,
|
108 |
+
ikey="input",
|
109 |
+
iaxis=0,
|
110 |
+
okey="output",
|
111 |
+
oaxis=0,
|
112 |
+
subsampling_factor=1,
|
113 |
+
):
|
114 |
+
self.att_vis_fn = att_vis_fn
|
115 |
+
self.data = copy.deepcopy(data)
|
116 |
+
self.data_dict = {k: v for k, v in copy.deepcopy(data)}
|
117 |
+
# key is utterance ID
|
118 |
+
self.outdir = outdir
|
119 |
+
self.converter = converter
|
120 |
+
self.transform = transform
|
121 |
+
self.device = device
|
122 |
+
self.reverse = reverse
|
123 |
+
self.ikey = ikey
|
124 |
+
self.iaxis = iaxis
|
125 |
+
self.okey = okey
|
126 |
+
self.oaxis = oaxis
|
127 |
+
self.factor = subsampling_factor
|
128 |
+
if not os.path.exists(self.outdir):
|
129 |
+
os.makedirs(self.outdir)
|
130 |
+
|
131 |
+
def __call__(self, trainer):
|
132 |
+
"""Plot and save image file of att_ws matrix."""
|
133 |
+
att_ws, uttid_list = self.get_attention_weights()
|
134 |
+
if isinstance(att_ws, list): # multi-encoder case
|
135 |
+
num_encs = len(att_ws) - 1
|
136 |
+
# atts
|
137 |
+
for i in range(num_encs):
|
138 |
+
for idx, att_w in enumerate(att_ws[i]):
|
139 |
+
filename = "%s/%s.ep.{.updater.epoch}.att%d.png" % (
|
140 |
+
self.outdir,
|
141 |
+
uttid_list[idx],
|
142 |
+
i + 1,
|
143 |
+
)
|
144 |
+
att_w = self.trim_attention_weight(uttid_list[idx], att_w)
|
145 |
+
np_filename = "%s/%s.ep.{.updater.epoch}.att%d.npy" % (
|
146 |
+
self.outdir,
|
147 |
+
uttid_list[idx],
|
148 |
+
i + 1,
|
149 |
+
)
|
150 |
+
np.save(np_filename.format(trainer), att_w)
|
151 |
+
self._plot_and_save_attention(att_w, filename.format(trainer))
|
152 |
+
# han
|
153 |
+
for idx, att_w in enumerate(att_ws[num_encs]):
|
154 |
+
filename = "%s/%s.ep.{.updater.epoch}.han.png" % (
|
155 |
+
self.outdir,
|
156 |
+
uttid_list[idx],
|
157 |
+
)
|
158 |
+
att_w = self.trim_attention_weight(uttid_list[idx], att_w)
|
159 |
+
np_filename = "%s/%s.ep.{.updater.epoch}.han.npy" % (
|
160 |
+
self.outdir,
|
161 |
+
uttid_list[idx],
|
162 |
+
)
|
163 |
+
np.save(np_filename.format(trainer), att_w)
|
164 |
+
self._plot_and_save_attention(
|
165 |
+
att_w, filename.format(trainer), han_mode=True
|
166 |
+
)
|
167 |
+
else:
|
168 |
+
for idx, att_w in enumerate(att_ws):
|
169 |
+
filename = "%s/%s.ep.{.updater.epoch}.png" % (
|
170 |
+
self.outdir,
|
171 |
+
uttid_list[idx],
|
172 |
+
)
|
173 |
+
att_w = self.trim_attention_weight(uttid_list[idx], att_w)
|
174 |
+
np_filename = "%s/%s.ep.{.updater.epoch}.npy" % (
|
175 |
+
self.outdir,
|
176 |
+
uttid_list[idx],
|
177 |
+
)
|
178 |
+
np.save(np_filename.format(trainer), att_w)
|
179 |
+
self._plot_and_save_attention(att_w, filename.format(trainer))
|
180 |
+
|
181 |
+
def log_attentions(self, logger, step):
|
182 |
+
"""Add image files of att_ws matrix to the tensorboard."""
|
183 |
+
att_ws, uttid_list = self.get_attention_weights()
|
184 |
+
if isinstance(att_ws, list): # multi-encoder case
|
185 |
+
num_encs = len(att_ws) - 1
|
186 |
+
# atts
|
187 |
+
for i in range(num_encs):
|
188 |
+
for idx, att_w in enumerate(att_ws[i]):
|
189 |
+
att_w = self.trim_attention_weight(uttid_list[idx], att_w)
|
190 |
+
plot = self.draw_attention_plot(att_w)
|
191 |
+
logger.add_figure(
|
192 |
+
"%s_att%d" % (uttid_list[idx], i + 1),
|
193 |
+
plot.gcf(),
|
194 |
+
step,
|
195 |
+
)
|
196 |
+
# han
|
197 |
+
for idx, att_w in enumerate(att_ws[num_encs]):
|
198 |
+
att_w = self.trim_attention_weight(uttid_list[idx], att_w)
|
199 |
+
plot = self.draw_han_plot(att_w)
|
200 |
+
logger.add_figure(
|
201 |
+
"%s_han" % (uttid_list[idx]),
|
202 |
+
plot.gcf(),
|
203 |
+
step,
|
204 |
+
)
|
205 |
+
else:
|
206 |
+
for idx, att_w in enumerate(att_ws):
|
207 |
+
att_w = self.trim_attention_weight(uttid_list[idx], att_w)
|
208 |
+
plot = self.draw_attention_plot(att_w)
|
209 |
+
logger.add_figure("%s" % (uttid_list[idx]), plot.gcf(), step)
|
210 |
+
|
211 |
+
def get_attention_weights(self):
|
212 |
+
"""Return attention weights.
|
213 |
+
|
214 |
+
Returns:
|
215 |
+
numpy.ndarray: attention weights. float. Its shape would be
|
216 |
+
differ from backend.
|
217 |
+
* pytorch-> 1) multi-head case => (B, H, Lmax, Tmax), 2)
|
218 |
+
other case => (B, Lmax, Tmax).
|
219 |
+
* chainer-> (B, Lmax, Tmax)
|
220 |
+
|
221 |
+
"""
|
222 |
+
return_batch, uttid_list = self.transform(self.data, return_uttid=True)
|
223 |
+
batch = self.converter([return_batch], self.device)
|
224 |
+
if isinstance(batch, tuple):
|
225 |
+
att_ws = self.att_vis_fn(*batch)
|
226 |
+
else:
|
227 |
+
att_ws = self.att_vis_fn(**batch)
|
228 |
+
return att_ws, uttid_list
|
229 |
+
|
230 |
+
def trim_attention_weight(self, uttid, att_w):
|
231 |
+
"""Transform attention matrix with regard to self.reverse."""
|
232 |
+
if self.reverse:
|
233 |
+
enc_key, enc_axis = self.okey, self.oaxis
|
234 |
+
dec_key, dec_axis = self.ikey, self.iaxis
|
235 |
+
else:
|
236 |
+
enc_key, enc_axis = self.ikey, self.iaxis
|
237 |
+
dec_key, dec_axis = self.okey, self.oaxis
|
238 |
+
dec_len = int(self.data_dict[uttid][dec_key][dec_axis]["shape"][0])
|
239 |
+
enc_len = int(self.data_dict[uttid][enc_key][enc_axis]["shape"][0])
|
240 |
+
if self.factor > 1:
|
241 |
+
enc_len //= self.factor
|
242 |
+
if len(att_w.shape) == 3:
|
243 |
+
att_w = att_w[:, :dec_len, :enc_len]
|
244 |
+
else:
|
245 |
+
att_w = att_w[:dec_len, :enc_len]
|
246 |
+
return att_w
|
247 |
+
|
248 |
+
def draw_attention_plot(self, att_w):
|
249 |
+
"""Plot the att_w matrix.
|
250 |
+
|
251 |
+
Returns:
|
252 |
+
matplotlib.pyplot: pyplot object with attention matrix image.
|
253 |
+
|
254 |
+
"""
|
255 |
+
import matplotlib
|
256 |
+
|
257 |
+
matplotlib.use("Agg")
|
258 |
+
import matplotlib.pyplot as plt
|
259 |
+
|
260 |
+
plt.clf()
|
261 |
+
att_w = att_w.astype(np.float32)
|
262 |
+
if len(att_w.shape) == 3:
|
263 |
+
for h, aw in enumerate(att_w, 1):
|
264 |
+
plt.subplot(1, len(att_w), h)
|
265 |
+
plt.imshow(aw, aspect="auto")
|
266 |
+
plt.xlabel("Encoder Index")
|
267 |
+
plt.ylabel("Decoder Index")
|
268 |
+
else:
|
269 |
+
plt.imshow(att_w, aspect="auto")
|
270 |
+
plt.xlabel("Encoder Index")
|
271 |
+
plt.ylabel("Decoder Index")
|
272 |
+
plt.tight_layout()
|
273 |
+
return plt
|
274 |
+
|
275 |
+
def draw_han_plot(self, att_w):
|
276 |
+
"""Plot the att_w matrix for hierarchical attention.
|
277 |
+
|
278 |
+
Returns:
|
279 |
+
matplotlib.pyplot: pyplot object with attention matrix image.
|
280 |
+
|
281 |
+
"""
|
282 |
+
import matplotlib
|
283 |
+
|
284 |
+
matplotlib.use("Agg")
|
285 |
+
import matplotlib.pyplot as plt
|
286 |
+
|
287 |
+
plt.clf()
|
288 |
+
if len(att_w.shape) == 3:
|
289 |
+
for h, aw in enumerate(att_w, 1):
|
290 |
+
legends = []
|
291 |
+
plt.subplot(1, len(att_w), h)
|
292 |
+
for i in range(aw.shape[1]):
|
293 |
+
plt.plot(aw[:, i])
|
294 |
+
legends.append("Att{}".format(i))
|
295 |
+
plt.ylim([0, 1.0])
|
296 |
+
plt.xlim([0, aw.shape[0]])
|
297 |
+
plt.grid(True)
|
298 |
+
plt.ylabel("Attention Weight")
|
299 |
+
plt.xlabel("Decoder Index")
|
300 |
+
plt.legend(legends)
|
301 |
+
else:
|
302 |
+
legends = []
|
303 |
+
for i in range(att_w.shape[1]):
|
304 |
+
plt.plot(att_w[:, i])
|
305 |
+
legends.append("Att{}".format(i))
|
306 |
+
plt.ylim([0, 1.0])
|
307 |
+
plt.xlim([0, att_w.shape[0]])
|
308 |
+
plt.grid(True)
|
309 |
+
plt.ylabel("Attention Weight")
|
310 |
+
plt.xlabel("Decoder Index")
|
311 |
+
plt.legend(legends)
|
312 |
+
plt.tight_layout()
|
313 |
+
return plt
|
314 |
+
|
315 |
+
def _plot_and_save_attention(self, att_w, filename, han_mode=False):
|
316 |
+
if han_mode:
|
317 |
+
plt = self.draw_han_plot(att_w)
|
318 |
+
else:
|
319 |
+
plt = self.draw_attention_plot(att_w)
|
320 |
+
plt.savefig(filename)
|
321 |
+
plt.close()
|
322 |
+
|
323 |
+
|
324 |
+
try:
|
325 |
+
from chainer.training import extension
|
326 |
+
except ImportError:
|
327 |
+
PlotCTCReport = None
|
328 |
+
else:
|
329 |
+
|
330 |
+
class PlotCTCReport(extension.Extension):
|
331 |
+
"""Plot CTC reporter.
|
332 |
+
|
333 |
+
Args:
|
334 |
+
ctc_vis_fn (espnet.nets.*_backend.e2e_asr.E2E.calculate_all_ctc_probs):
|
335 |
+
Function of CTC visualization.
|
336 |
+
data (list[tuple(str, dict[str, list[Any]])]): List json utt key items.
|
337 |
+
outdir (str): Directory to save figures.
|
338 |
+
converter (espnet.asr.*_backend.asr.CustomConverter):
|
339 |
+
Function to convert data.
|
340 |
+
device (int | torch.device): Device.
|
341 |
+
reverse (bool): If True, input and output length are reversed.
|
342 |
+
ikey (str): Key to access input
|
343 |
+
(for ASR/ST ikey="input", for MT ikey="output".)
|
344 |
+
iaxis (int): Dimension to access input
|
345 |
+
(for ASR/ST iaxis=0, for MT iaxis=1.)
|
346 |
+
okey (str): Key to access output
|
347 |
+
(for ASR/ST okey="input", MT okay="output".)
|
348 |
+
oaxis (int): Dimension to access output
|
349 |
+
(for ASR/ST oaxis=0, for MT oaxis=0.)
|
350 |
+
subsampling_factor (int): subsampling factor in encoder
|
351 |
+
|
352 |
+
"""
|
353 |
+
|
354 |
+
def __init__(
|
355 |
+
self,
|
356 |
+
ctc_vis_fn,
|
357 |
+
data,
|
358 |
+
outdir,
|
359 |
+
converter,
|
360 |
+
transform,
|
361 |
+
device,
|
362 |
+
reverse=False,
|
363 |
+
ikey="input",
|
364 |
+
iaxis=0,
|
365 |
+
okey="output",
|
366 |
+
oaxis=0,
|
367 |
+
subsampling_factor=1,
|
368 |
+
):
|
369 |
+
self.ctc_vis_fn = ctc_vis_fn
|
370 |
+
self.data = copy.deepcopy(data)
|
371 |
+
self.data_dict = {k: v for k, v in copy.deepcopy(data)}
|
372 |
+
# key is utterance ID
|
373 |
+
self.outdir = outdir
|
374 |
+
self.converter = converter
|
375 |
+
self.transform = transform
|
376 |
+
self.device = device
|
377 |
+
self.reverse = reverse
|
378 |
+
self.ikey = ikey
|
379 |
+
self.iaxis = iaxis
|
380 |
+
self.okey = okey
|
381 |
+
self.oaxis = oaxis
|
382 |
+
self.factor = subsampling_factor
|
383 |
+
if not os.path.exists(self.outdir):
|
384 |
+
os.makedirs(self.outdir)
|
385 |
+
|
386 |
+
def __call__(self, trainer):
|
387 |
+
"""Plot and save image file of ctc prob."""
|
388 |
+
ctc_probs, uttid_list = self.get_ctc_probs()
|
389 |
+
if isinstance(ctc_probs, list): # multi-encoder case
|
390 |
+
num_encs = len(ctc_probs) - 1
|
391 |
+
for i in range(num_encs):
|
392 |
+
for idx, ctc_prob in enumerate(ctc_probs[i]):
|
393 |
+
filename = "%s/%s.ep.{.updater.epoch}.ctc%d.png" % (
|
394 |
+
self.outdir,
|
395 |
+
uttid_list[idx],
|
396 |
+
i + 1,
|
397 |
+
)
|
398 |
+
ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob)
|
399 |
+
np_filename = "%s/%s.ep.{.updater.epoch}.ctc%d.npy" % (
|
400 |
+
self.outdir,
|
401 |
+
uttid_list[idx],
|
402 |
+
i + 1,
|
403 |
+
)
|
404 |
+
np.save(np_filename.format(trainer), ctc_prob)
|
405 |
+
self._plot_and_save_ctc(ctc_prob, filename.format(trainer))
|
406 |
+
else:
|
407 |
+
for idx, ctc_prob in enumerate(ctc_probs):
|
408 |
+
filename = "%s/%s.ep.{.updater.epoch}.png" % (
|
409 |
+
self.outdir,
|
410 |
+
uttid_list[idx],
|
411 |
+
)
|
412 |
+
ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob)
|
413 |
+
np_filename = "%s/%s.ep.{.updater.epoch}.npy" % (
|
414 |
+
self.outdir,
|
415 |
+
uttid_list[idx],
|
416 |
+
)
|
417 |
+
np.save(np_filename.format(trainer), ctc_prob)
|
418 |
+
self._plot_and_save_ctc(ctc_prob, filename.format(trainer))
|
419 |
+
|
420 |
+
def log_ctc_probs(self, logger, step):
|
421 |
+
"""Add image files of ctc probs to the tensorboard."""
|
422 |
+
ctc_probs, uttid_list = self.get_ctc_probs()
|
423 |
+
if isinstance(ctc_probs, list): # multi-encoder case
|
424 |
+
num_encs = len(ctc_probs) - 1
|
425 |
+
for i in range(num_encs):
|
426 |
+
for idx, ctc_prob in enumerate(ctc_probs[i]):
|
427 |
+
ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob)
|
428 |
+
plot = self.draw_ctc_plot(ctc_prob)
|
429 |
+
logger.add_figure(
|
430 |
+
"%s_ctc%d" % (uttid_list[idx], i + 1),
|
431 |
+
plot.gcf(),
|
432 |
+
step,
|
433 |
+
)
|
434 |
+
else:
|
435 |
+
for idx, ctc_prob in enumerate(ctc_probs):
|
436 |
+
ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob)
|
437 |
+
plot = self.draw_ctc_plot(ctc_prob)
|
438 |
+
logger.add_figure("%s" % (uttid_list[idx]), plot.gcf(), step)
|
439 |
+
|
440 |
+
def get_ctc_probs(self):
|
441 |
+
"""Return CTC probs.
|
442 |
+
|
443 |
+
Returns:
|
444 |
+
numpy.ndarray: CTC probs. float. Its shape would be
|
445 |
+
differ from backend. (B, Tmax, vocab).
|
446 |
+
|
447 |
+
"""
|
448 |
+
return_batch, uttid_list = self.transform(self.data, return_uttid=True)
|
449 |
+
batch = self.converter([return_batch], self.device)
|
450 |
+
if isinstance(batch, tuple):
|
451 |
+
probs = self.ctc_vis_fn(*batch)
|
452 |
+
else:
|
453 |
+
probs = self.ctc_vis_fn(**batch)
|
454 |
+
return probs, uttid_list
|
455 |
+
|
456 |
+
def trim_ctc_prob(self, uttid, prob):
|
457 |
+
"""Trim CTC posteriors accoding to input lengths."""
|
458 |
+
enc_len = int(self.data_dict[uttid][self.ikey][self.iaxis]["shape"][0])
|
459 |
+
if self.factor > 1:
|
460 |
+
enc_len //= self.factor
|
461 |
+
prob = prob[:enc_len]
|
462 |
+
return prob
|
463 |
+
|
464 |
+
def draw_ctc_plot(self, ctc_prob):
|
465 |
+
"""Plot the ctc_prob matrix.
|
466 |
+
|
467 |
+
Returns:
|
468 |
+
matplotlib.pyplot: pyplot object with CTC prob matrix image.
|
469 |
+
|
470 |
+
"""
|
471 |
+
import matplotlib
|
472 |
+
|
473 |
+
matplotlib.use("Agg")
|
474 |
+
import matplotlib.pyplot as plt
|
475 |
+
|
476 |
+
ctc_prob = ctc_prob.astype(np.float32)
|
477 |
+
|
478 |
+
plt.clf()
|
479 |
+
topk_ids = np.argsort(ctc_prob, axis=1)
|
480 |
+
n_frames, vocab = ctc_prob.shape
|
481 |
+
times_probs = np.arange(n_frames)
|
482 |
+
|
483 |
+
plt.figure(figsize=(20, 8))
|
484 |
+
|
485 |
+
# NOTE: index 0 is reserved for blank
|
486 |
+
for idx in set(topk_ids.reshape(-1).tolist()):
|
487 |
+
if idx == 0:
|
488 |
+
plt.plot(
|
489 |
+
times_probs, ctc_prob[:, 0], ":", label="<blank>", color="grey"
|
490 |
+
)
|
491 |
+
else:
|
492 |
+
plt.plot(times_probs, ctc_prob[:, idx])
|
493 |
+
plt.xlabel(u"Input [frame]", fontsize=12)
|
494 |
+
plt.ylabel("Posteriors", fontsize=12)
|
495 |
+
plt.xticks(list(range(0, int(n_frames) + 1, 10)))
|
496 |
+
plt.yticks(list(range(0, 2, 1)))
|
497 |
+
plt.tight_layout()
|
498 |
+
return plt
|
499 |
+
|
500 |
+
def _plot_and_save_ctc(self, ctc_prob, filename):
|
501 |
+
plt = self.draw_ctc_plot(ctc_prob)
|
502 |
+
plt.savefig(filename)
|
503 |
+
plt.close()
|
504 |
+
|
505 |
+
|
506 |
+
def restore_snapshot(model, snapshot, load_fn=None):
|
507 |
+
"""Extension to restore snapshot.
|
508 |
+
|
509 |
+
Returns:
|
510 |
+
An extension function.
|
511 |
+
|
512 |
+
"""
|
513 |
+
import chainer
|
514 |
+
from chainer import training
|
515 |
+
|
516 |
+
if load_fn is None:
|
517 |
+
load_fn = chainer.serializers.load_npz
|
518 |
+
|
519 |
+
@training.make_extension(trigger=(1, "epoch"))
|
520 |
+
def restore_snapshot(trainer):
|
521 |
+
_restore_snapshot(model, snapshot, load_fn)
|
522 |
+
|
523 |
+
return restore_snapshot
|
524 |
+
|
525 |
+
|
526 |
+
def _restore_snapshot(model, snapshot, load_fn=None):
|
527 |
+
if load_fn is None:
|
528 |
+
import chainer
|
529 |
+
|
530 |
+
load_fn = chainer.serializers.load_npz
|
531 |
+
|
532 |
+
load_fn(snapshot, model)
|
533 |
+
logging.info("restored from " + str(snapshot))
|
534 |
+
|
535 |
+
|
536 |
+
def adadelta_eps_decay(eps_decay):
|
537 |
+
"""Extension to perform adadelta eps decay.
|
538 |
+
|
539 |
+
Args:
|
540 |
+
eps_decay (float): Decay rate of eps.
|
541 |
+
|
542 |
+
Returns:
|
543 |
+
An extension function.
|
544 |
+
|
545 |
+
"""
|
546 |
+
from chainer import training
|
547 |
+
|
548 |
+
@training.make_extension(trigger=(1, "epoch"))
|
549 |
+
def adadelta_eps_decay(trainer):
|
550 |
+
_adadelta_eps_decay(trainer, eps_decay)
|
551 |
+
|
552 |
+
return adadelta_eps_decay
|
553 |
+
|
554 |
+
|
555 |
+
def _adadelta_eps_decay(trainer, eps_decay):
|
556 |
+
optimizer = trainer.updater.get_optimizer("main")
|
557 |
+
# for chainer
|
558 |
+
if hasattr(optimizer, "eps"):
|
559 |
+
current_eps = optimizer.eps
|
560 |
+
setattr(optimizer, "eps", current_eps * eps_decay)
|
561 |
+
logging.info("adadelta eps decayed to " + str(optimizer.eps))
|
562 |
+
# pytorch
|
563 |
+
else:
|
564 |
+
for p in optimizer.param_groups:
|
565 |
+
p["eps"] *= eps_decay
|
566 |
+
logging.info("adadelta eps decayed to " + str(p["eps"]))
|
567 |
+
|
568 |
+
|
569 |
+
def adam_lr_decay(eps_decay):
|
570 |
+
"""Extension to perform adam lr decay.
|
571 |
+
|
572 |
+
Args:
|
573 |
+
eps_decay (float): Decay rate of lr.
|
574 |
+
|
575 |
+
Returns:
|
576 |
+
An extension function.
|
577 |
+
|
578 |
+
"""
|
579 |
+
from chainer import training
|
580 |
+
|
581 |
+
@training.make_extension(trigger=(1, "epoch"))
|
582 |
+
def adam_lr_decay(trainer):
|
583 |
+
_adam_lr_decay(trainer, eps_decay)
|
584 |
+
|
585 |
+
return adam_lr_decay
|
586 |
+
|
587 |
+
|
588 |
+
def _adam_lr_decay(trainer, eps_decay):
|
589 |
+
optimizer = trainer.updater.get_optimizer("main")
|
590 |
+
# for chainer
|
591 |
+
if hasattr(optimizer, "lr"):
|
592 |
+
current_lr = optimizer.lr
|
593 |
+
setattr(optimizer, "lr", current_lr * eps_decay)
|
594 |
+
logging.info("adam lr decayed to " + str(optimizer.lr))
|
595 |
+
# pytorch
|
596 |
+
else:
|
597 |
+
for p in optimizer.param_groups:
|
598 |
+
p["lr"] *= eps_decay
|
599 |
+
logging.info("adam lr decayed to " + str(p["lr"]))
|
600 |
+
|
601 |
+
|
602 |
+
def torch_snapshot(savefun=torch.save, filename="snapshot.ep.{.updater.epoch}"):
|
603 |
+
"""Extension to take snapshot of the trainer for pytorch.
|
604 |
+
|
605 |
+
Returns:
|
606 |
+
An extension function.
|
607 |
+
|
608 |
+
"""
|
609 |
+
from chainer.training import extension
|
610 |
+
|
611 |
+
@extension.make_extension(trigger=(1, "epoch"), priority=-100)
|
612 |
+
def torch_snapshot(trainer):
|
613 |
+
_torch_snapshot_object(trainer, trainer, filename.format(trainer), savefun)
|
614 |
+
|
615 |
+
return torch_snapshot
|
616 |
+
|
617 |
+
|
618 |
+
def _torch_snapshot_object(trainer, target, filename, savefun):
|
619 |
+
from chainer.serializers import DictionarySerializer
|
620 |
+
|
621 |
+
# make snapshot_dict dictionary
|
622 |
+
s = DictionarySerializer()
|
623 |
+
s.save(trainer)
|
624 |
+
if hasattr(trainer.updater.model, "model"):
|
625 |
+
# (for TTS)
|
626 |
+
if hasattr(trainer.updater.model.model, "module"):
|
627 |
+
model_state_dict = trainer.updater.model.model.module.state_dict()
|
628 |
+
else:
|
629 |
+
model_state_dict = trainer.updater.model.model.state_dict()
|
630 |
+
else:
|
631 |
+
# (for ASR)
|
632 |
+
if hasattr(trainer.updater.model, "module"):
|
633 |
+
model_state_dict = trainer.updater.model.module.state_dict()
|
634 |
+
else:
|
635 |
+
model_state_dict = trainer.updater.model.state_dict()
|
636 |
+
snapshot_dict = {
|
637 |
+
"trainer": s.target,
|
638 |
+
"model": model_state_dict,
|
639 |
+
"optimizer": trainer.updater.get_optimizer("main").state_dict(),
|
640 |
+
}
|
641 |
+
|
642 |
+
# save snapshot dictionary
|
643 |
+
fn = filename.format(trainer)
|
644 |
+
prefix = "tmp" + fn
|
645 |
+
tmpdir = tempfile.mkdtemp(prefix=prefix, dir=trainer.out)
|
646 |
+
tmppath = os.path.join(tmpdir, fn)
|
647 |
+
try:
|
648 |
+
savefun(snapshot_dict, tmppath)
|
649 |
+
shutil.move(tmppath, os.path.join(trainer.out, fn))
|
650 |
+
finally:
|
651 |
+
shutil.rmtree(tmpdir)
|
652 |
+
|
653 |
+
|
654 |
+
def add_gradient_noise(model, iteration, duration=100, eta=1.0, scale_factor=0.55):
|
655 |
+
"""Adds noise from a standard normal distribution to the gradients.
|
656 |
+
|
657 |
+
The standard deviation (`sigma`) is controlled by the three hyper-parameters below.
|
658 |
+
`sigma` goes to zero (no noise) with more iterations.
|
659 |
+
|
660 |
+
Args:
|
661 |
+
model (torch.nn.model): Model.
|
662 |
+
iteration (int): Number of iterations.
|
663 |
+
duration (int) {100, 1000}:
|
664 |
+
Number of durations to control the interval of the `sigma` change.
|
665 |
+
eta (float) {0.01, 0.3, 1.0}: The magnitude of `sigma`.
|
666 |
+
scale_factor (float) {0.55}: The scale of `sigma`.
|
667 |
+
"""
|
668 |
+
interval = (iteration // duration) + 1
|
669 |
+
sigma = eta / interval ** scale_factor
|
670 |
+
for param in model.parameters():
|
671 |
+
if param.grad is not None:
|
672 |
+
_shape = param.grad.size()
|
673 |
+
noise = sigma * torch.randn(_shape).to(param.device)
|
674 |
+
param.grad += noise
|
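Illustrative sketch (not part of the committed file): a small, self-contained reproduction of the annealing schedule used above, showing how `sigma` shrinks with the iteration count under the default hyper-parameters (eta=1.0, duration=100, scale_factor=0.55):

eta, duration, scale_factor = 1.0, 100, 0.55
for iteration in (0, 100, 1000, 10000):
    interval = (iteration // duration) + 1
    sigma = eta / interval ** scale_factor   # same formula as in add_gradient_noise
    print("iteration=%6d  sigma=%.4f" % (iteration, sigma))
# sigma is roughly 1.000, 0.683, 0.268, 0.079 for these iteration counts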
675 |
+
|
676 |
+
|
677 |
+
# * -------------------- general -------------------- *
|
678 |
+
def get_model_conf(model_path, conf_path=None):
|
679 |
+
"""Get model config information by reading a model config file (model.json).
|
680 |
+
|
681 |
+
Args:
|
682 |
+
model_path (str): Model path.
|
683 |
+
conf_path (str): Optional model config path.
|
684 |
+
|
685 |
+
Returns:
|
686 |
+
list[int, int, dict[str, Any]]: Config information loaded from json file.
|
687 |
+
|
688 |
+
"""
|
689 |
+
if conf_path is None:
|
690 |
+
model_conf = os.path.dirname(model_path) + "/model.json"
|
691 |
+
else:
|
692 |
+
model_conf = conf_path
|
693 |
+
with open(model_conf, "rb") as f:
|
694 |
+
logging.info("reading a config file from " + model_conf)
|
695 |
+
confs = json.load(f)
|
696 |
+
if isinstance(confs, dict):
|
697 |
+
# for lm
|
698 |
+
args = confs
|
699 |
+
return argparse.Namespace(**args)
|
700 |
+
else:
|
701 |
+
# for asr, tts, mt
|
702 |
+
idim, odim, args = confs
|
703 |
+
return idim, odim, argparse.Namespace(**args)
|
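Illustrative sketch (not part of the committed file, values invented): the two model.json layouts this function distinguishes. ASR/TTS/MT configs are stored as an [idim, odim, args] triple, while LM configs are a flat dict, which is why the function branches on isinstance(confs, dict):

import argparse
import json

asr_conf = json.loads('[83, 52, {"etype": "blstmp", "elayers": 4}]')  # hypothetical asr/tts/mt model.json
lm_conf = json.loads('{"layer": 2, "unit": 650}')                     # hypothetical lm model.json

idim, odim, asr_args = asr_conf[0], asr_conf[1], argparse.Namespace(**asr_conf[2])
lm_args = argparse.Namespace(**lm_conf)
print(idim, odim, asr_args.elayers)  # 83 52 4
print(lm_args.layer, lm_args.unit)   # 2 650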
704 |
+
|
705 |
+
|
706 |
+
def chainer_load(path, model):
|
707 |
+
"""Load chainer model parameters.
|
708 |
+
|
709 |
+
Args:
|
710 |
+
path (str): Model path or snapshot file path to be loaded.
|
711 |
+
model (chainer.Chain): Chainer model.
|
712 |
+
|
713 |
+
"""
|
714 |
+
import chainer
|
715 |
+
|
716 |
+
if "snapshot" in os.path.basename(path):
|
717 |
+
chainer.serializers.load_npz(path, model, path="updater/model:main/")
|
718 |
+
else:
|
719 |
+
chainer.serializers.load_npz(path, model)
|
720 |
+
|
721 |
+
|
722 |
+
def torch_save(path, model):
|
723 |
+
"""Save torch model states.
|
724 |
+
|
725 |
+
Args:
|
726 |
+
path (str): Model path to be saved.
|
727 |
+
model (torch.nn.Module): Torch model.
|
728 |
+
|
729 |
+
"""
|
730 |
+
if hasattr(model, "module"):
|
731 |
+
torch.save(model.module.state_dict(), path)
|
732 |
+
else:
|
733 |
+
torch.save(model.state_dict(), path)
|
734 |
+
|
735 |
+
|
736 |
+
def snapshot_object(target, filename):
|
737 |
+
"""Returns a trainer extension to take snapshots of a given object.
|
738 |
+
|
739 |
+
Args:
|
740 |
+
target (model): Object to serialize.
|
741 |
+
filename (str): Name of the file into which the object is serialized. It can
|
742 |
+
be a format string, where the trainer object is passed to
|
743 |
+
the :meth:`str.format` method. For example,
|
744 |
+
``'snapshot_{.updater.iteration}'`` is converted to
|
745 |
+
``'snapshot_10000'`` at the 10,000th iteration.
|
746 |
+
|
747 |
+
Returns:
|
748 |
+
An extension function.
|
749 |
+
|
750 |
+
"""
|
751 |
+
from chainer.training import extension
|
752 |
+
|
753 |
+
@extension.make_extension(trigger=(1, "epoch"), priority=-100)
|
754 |
+
def snapshot_object(trainer):
|
755 |
+
torch_save(os.path.join(trainer.out, filename.format(trainer)), target)
|
756 |
+
|
757 |
+
return snapshot_object
|
758 |
+
|
759 |
+
|
760 |
+
def torch_load(path, model):
|
761 |
+
"""Load torch model states.
|
762 |
+
|
763 |
+
Args:
|
764 |
+
path (str): Model path or snapshot file path to be loaded.
|
765 |
+
model (torch.nn.Module): Torch model.
|
766 |
+
|
767 |
+
"""
|
768 |
+
if "snapshot" in os.path.basename(path):
|
769 |
+
model_state_dict = torch.load(path, map_location=lambda storage, loc: storage)[
|
770 |
+
"model"
|
771 |
+
]
|
772 |
+
else:
|
773 |
+
model_state_dict = torch.load(path, map_location=lambda storage, loc: storage)
|
774 |
+
|
775 |
+
if hasattr(model, "module"):
|
776 |
+
model.module.load_state_dict(model_state_dict)
|
777 |
+
else:
|
778 |
+
model.load_state_dict(model_state_dict)
|
779 |
+
|
780 |
+
del model_state_dict
|
781 |
+
|
782 |
+
|
783 |
+
def torch_resume(snapshot_path, trainer):
|
784 |
+
"""Resume from snapshot for pytorch.
|
785 |
+
|
786 |
+
Args:
|
787 |
+
snapshot_path (str): Snapshot file path.
|
788 |
+
trainer (chainer.training.Trainer): Chainer's trainer instance.
|
789 |
+
|
790 |
+
"""
|
791 |
+
from chainer.serializers import NpzDeserializer
|
792 |
+
|
793 |
+
# load snapshot
|
794 |
+
snapshot_dict = torch.load(snapshot_path, map_location=lambda storage, loc: storage)
|
795 |
+
|
796 |
+
# restore trainer states
|
797 |
+
d = NpzDeserializer(snapshot_dict["trainer"])
|
798 |
+
d.load(trainer)
|
799 |
+
|
800 |
+
# restore model states
|
801 |
+
if hasattr(trainer.updater.model, "model"):
|
802 |
+
# (for TTS model)
|
803 |
+
if hasattr(trainer.updater.model.model, "module"):
|
804 |
+
trainer.updater.model.model.module.load_state_dict(snapshot_dict["model"])
|
805 |
+
else:
|
806 |
+
trainer.updater.model.model.load_state_dict(snapshot_dict["model"])
|
807 |
+
else:
|
808 |
+
# (for ASR model)
|
809 |
+
if hasattr(trainer.updater.model, "module"):
|
810 |
+
trainer.updater.model.module.load_state_dict(snapshot_dict["model"])
|
811 |
+
else:
|
812 |
+
trainer.updater.model.load_state_dict(snapshot_dict["model"])
|
813 |
+
|
814 |
+
# restore optimizer states
|
815 |
+
trainer.updater.get_optimizer("main").load_state_dict(snapshot_dict["optimizer"])
|
816 |
+
|
817 |
+
# delete opened snapshot
|
818 |
+
del snapshot_dict
|
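Illustrative sketch (not part of the committed file, placeholder values): the round trip between torch_snapshot and torch_resume. The snapshot written each epoch is a plain dict with three entries, and torch_resume restores each one in turn:

# Keys written by _torch_snapshot_object and read back by torch_resume (placeholders only):
snapshot_dict = {
    "trainer": {},    # chainer DictionarySerializer target holding trainer/updater state
    "model": {},      # model.state_dict() (module unwrapped if DataParallel is used)
    "optimizer": {},  # optimizer.state_dict()
}
# Resuming then amounts to:
#   NpzDeserializer(snapshot_dict["trainer"]).load(trainer)
#   model.load_state_dict(snapshot_dict["model"])
#   trainer.updater.get_optimizer("main").load_state_dict(snapshot_dict["optimizer"])
print(sorted(snapshot_dict.keys()))  # ['model', 'optimizer', 'trainer']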
819 |
+
|
820 |
+
|
821 |
+
# * ------------------ recognition related ------------------ *
|
822 |
+
def parse_hypothesis(hyp, char_list):
|
823 |
+
"""Parse hypothesis.
|
824 |
+
|
825 |
+
Args:
|
826 |
+
hyp (list[dict[str, Any]]): Recognition hypothesis.
|
827 |
+
char_list (list[str]): List of characters.
|
828 |
+
|
829 |
+
Returns:
|
830 |
+
tuple(str, str, str, float)
|
831 |
+
|
832 |
+
"""
|
833 |
+
# remove sos and get results
|
834 |
+
tokenid_as_list = list(map(int, hyp["yseq"][1:]))
|
835 |
+
token_as_list = [char_list[idx] for idx in tokenid_as_list]
|
836 |
+
score = float(hyp["score"])
|
837 |
+
|
838 |
+
# convert to string
|
839 |
+
tokenid = " ".join([str(idx) for idx in tokenid_as_list])
|
840 |
+
token = " ".join(token_as_list)
|
841 |
+
text = "".join(token_as_list).replace("<space>", " ")
|
842 |
+
|
843 |
+
return text, token, tokenid, score
|
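Illustrative sketch (not part of the committed file): a toy example of what parse_hypothesis produces; the hypothesis dict and character list below are invented for illustration:

char_list = ["<blank>", "<space>", "a", "b", "c", "<eos>"]   # hypothetical vocabulary
hyp = {"yseq": [5, 2, 1, 3], "score": -1.23}                 # yseq[0] is the sos id and is dropped

tokenid_as_list = list(map(int, hyp["yseq"][1:]))
token_as_list = [char_list[idx] for idx in tokenid_as_list]
text = "".join(token_as_list).replace("<space>", " ")
print(text)                                        # "a b"
print(" ".join(str(i) for i in tokenid_as_list))   # "2 1 3"
print(float(hyp["score"]))                         # -1.23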
844 |
+
|
845 |
+
|
846 |
+
def add_results_to_json(js, nbest_hyps, char_list):
|
847 |
+
"""Add N-best results to json.
|
848 |
+
|
849 |
+
Args:
|
850 |
+
js (dict[str, Any]): Groundtruth utterance dict.
|
851 |
+
nbest_hyps (list[dict[str, Any]]):
|
852 |
+
List of N-best recognition hypotheses.
|
853 |
+
char_list (list[str]): List of characters.
|
854 |
+
|
855 |
+
Returns:
|
856 |
+
dict[str, Any]: N-best results added utterance dict.
|
857 |
+
|
858 |
+
"""
|
859 |
+
# copy old json info
|
860 |
+
new_js = dict()
|
861 |
+
new_js["utt2spk"] = js["utt2spk"]
|
862 |
+
new_js["output"] = []
|
863 |
+
|
864 |
+
for n, hyp in enumerate(nbest_hyps, 1):
|
865 |
+
# parse hypothesis
|
866 |
+
rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp, char_list)
|
867 |
+
|
868 |
+
# copy ground-truth
|
869 |
+
if len(js["output"]) > 0:
|
870 |
+
out_dic = dict(js["output"][0].items())
|
871 |
+
else:
|
872 |
+
# for no reference case (e.g., speech translation)
|
873 |
+
out_dic = {"name": ""}
|
874 |
+
|
875 |
+
# update name
|
876 |
+
out_dic["name"] += "[%d]" % n
|
877 |
+
|
878 |
+
# add recognition results
|
879 |
+
out_dic["rec_text"] = rec_text
|
880 |
+
out_dic["rec_token"] = rec_token
|
881 |
+
out_dic["rec_tokenid"] = rec_tokenid
|
882 |
+
out_dic["score"] = score
|
883 |
+
|
884 |
+
# add to list of N-best result dicts
|
885 |
+
new_js["output"].append(out_dic)
|
886 |
+
|
887 |
+
# show 1-best result
|
888 |
+
if n == 1:
|
889 |
+
if "text" in out_dic.keys():
|
890 |
+
logging.info("groundtruth: %s" % out_dic["text"])
|
891 |
+
logging.info("prediction : %s" % out_dic["rec_text"])
|
892 |
+
|
893 |
+
return new_js
|
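Illustrative sketch (not part of the committed file, values invented): each output entry copies the ground-truth fields from js["output"][0] and adds the rec_* fields per hypothesis, so a 1-best result written by the recognition script looks roughly like this:

import json

new_js = {
    "utt_0001": {                                  # hypothetical utterance id
        "utt2spk": "spk1",
        "output": [
            {
                "name": "target1[1]",              # "[1]" appended for the 1-best hypothesis
                "text": "HELLO WORLD",             # ground truth copied from the input json
                "rec_text": "hello world",
                "rec_token": "h e l l o <space> w o r l d",
                "rec_tokenid": "8 5 12 12 15 1 23 15 18 12 4",
                "score": -2.34,
            }
        ],
    }
}
print(json.dumps({"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True))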
894 |
+
|
895 |
+
|
896 |
+
def plot_spectrogram(
|
897 |
+
plt,
|
898 |
+
spec,
|
899 |
+
mode="db",
|
900 |
+
fs=None,
|
901 |
+
frame_shift=None,
|
902 |
+
bottom=True,
|
903 |
+
left=True,
|
904 |
+
right=True,
|
905 |
+
top=False,
|
906 |
+
labelbottom=True,
|
907 |
+
labelleft=True,
|
908 |
+
labelright=True,
|
909 |
+
labeltop=False,
|
910 |
+
cmap="inferno",
|
911 |
+
):
|
912 |
+
"""Plot spectrogram using matplotlib.
|
913 |
+
|
914 |
+
Args:
|
915 |
+
plt (matplotlib.pyplot): pyplot object.
|
916 |
+
spec (numpy.ndarray): Input stft (Freq, Time)
|
917 |
+
mode (str): db or linear.
|
918 |
+
fs (int): Sample frequency. To convert y-axis to kHz unit.
|
919 |
+
frame_shift (int): The frame shift of stft. To convert x-axis to second unit.
|
920 |
+
bottom (bool): Whether to draw the respective ticks.
|
921 |
+
left (bool):
|
922 |
+
right (bool):
|
923 |
+
top (bool):
|
924 |
+
labelbottom (bool): Whether to draw the respective tick labels.
|
925 |
+
labelleft (bool):
|
926 |
+
labelright (bool):
|
927 |
+
labeltop (bool):
|
928 |
+
cmap (str): Colormap defined in matplotlib.
|
929 |
+
|
930 |
+
"""
|
931 |
+
spec = np.abs(spec)
|
932 |
+
if mode == "db":
|
933 |
+
x = 20 * np.log10(spec + np.finfo(spec.dtype).eps)
|
934 |
+
elif mode == "linear":
|
935 |
+
x = spec
|
936 |
+
else:
|
937 |
+
raise ValueError(mode)
|
938 |
+
|
939 |
+
if fs is not None:
|
940 |
+
ytop = fs / 2000
|
941 |
+
ylabel = "kHz"
|
942 |
+
else:
|
943 |
+
ytop = x.shape[0]
|
944 |
+
ylabel = "bin"
|
945 |
+
|
946 |
+
if frame_shift is not None and fs is not None:
|
947 |
+
xtop = x.shape[1] * frame_shift / fs
|
948 |
+
xlabel = "s"
|
949 |
+
else:
|
950 |
+
xtop = x.shape[1]
|
951 |
+
xlabel = "frame"
|
952 |
+
|
953 |
+
extent = (0, xtop, 0, ytop)
|
954 |
+
plt.imshow(x[::-1], cmap=cmap, extent=extent)
|
955 |
+
|
956 |
+
if labelbottom:
|
957 |
+
plt.xlabel("time [{}]".format(xlabel))
|
958 |
+
if labelleft:
|
959 |
+
plt.ylabel("freq [{}]".format(ylabel))
|
960 |
+
plt.colorbar().set_label("{}".format(mode))
|
961 |
+
|
962 |
+
plt.tick_params(
|
963 |
+
bottom=bottom,
|
964 |
+
left=left,
|
965 |
+
right=right,
|
966 |
+
top=top,
|
967 |
+
labelbottom=labelbottom,
|
968 |
+
labelleft=labelleft,
|
969 |
+
labelright=labelright,
|
970 |
+
labeltop=labeltop,
|
971 |
+
)
|
972 |
+
plt.axis("auto")
|
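Illustrative sketch (not part of the committed file): a minimal driver for the helper above, with random data standing in for an STFT and hypothetical sampling parameters:

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np

from espnet.asr.asr_utils import plot_spectrogram

spec = np.abs(np.random.randn(257, 200)).astype(np.float32)  # (freq bins, frames)
plot_spectrogram(plt, spec, mode="db", fs=16000, frame_shift=160)
plt.savefig("spectrogram_example.png")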
973 |
+
|
974 |
+
|
975 |
+
# * ------------------ recognition related ------------------ *
|
976 |
+
def format_mulenc_args(args):
|
977 |
+
"""Format args for multi-encoder setup.
|
978 |
+
|
979 |
+
It deals with the following situations (when args.num_encs=2):
|
980 |
+
1. args.elayers = None -> args.elayers = [4, 4];
|
981 |
+
2. args.elayers = 4 -> args.elayers = [4, 4];
|
982 |
+
3. args.elayers = [4, 4, 4] -> args.elayers = [4, 4].
|
983 |
+
|
984 |
+
"""
|
985 |
+
# default values when None is assigned.
|
986 |
+
default_dict = {
|
987 |
+
"etype": "blstmp",
|
988 |
+
"elayers": 4,
|
989 |
+
"eunits": 300,
|
990 |
+
"subsample": "1",
|
991 |
+
"dropout_rate": 0.0,
|
992 |
+
"atype": "dot",
|
993 |
+
"adim": 320,
|
994 |
+
"awin": 5,
|
995 |
+
"aheads": 4,
|
996 |
+
"aconv_chans": -1,
|
997 |
+
"aconv_filts": 100,
|
998 |
+
}
|
999 |
+
for k in default_dict.keys():
|
1000 |
+
if isinstance(vars(args)[k], list):
|
1001 |
+
if len(vars(args)[k]) != args.num_encs:
|
1002 |
+
logging.warning(
|
1003 |
+
"Length mismatch {}: Convert {} to {}.".format(
|
1004 |
+
k, vars(args)[k], vars(args)[k][: args.num_encs]
|
1005 |
+
)
|
1006 |
+
)
|
1007 |
+
vars(args)[k] = vars(args)[k][: args.num_encs]
|
1008 |
+
else:
|
1009 |
+
if not vars(args)[k]:
|
1010 |
+
# assign default value if it is None
|
1011 |
+
vars(args)[k] = default_dict[k]
|
1012 |
+
logging.warning(
|
1013 |
+
"{} is not specified, use default value {}.".format(
|
1014 |
+
k, default_dict[k]
|
1015 |
+
)
|
1016 |
+
)
|
1017 |
+
# duplicate
|
1018 |
+
logging.warning(
|
1019 |
+
"Type mismatch {}: Convert {} to {}.".format(
|
1020 |
+
k, vars(args)[k], [vars(args)[k] for _ in range(args.num_encs)]
|
1021 |
+
)
|
1022 |
+
)
|
1023 |
+
vars(args)[k] = [vars(args)[k] for _ in range(args.num_encs)]
|
1024 |
+
return args
|
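Illustrative sketch (not part of the committed file): the duplication behaviour for args.num_encs=2, using a toy Namespace that carries only the fields listed in default_dict above (a real argparse namespace has many more):

import argparse

from espnet.asr.asr_utils import format_mulenc_args

args = argparse.Namespace(
    num_encs=2,
    etype="blstmp", elayers=4, eunits=300, subsample="1", dropout_rate=0.0,
    atype="dot", adim=320, awin=5, aheads=4, aconv_chans=-1, aconv_filts=100,
)
args = format_mulenc_args(args)
print(args.elayers)  # [4, 4] - scalar duplicated once per encoder
print(args.etype)    # ['blstmp', 'blstmp']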
espnet/asr/chainer_backend/__init__.py
ADDED
@@ -0,0 +1 @@
|
1 |
+
"""Initialize sub package."""
|
espnet/asr/chainer_backend/asr.py
ADDED
@@ -0,0 +1,575 @@
|
1 |
+
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
|
2 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
3 |
+
|
4 |
+
"""Training/decoding definition for the speech recognition task."""
|
5 |
+
|
6 |
+
import json
|
7 |
+
import logging
|
8 |
+
import os
|
9 |
+
import six
|
10 |
+
|
11 |
+
# chainer related
|
12 |
+
import chainer
|
13 |
+
|
14 |
+
from chainer import training
|
15 |
+
|
16 |
+
from chainer.datasets import TransformDataset
|
17 |
+
from chainer.training import extensions
|
18 |
+
|
19 |
+
# espnet related
|
20 |
+
from espnet.asr.asr_utils import adadelta_eps_decay
|
21 |
+
from espnet.asr.asr_utils import add_results_to_json
|
22 |
+
from espnet.asr.asr_utils import chainer_load
|
23 |
+
from espnet.asr.asr_utils import CompareValueTrigger
|
24 |
+
from espnet.asr.asr_utils import get_model_conf
|
25 |
+
from espnet.asr.asr_utils import restore_snapshot
|
26 |
+
from espnet.nets.asr_interface import ASRInterface
|
27 |
+
from espnet.utils.deterministic_utils import set_deterministic_chainer
|
28 |
+
from espnet.utils.dynamic_import import dynamic_import
|
29 |
+
from espnet.utils.io_utils import LoadInputsAndTargets
|
30 |
+
from espnet.utils.training.batchfy import make_batchset
|
31 |
+
from espnet.utils.training.evaluator import BaseEvaluator
|
32 |
+
from espnet.utils.training.iterators import ShufflingEnabler
|
33 |
+
from espnet.utils.training.iterators import ToggleableShufflingMultiprocessIterator
|
34 |
+
from espnet.utils.training.iterators import ToggleableShufflingSerialIterator
|
35 |
+
from espnet.utils.training.train_utils import check_early_stop
|
36 |
+
from espnet.utils.training.train_utils import set_early_stop
|
37 |
+
|
38 |
+
# rnnlm
|
39 |
+
import espnet.lm.chainer_backend.extlm as extlm_chainer
|
40 |
+
import espnet.lm.chainer_backend.lm as lm_chainer
|
41 |
+
|
42 |
+
# numpy related
|
43 |
+
import matplotlib
|
44 |
+
|
45 |
+
from espnet.utils.training.tensorboard_logger import TensorboardLogger
|
46 |
+
from tensorboardX import SummaryWriter
|
47 |
+
|
48 |
+
matplotlib.use("Agg")
|
49 |
+
|
50 |
+
|
51 |
+
def train(args):
|
52 |
+
"""Train with the given args.
|
53 |
+
|
54 |
+
Args:
|
55 |
+
args (namespace): The program arguments.
|
56 |
+
|
57 |
+
"""
|
58 |
+
# display chainer version
|
59 |
+
logging.info("chainer version = " + chainer.__version__)
|
60 |
+
|
61 |
+
set_deterministic_chainer(args)
|
62 |
+
|
63 |
+
# check cuda and cudnn availability
|
64 |
+
if not chainer.cuda.available:
|
65 |
+
logging.warning("cuda is not available")
|
66 |
+
if not chainer.cuda.cudnn_enabled:
|
67 |
+
logging.warning("cudnn is not available")
|
68 |
+
|
69 |
+
# get input and output dimension info
|
70 |
+
with open(args.valid_json, "rb") as f:
|
71 |
+
valid_json = json.load(f)["utts"]
|
72 |
+
utts = list(valid_json.keys())
|
73 |
+
idim = int(valid_json[utts[0]]["input"][0]["shape"][1])
|
74 |
+
odim = int(valid_json[utts[0]]["output"][0]["shape"][1])
|
75 |
+
logging.info("#input dims : " + str(idim))
|
76 |
+
logging.info("#output dims: " + str(odim))
|
77 |
+
|
78 |
+
# specify attention, CTC, hybrid mode
|
79 |
+
if args.mtlalpha == 1.0:
|
80 |
+
mtl_mode = "ctc"
|
81 |
+
logging.info("Pure CTC mode")
|
82 |
+
elif args.mtlalpha == 0.0:
|
83 |
+
mtl_mode = "att"
|
84 |
+
logging.info("Pure attention mode")
|
85 |
+
else:
|
86 |
+
mtl_mode = "mtl"
|
87 |
+
logging.info("Multitask learning mode")
|
88 |
+
|
89 |
+
# specify model architecture
|
90 |
+
logging.info("import model module: " + args.model_module)
|
91 |
+
model_class = dynamic_import(args.model_module)
|
92 |
+
model = model_class(idim, odim, args, flag_return=False)
|
93 |
+
assert isinstance(model, ASRInterface)
|
94 |
+
total_subsampling_factor = model.get_total_subsampling_factor()
|
95 |
+
|
96 |
+
# write model config
|
97 |
+
if not os.path.exists(args.outdir):
|
98 |
+
os.makedirs(args.outdir)
|
99 |
+
model_conf = args.outdir + "/model.json"
|
100 |
+
with open(model_conf, "wb") as f:
|
101 |
+
logging.info("writing a model config file to " + model_conf)
|
102 |
+
f.write(
|
103 |
+
json.dumps(
|
104 |
+
(idim, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True
|
105 |
+
).encode("utf_8")
|
106 |
+
)
|
107 |
+
for key in sorted(vars(args).keys()):
|
108 |
+
logging.info("ARGS: " + key + ": " + str(vars(args)[key]))
|
109 |
+
|
110 |
+
# Set gpu
|
111 |
+
ngpu = args.ngpu
|
112 |
+
if ngpu == 1:
|
113 |
+
gpu_id = 0
|
114 |
+
# Make a specified GPU current
|
115 |
+
chainer.cuda.get_device_from_id(gpu_id).use()
|
116 |
+
model.to_gpu() # Copy the model to the GPU
|
117 |
+
logging.info("single gpu calculation.")
|
118 |
+
elif ngpu > 1:
|
119 |
+
gpu_id = 0
|
120 |
+
devices = {"main": gpu_id}
|
121 |
+
for gid in six.moves.xrange(1, ngpu):
|
122 |
+
devices["sub_%d" % gid] = gid
|
123 |
+
logging.info("multi gpu calculation (#gpus = %d)." % ngpu)
|
124 |
+
logging.warning(
|
125 |
+
"batch size is automatically increased (%d -> %d)"
|
126 |
+
% (args.batch_size, args.batch_size * args.ngpu)
|
127 |
+
)
|
128 |
+
else:
|
129 |
+
gpu_id = -1
|
130 |
+
logging.info("cpu calculation")
|
131 |
+
|
132 |
+
# Setup an optimizer
|
133 |
+
if args.opt == "adadelta":
|
134 |
+
optimizer = chainer.optimizers.AdaDelta(eps=args.eps)
|
135 |
+
elif args.opt == "adam":
|
136 |
+
optimizer = chainer.optimizers.Adam()
|
137 |
+
elif args.opt == "noam":
|
138 |
+
optimizer = chainer.optimizers.Adam(alpha=0, beta1=0.9, beta2=0.98, eps=1e-9)
|
139 |
+
else:
|
140 |
+
raise NotImplementedError("args.opt={}".format(args.opt))
|
141 |
+
|
142 |
+
optimizer.setup(model)
|
143 |
+
optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))
|
144 |
+
|
145 |
+
# Setup a converter
|
146 |
+
converter = model.custom_converter(subsampling_factor=model.subsample[0])
|
147 |
+
|
148 |
+
# read json data
|
149 |
+
with open(args.train_json, "rb") as f:
|
150 |
+
train_json = json.load(f)["utts"]
|
151 |
+
with open(args.valid_json, "rb") as f:
|
152 |
+
valid_json = json.load(f)["utts"]
|
153 |
+
|
154 |
+
# set up training iterator and updater
|
155 |
+
load_tr = LoadInputsAndTargets(
|
156 |
+
mode="asr",
|
157 |
+
load_output=True,
|
158 |
+
preprocess_conf=args.preprocess_conf,
|
159 |
+
preprocess_args={"train": True}, # Switch the mode of preprocessing
|
160 |
+
)
|
161 |
+
load_cv = LoadInputsAndTargets(
|
162 |
+
mode="asr",
|
163 |
+
load_output=True,
|
164 |
+
preprocess_conf=args.preprocess_conf,
|
165 |
+
preprocess_args={"train": False}, # Switch the mode of preprocessing
|
166 |
+
)
|
167 |
+
|
168 |
+
use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
|
169 |
+
accum_grad = args.accum_grad
|
170 |
+
if ngpu <= 1:
|
171 |
+
# make minibatch list (variable length)
|
172 |
+
train = make_batchset(
|
173 |
+
train_json,
|
174 |
+
args.batch_size,
|
175 |
+
args.maxlen_in,
|
176 |
+
args.maxlen_out,
|
177 |
+
args.minibatches,
|
178 |
+
min_batch_size=args.ngpu if args.ngpu > 1 else 1,
|
179 |
+
shortest_first=use_sortagrad,
|
180 |
+
count=args.batch_count,
|
181 |
+
batch_bins=args.batch_bins,
|
182 |
+
batch_frames_in=args.batch_frames_in,
|
183 |
+
batch_frames_out=args.batch_frames_out,
|
184 |
+
batch_frames_inout=args.batch_frames_inout,
|
185 |
+
iaxis=0,
|
186 |
+
oaxis=0,
|
187 |
+
)
|
188 |
+
# hack to set the batchsize argument to 1
|
189 |
+
# actual batchsize is included in a list
|
190 |
+
if args.n_iter_processes > 0:
|
191 |
+
train_iters = [
|
192 |
+
ToggleableShufflingMultiprocessIterator(
|
193 |
+
TransformDataset(train, load_tr),
|
194 |
+
batch_size=1,
|
195 |
+
n_processes=args.n_iter_processes,
|
196 |
+
n_prefetch=8,
|
197 |
+
maxtasksperchild=20,
|
198 |
+
shuffle=not use_sortagrad,
|
199 |
+
)
|
200 |
+
]
|
201 |
+
else:
|
202 |
+
train_iters = [
|
203 |
+
ToggleableShufflingSerialIterator(
|
204 |
+
TransformDataset(train, load_tr),
|
205 |
+
batch_size=1,
|
206 |
+
shuffle=not use_sortagrad,
|
207 |
+
)
|
208 |
+
]
|
209 |
+
|
210 |
+
# set up updater
|
211 |
+
updater = model.custom_updater(
|
212 |
+
train_iters[0],
|
213 |
+
optimizer,
|
214 |
+
converter=converter,
|
215 |
+
device=gpu_id,
|
216 |
+
accum_grad=accum_grad,
|
217 |
+
)
|
218 |
+
else:
|
219 |
+
if args.batch_count not in ("auto", "seq") and args.batch_size == 0:
|
220 |
+
raise NotImplementedError(
|
221 |
+
"--batch-count 'bin' and 'frame' are not implemented "
|
222 |
+
"in chainer multi gpu"
|
223 |
+
)
|
224 |
+
# set up minibatches
|
225 |
+
train_subsets = []
|
226 |
+
for gid in six.moves.xrange(ngpu):
|
227 |
+
# make subset
|
228 |
+
train_json_subset = {
|
229 |
+
k: v for i, (k, v) in enumerate(train_json.items()) if i % ngpu == gid
|
230 |
+
}
|
231 |
+
# make minibatch list (variable length)
|
232 |
+
train_subsets += [
|
233 |
+
make_batchset(
|
234 |
+
train_json_subset,
|
235 |
+
args.batch_size,
|
236 |
+
args.maxlen_in,
|
237 |
+
args.maxlen_out,
|
238 |
+
args.minibatches,
|
239 |
+
)
|
240 |
+
]
|
241 |
+
|
242 |
+
# each subset must have same length for MultiprocessParallelUpdater
|
243 |
+
maxlen = max([len(train_subset) for train_subset in train_subsets])
|
244 |
+
for train_subset in train_subsets:
|
245 |
+
if maxlen != len(train_subset):
|
246 |
+
for i in six.moves.xrange(maxlen - len(train_subset)):
|
247 |
+
train_subset += [train_subset[i]]
|
248 |
+
|
249 |
+
# hack to set the batchsize argument to 1
|
250 |
+
# actual batchsize is included in a list
|
251 |
+
if args.n_iter_processes > 0:
|
252 |
+
train_iters = [
|
253 |
+
ToggleableShufflingMultiprocessIterator(
|
254 |
+
TransformDataset(train_subsets[gid], load_tr),
|
255 |
+
batch_size=1,
|
256 |
+
n_processes=args.n_iter_processes,
|
257 |
+
n_prefetch=8,
|
258 |
+
maxtasksperchild=20,
|
259 |
+
shuffle=not use_sortagrad,
|
260 |
+
)
|
261 |
+
for gid in six.moves.xrange(ngpu)
|
262 |
+
]
|
263 |
+
else:
|
264 |
+
train_iters = [
|
265 |
+
ToggleableShufflingSerialIterator(
|
266 |
+
TransformDataset(train_subsets[gid], load_tr),
|
267 |
+
batch_size=1,
|
268 |
+
shuffle=not use_sortagrad,
|
269 |
+
)
|
270 |
+
for gid in six.moves.xrange(ngpu)
|
271 |
+
]
|
272 |
+
|
273 |
+
# set up updater
|
274 |
+
updater = model.custom_parallel_updater(
|
275 |
+
train_iters, optimizer, converter=converter, devices=devices
|
276 |
+
)
|
277 |
+
|
278 |
+
# Set up a trainer
|
279 |
+
trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir)
|
280 |
+
|
281 |
+
if use_sortagrad:
|
282 |
+
trainer.extend(
|
283 |
+
ShufflingEnabler(train_iters),
|
284 |
+
trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"),
|
285 |
+
)
|
286 |
+
if args.opt == "noam":
|
287 |
+
from espnet.nets.chainer_backend.transformer.training import VaswaniRule
|
288 |
+
|
289 |
+
trainer.extend(
|
290 |
+
VaswaniRule(
|
291 |
+
"alpha",
|
292 |
+
d=args.adim,
|
293 |
+
warmup_steps=args.transformer_warmup_steps,
|
294 |
+
scale=args.transformer_lr,
|
295 |
+
),
|
296 |
+
trigger=(1, "iteration"),
|
297 |
+
)
|
298 |
+
# Resume from a snapshot
|
299 |
+
if args.resume:
|
300 |
+
chainer.serializers.load_npz(args.resume, trainer)
|
301 |
+
|
302 |
+
# set up validation iterator
|
303 |
+
valid = make_batchset(
|
304 |
+
valid_json,
|
305 |
+
args.batch_size,
|
306 |
+
args.maxlen_in,
|
307 |
+
args.maxlen_out,
|
308 |
+
args.minibatches,
|
309 |
+
min_batch_size=args.ngpu if args.ngpu > 1 else 1,
|
310 |
+
count=args.batch_count,
|
311 |
+
batch_bins=args.batch_bins,
|
312 |
+
batch_frames_in=args.batch_frames_in,
|
313 |
+
batch_frames_out=args.batch_frames_out,
|
314 |
+
batch_frames_inout=args.batch_frames_inout,
|
315 |
+
iaxis=0,
|
316 |
+
oaxis=0,
|
317 |
+
)
|
318 |
+
|
319 |
+
if args.n_iter_processes > 0:
|
320 |
+
valid_iter = chainer.iterators.MultiprocessIterator(
|
321 |
+
TransformDataset(valid, load_cv),
|
322 |
+
batch_size=1,
|
323 |
+
repeat=False,
|
324 |
+
shuffle=False,
|
325 |
+
n_processes=args.n_iter_processes,
|
326 |
+
n_prefetch=8,
|
327 |
+
maxtasksperchild=20,
|
328 |
+
)
|
329 |
+
else:
|
330 |
+
valid_iter = chainer.iterators.SerialIterator(
|
331 |
+
TransformDataset(valid, load_cv), batch_size=1, repeat=False, shuffle=False
|
332 |
+
)
|
333 |
+
|
334 |
+
# Evaluate the model with the test dataset for each epoch
|
335 |
+
trainer.extend(BaseEvaluator(valid_iter, model, converter=converter, device=gpu_id))
|
336 |
+
|
337 |
+
# Save attention weight each epoch
|
338 |
+
if args.num_save_attention > 0 and args.mtlalpha != 1.0:
|
339 |
+
data = sorted(
|
340 |
+
list(valid_json.items())[: args.num_save_attention],
|
341 |
+
key=lambda x: int(x[1]["input"][0]["shape"][1]),
|
342 |
+
reverse=True,
|
343 |
+
)
|
344 |
+
if hasattr(model, "module"):
|
345 |
+
att_vis_fn = model.module.calculate_all_attentions
|
346 |
+
plot_class = model.module.attention_plot_class
|
347 |
+
else:
|
348 |
+
att_vis_fn = model.calculate_all_attentions
|
349 |
+
plot_class = model.attention_plot_class
|
350 |
+
logging.info("Using custom PlotAttentionReport")
|
351 |
+
att_reporter = plot_class(
|
352 |
+
att_vis_fn,
|
353 |
+
data,
|
354 |
+
args.outdir + "/att_ws",
|
355 |
+
converter=converter,
|
356 |
+
transform=load_cv,
|
357 |
+
device=gpu_id,
|
358 |
+
subsampling_factor=total_subsampling_factor,
|
359 |
+
)
|
360 |
+
trainer.extend(att_reporter, trigger=(1, "epoch"))
|
361 |
+
else:
|
362 |
+
att_reporter = None
|
363 |
+
|
364 |
+
# Take a snapshot for each specified epoch
|
365 |
+
trainer.extend(
|
366 |
+
extensions.snapshot(filename="snapshot.ep.{.updater.epoch}"),
|
367 |
+
trigger=(1, "epoch"),
|
368 |
+
)
|
369 |
+
|
370 |
+
# Make a plot for training and validation values
|
371 |
+
trainer.extend(
|
372 |
+
extensions.PlotReport(
|
373 |
+
[
|
374 |
+
"main/loss",
|
375 |
+
"validation/main/loss",
|
376 |
+
"main/loss_ctc",
|
377 |
+
"validation/main/loss_ctc",
|
378 |
+
"main/loss_att",
|
379 |
+
"validation/main/loss_att",
|
380 |
+
],
|
381 |
+
"epoch",
|
382 |
+
file_name="loss.png",
|
383 |
+
)
|
384 |
+
)
|
385 |
+
trainer.extend(
|
386 |
+
extensions.PlotReport(
|
387 |
+
["main/acc", "validation/main/acc"], "epoch", file_name="acc.png"
|
388 |
+
)
|
389 |
+
)
|
390 |
+
|
391 |
+
# Save best models
|
392 |
+
trainer.extend(
|
393 |
+
extensions.snapshot_object(model, "model.loss.best"),
|
394 |
+
trigger=training.triggers.MinValueTrigger("validation/main/loss"),
|
395 |
+
)
|
396 |
+
if mtl_mode != "ctc":
|
397 |
+
trainer.extend(
|
398 |
+
extensions.snapshot_object(model, "model.acc.best"),
|
399 |
+
trigger=training.triggers.MaxValueTrigger("validation/main/acc"),
|
400 |
+
)
|
401 |
+
|
402 |
+
# epsilon decay in the optimizer
|
403 |
+
if args.opt == "adadelta":
|
404 |
+
if args.criterion == "acc" and mtl_mode != "ctc":
|
405 |
+
trainer.extend(
|
406 |
+
restore_snapshot(model, args.outdir + "/model.acc.best"),
|
407 |
+
trigger=CompareValueTrigger(
|
408 |
+
"validation/main/acc",
|
409 |
+
lambda best_value, current_value: best_value > current_value,
|
410 |
+
),
|
411 |
+
)
|
412 |
+
trainer.extend(
|
413 |
+
adadelta_eps_decay(args.eps_decay),
|
414 |
+
trigger=CompareValueTrigger(
|
415 |
+
"validation/main/acc",
|
416 |
+
lambda best_value, current_value: best_value > current_value,
|
417 |
+
),
|
418 |
+
)
|
419 |
+
elif args.criterion == "loss":
|
420 |
+
trainer.extend(
|
421 |
+
restore_snapshot(model, args.outdir + "/model.loss.best"),
|
422 |
+
trigger=CompareValueTrigger(
|
423 |
+
"validation/main/loss",
|
424 |
+
lambda best_value, current_value: best_value < current_value,
|
425 |
+
),
|
426 |
+
)
|
427 |
+
trainer.extend(
|
428 |
+
adadelta_eps_decay(args.eps_decay),
|
429 |
+
trigger=CompareValueTrigger(
|
430 |
+
"validation/main/loss",
|
431 |
+
lambda best_value, current_value: best_value < current_value,
|
432 |
+
),
|
433 |
+
)
|
434 |
+
|
435 |
+
# Write a log of evaluation statistics for each epoch
|
436 |
+
trainer.extend(
|
437 |
+
extensions.LogReport(trigger=(args.report_interval_iters, "iteration"))
|
438 |
+
)
|
439 |
+
report_keys = [
|
440 |
+
"epoch",
|
441 |
+
"iteration",
|
442 |
+
"main/loss",
|
443 |
+
"main/loss_ctc",
|
444 |
+
"main/loss_att",
|
445 |
+
"validation/main/loss",
|
446 |
+
"validation/main/loss_ctc",
|
447 |
+
"validation/main/loss_att",
|
448 |
+
"main/acc",
|
449 |
+
"validation/main/acc",
|
450 |
+
"elapsed_time",
|
451 |
+
]
|
452 |
+
if args.opt == "adadelta":
|
453 |
+
trainer.extend(
|
454 |
+
extensions.observe_value(
|
455 |
+
"eps", lambda trainer: trainer.updater.get_optimizer("main").eps
|
456 |
+
),
|
457 |
+
trigger=(args.report_interval_iters, "iteration"),
|
458 |
+
)
|
459 |
+
report_keys.append("eps")
|
460 |
+
trainer.extend(
|
461 |
+
extensions.PrintReport(report_keys),
|
462 |
+
trigger=(args.report_interval_iters, "iteration"),
|
463 |
+
)
|
464 |
+
|
465 |
+
trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
|
466 |
+
|
467 |
+
set_early_stop(trainer, args)
|
468 |
+
if args.tensorboard_dir is not None and args.tensorboard_dir != "":
|
469 |
+
writer = SummaryWriter(args.tensorboard_dir)
|
470 |
+
trainer.extend(
|
471 |
+
TensorboardLogger(writer, att_reporter),
|
472 |
+
trigger=(args.report_interval_iters, "iteration"),
|
473 |
+
)
|
474 |
+
|
475 |
+
# Run the training
|
476 |
+
trainer.run()
|
477 |
+
check_early_stop(trainer, args.epochs)
|
478 |
+
|
479 |
+
|
480 |
+
def recog(args):
|
481 |
+
"""Decode with the given args.
|
482 |
+
|
483 |
+
Args:
|
484 |
+
args (namespace): The program arguments.
|
485 |
+
|
486 |
+
"""
|
487 |
+
# display chainer version
|
488 |
+
logging.info("chainer version = " + chainer.__version__)
|
489 |
+
|
490 |
+
set_deterministic_chainer(args)
|
491 |
+
|
492 |
+
# read training config
|
493 |
+
idim, odim, train_args = get_model_conf(args.model, args.model_conf)
|
494 |
+
|
495 |
+
for key in sorted(vars(args).keys()):
|
496 |
+
logging.info("ARGS: " + key + ": " + str(vars(args)[key]))
|
497 |
+
|
498 |
+
# specify model architecture
|
499 |
+
logging.info("reading model parameters from " + args.model)
|
500 |
+
# To be compatible with v.0.3.0 models
|
501 |
+
if hasattr(train_args, "model_module"):
|
502 |
+
model_module = train_args.model_module
|
503 |
+
else:
|
504 |
+
model_module = "espnet.nets.chainer_backend.e2e_asr:E2E"
|
505 |
+
model_class = dynamic_import(model_module)
|
506 |
+
model = model_class(idim, odim, train_args)
|
507 |
+
assert isinstance(model, ASRInterface)
|
508 |
+
chainer_load(args.model, model)
|
509 |
+
|
510 |
+
# read rnnlm
|
511 |
+
if args.rnnlm:
|
512 |
+
rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
|
513 |
+
rnnlm = lm_chainer.ClassifierWithState(
|
514 |
+
lm_chainer.RNNLM(
|
515 |
+
len(train_args.char_list), rnnlm_args.layer, rnnlm_args.unit
|
516 |
+
)
|
517 |
+
)
|
518 |
+
chainer_load(args.rnnlm, rnnlm)
|
519 |
+
else:
|
520 |
+
rnnlm = None
|
521 |
+
|
522 |
+
if args.word_rnnlm:
|
523 |
+
rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf)
|
524 |
+
word_dict = rnnlm_args.char_list_dict
|
525 |
+
char_dict = {x: i for i, x in enumerate(train_args.char_list)}
|
526 |
+
word_rnnlm = lm_chainer.ClassifierWithState(
|
527 |
+
lm_chainer.RNNLM(len(word_dict), rnnlm_args.layer, rnnlm_args.unit)
|
528 |
+
)
|
529 |
+
chainer_load(args.word_rnnlm, word_rnnlm)
|
530 |
+
|
531 |
+
if rnnlm is not None:
|
532 |
+
rnnlm = lm_chainer.ClassifierWithState(
|
533 |
+
extlm_chainer.MultiLevelLM(
|
534 |
+
word_rnnlm.predictor, rnnlm.predictor, word_dict, char_dict
|
535 |
+
)
|
536 |
+
)
|
537 |
+
else:
|
538 |
+
rnnlm = lm_chainer.ClassifierWithState(
|
539 |
+
extlm_chainer.LookAheadWordLM(
|
540 |
+
word_rnnlm.predictor, word_dict, char_dict
|
541 |
+
)
|
542 |
+
)
|
543 |
+
|
544 |
+
# read json data
|
545 |
+
with open(args.recog_json, "rb") as f:
|
546 |
+
js = json.load(f)["utts"]
|
547 |
+
|
548 |
+
load_inputs_and_targets = LoadInputsAndTargets(
|
549 |
+
mode="asr",
|
550 |
+
load_output=False,
|
551 |
+
sort_in_input_length=False,
|
552 |
+
preprocess_conf=train_args.preprocess_conf
|
553 |
+
if args.preprocess_conf is None
|
554 |
+
else args.preprocess_conf,
|
555 |
+
preprocess_args={"train": False}, # Switch the mode of preprocessing
|
556 |
+
)
|
557 |
+
|
558 |
+
# decode each utterance
|
559 |
+
new_js = {}
|
560 |
+
with chainer.no_backprop_mode():
|
561 |
+
for idx, name in enumerate(js.keys(), 1):
|
562 |
+
logging.info("(%d/%d) decoding " + name, idx, len(js.keys()))
|
563 |
+
batch = [(name, js[name])]
|
564 |
+
feat = load_inputs_and_targets(batch)[0][0]
|
565 |
+
nbest_hyps = model.recognize(feat, args, train_args.char_list, rnnlm)
|
566 |
+
new_js[name] = add_results_to_json(
|
567 |
+
js[name], nbest_hyps, train_args.char_list
|
568 |
+
)
|
569 |
+
|
570 |
+
with open(args.result_label, "wb") as f:
|
571 |
+
f.write(
|
572 |
+
json.dumps(
|
573 |
+
{"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
|
574 |
+
).encode("utf_8")
|
575 |
+
)
|
espnet/asr/pytorch_backend/__init__.py
ADDED
@@ -0,0 +1 @@
|
1 |
+
"""Initialize sub package."""
|
espnet/asr/pytorch_backend/asr.py
ADDED
@@ -0,0 +1,1500 @@
|
1 |
+
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
|
2 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
3 |
+
|
4 |
+
"""Training/decoding definition for the speech recognition task."""
|
5 |
+
|
6 |
+
import copy
|
7 |
+
import json
|
8 |
+
import logging
|
9 |
+
import math
|
10 |
+
import os
|
11 |
+
import sys
|
12 |
+
|
13 |
+
from chainer import reporter as reporter_module
|
14 |
+
from chainer import training
|
15 |
+
from chainer.training import extensions
|
16 |
+
from chainer.training.updater import StandardUpdater
|
17 |
+
import numpy as np
|
18 |
+
from tensorboardX import SummaryWriter
|
19 |
+
import torch
|
20 |
+
from torch.nn.parallel import data_parallel
|
21 |
+
|
22 |
+
from espnet.asr.asr_utils import adadelta_eps_decay
|
23 |
+
from espnet.asr.asr_utils import add_results_to_json
|
24 |
+
from espnet.asr.asr_utils import CompareValueTrigger
|
25 |
+
from espnet.asr.asr_utils import format_mulenc_args
|
26 |
+
from espnet.asr.asr_utils import get_model_conf
|
27 |
+
from espnet.asr.asr_utils import plot_spectrogram
|
28 |
+
from espnet.asr.asr_utils import restore_snapshot
|
29 |
+
from espnet.asr.asr_utils import snapshot_object
|
30 |
+
from espnet.asr.asr_utils import torch_load
|
31 |
+
from espnet.asr.asr_utils import torch_resume
|
32 |
+
from espnet.asr.asr_utils import torch_snapshot
|
33 |
+
from espnet.asr.pytorch_backend.asr_init import freeze_modules
|
34 |
+
from espnet.asr.pytorch_backend.asr_init import load_trained_model
|
35 |
+
from espnet.asr.pytorch_backend.asr_init import load_trained_modules
|
36 |
+
import espnet.lm.pytorch_backend.extlm as extlm_pytorch
|
37 |
+
from espnet.nets.asr_interface import ASRInterface
|
38 |
+
from espnet.nets.beam_search_transducer import BeamSearchTransducer
|
39 |
+
from espnet.nets.pytorch_backend.e2e_asr import pad_list
|
40 |
+
import espnet.nets.pytorch_backend.lm.default as lm_pytorch
|
41 |
+
from espnet.nets.pytorch_backend.streaming.segment import SegmentStreamingE2E
|
42 |
+
from espnet.nets.pytorch_backend.streaming.window import WindowStreamingE2E
|
43 |
+
from espnet.transform.spectrogram import IStft
|
44 |
+
from espnet.transform.transformation import Transformation
|
45 |
+
from espnet.utils.cli_writers import file_writer_helper
|
46 |
+
from espnet.utils.dataset import ChainerDataLoader
|
47 |
+
from espnet.utils.dataset import TransformDataset
|
48 |
+
from espnet.utils.deterministic_utils import set_deterministic_pytorch
|
49 |
+
from espnet.utils.dynamic_import import dynamic_import
|
50 |
+
from espnet.utils.io_utils import LoadInputsAndTargets
|
51 |
+
from espnet.utils.training.batchfy import make_batchset
|
52 |
+
from espnet.utils.training.evaluator import BaseEvaluator
|
53 |
+
from espnet.utils.training.iterators import ShufflingEnabler
|
54 |
+
from espnet.utils.training.tensorboard_logger import TensorboardLogger
|
55 |
+
from espnet.utils.training.train_utils import check_early_stop
|
56 |
+
from espnet.utils.training.train_utils import set_early_stop
|
57 |
+
|
58 |
+
import matplotlib
|
59 |
+
|
60 |
+
matplotlib.use("Agg")
|
61 |
+
|
62 |
+
if sys.version_info[0] == 2:
|
63 |
+
from itertools import izip_longest as zip_longest
|
64 |
+
else:
|
65 |
+
from itertools import zip_longest as zip_longest
|
66 |
+
|
67 |
+
|
68 |
+
def _recursive_to(xs, device):
|
69 |
+
if torch.is_tensor(xs):
|
70 |
+
return xs.to(device)
|
71 |
+
if isinstance(xs, tuple):
|
72 |
+
return tuple(_recursive_to(x, device) for x in xs)
|
73 |
+
return xs
|
74 |
+
|
75 |
+
|
76 |
+
class CustomEvaluator(BaseEvaluator):
|
77 |
+
"""Custom Evaluator for Pytorch.
|
78 |
+
|
79 |
+
Args:
|
80 |
+
model (torch.nn.Module): The model to evaluate.
|
81 |
+
iterator (chainer.dataset.Iterator): The train iterator.
|
82 |
+
|
83 |
+
target (link | dict[str, link]): Link object or a dictionary of
|
84 |
+
links to evaluate. If this is just a link object, the link is
|
85 |
+
registered by the name ``'main'``.
|
86 |
+
|
87 |
+
device (torch.device): The device used.
|
88 |
+
ngpu (int): The number of GPUs.
|
89 |
+
|
90 |
+
"""
|
91 |
+
|
92 |
+
def __init__(self, model, iterator, target, device, ngpu=None):
|
93 |
+
super(CustomEvaluator, self).__init__(iterator, target)
|
94 |
+
self.model = model
|
95 |
+
self.device = device
|
96 |
+
if ngpu is not None:
|
97 |
+
self.ngpu = ngpu
|
98 |
+
elif device.type == "cpu":
|
99 |
+
self.ngpu = 0
|
100 |
+
else:
|
101 |
+
self.ngpu = 1
|
102 |
+
|
103 |
+
# The core part of the update routine can be customized by overriding
|
104 |
+
def evaluate(self):
|
105 |
+
"""Main evaluate routine for CustomEvaluator."""
|
106 |
+
iterator = self._iterators["main"]
|
107 |
+
|
108 |
+
if self.eval_hook:
|
109 |
+
self.eval_hook(self)
|
110 |
+
|
111 |
+
if hasattr(iterator, "reset"):
|
112 |
+
iterator.reset()
|
113 |
+
it = iterator
|
114 |
+
else:
|
115 |
+
it = copy.copy(iterator)
|
116 |
+
|
117 |
+
summary = reporter_module.DictSummary()
|
118 |
+
|
119 |
+
self.model.eval()
|
120 |
+
with torch.no_grad():
|
121 |
+
for batch in it:
|
122 |
+
x = _recursive_to(batch, self.device)
|
123 |
+
observation = {}
|
124 |
+
with reporter_module.report_scope(observation):
|
125 |
+
# read scp files
|
126 |
+
# x: original json with loaded features
|
127 |
+
# will be converted to chainer variable later
|
128 |
+
if self.ngpu == 0:
|
129 |
+
self.model(*x)
|
130 |
+
else:
|
131 |
+
# apex does not support torch.nn.DataParallel
|
132 |
+
data_parallel(self.model, x, range(self.ngpu))
|
133 |
+
|
134 |
+
summary.add(observation)
|
135 |
+
self.model.train()
|
136 |
+
|
137 |
+
return summary.compute_mean()
|
138 |
+
|
139 |
+
|
140 |
+
class CustomUpdater(StandardUpdater):
|
141 |
+
"""Custom Updater for Pytorch.
|
142 |
+
|
143 |
+
Args:
|
144 |
+
model (torch.nn.Module): The model to update.
|
145 |
+
grad_clip_threshold (float): The gradient clipping value to use.
|
146 |
+
train_iter (chainer.dataset.Iterator): The training iterator.
|
147 |
+
optimizer (torch.optim.optimizer): The training optimizer.
|
148 |
+
|
149 |
+
device (torch.device): The device to use.
|
150 |
+
ngpu (int): The number of gpus to use.
|
151 |
+
use_apex (bool): The flag to use Apex in backprop.
|
152 |
+
|
153 |
+
"""
|
154 |
+
|
155 |
+
def __init__(
|
156 |
+
self,
|
157 |
+
model,
|
158 |
+
grad_clip_threshold,
|
159 |
+
train_iter,
|
160 |
+
optimizer,
|
161 |
+
device,
|
162 |
+
ngpu,
|
163 |
+
grad_noise=False,
|
164 |
+
accum_grad=1,
|
165 |
+
use_apex=False,
|
166 |
+
):
|
167 |
+
super(CustomUpdater, self).__init__(train_iter, optimizer)
|
168 |
+
self.model = model
|
169 |
+
self.grad_clip_threshold = grad_clip_threshold
|
170 |
+
self.device = device
|
171 |
+
self.ngpu = ngpu
|
172 |
+
self.accum_grad = accum_grad
|
173 |
+
self.forward_count = 0
|
174 |
+
self.grad_noise = grad_noise
|
175 |
+
self.iteration = 0
|
176 |
+
self.use_apex = use_apex
|
177 |
+
|
178 |
+
# The core part of the update routine can be customized by overriding.
|
179 |
+
def update_core(self):
|
180 |
+
"""Main update routine of the CustomUpdater."""
|
181 |
+
# When we pass one iterator and optimizer to StandardUpdater.__init__,
|
182 |
+
# they are automatically named 'main'.
|
183 |
+
train_iter = self.get_iterator("main")
|
184 |
+
optimizer = self.get_optimizer("main")
|
185 |
+
epoch = train_iter.epoch
|
186 |
+
|
187 |
+
# Get the next batch (a list of json files)
|
188 |
+
batch = train_iter.next()
|
189 |
+
# self.iteration += 1 # Increase may result in early report,
|
190 |
+
# which is done in other place automatically.
|
191 |
+
x = _recursive_to(batch, self.device)
|
192 |
+
is_new_epoch = train_iter.epoch != epoch
|
193 |
+
# When the last minibatch in the current epoch is given,
|
194 |
+
# gradient accumulation is turned off in order to evaluate the model
|
195 |
+
# on the validation set in every epoch.
|
196 |
+
# see details in https://github.com/espnet/espnet/pull/1388
|
197 |
+
|
198 |
+
# Compute the loss at this time step and accumulate it
|
199 |
+
if self.ngpu == 0:
|
200 |
+
loss = self.model(*x).mean() / self.accum_grad
|
201 |
+
else:
|
202 |
+
# apex does not support torch.nn.DataParallel
|
203 |
+
loss = (
|
204 |
+
data_parallel(self.model, x, range(self.ngpu)).mean() / self.accum_grad
|
205 |
+
)
|
206 |
+
if self.use_apex:
|
207 |
+
from apex import amp
|
208 |
+
|
209 |
+
# NOTE: for a compatibility with noam optimizer
|
210 |
+
opt = optimizer.optimizer if hasattr(optimizer, "optimizer") else optimizer
|
211 |
+
with amp.scale_loss(loss, opt) as scaled_loss:
|
212 |
+
scaled_loss.backward()
|
213 |
+
else:
|
214 |
+
loss.backward()
|
215 |
+
# gradient noise injection
|
216 |
+
if self.grad_noise:
|
217 |
+
from espnet.asr.asr_utils import add_gradient_noise
|
218 |
+
|
219 |
+
add_gradient_noise(
|
220 |
+
self.model, self.iteration, duration=100, eta=1.0, scale_factor=0.55
|
221 |
+
)
|
222 |
+
|
223 |
+
# update parameters
|
224 |
+
self.forward_count += 1
|
225 |
+
if not is_new_epoch and self.forward_count != self.accum_grad:
|
226 |
+
return
|
227 |
+
self.forward_count = 0
|
228 |
+
# compute the gradient norm to check if it is normal or not
|
229 |
+
grad_norm = torch.nn.utils.clip_grad_norm_(
|
230 |
+
self.model.parameters(), self.grad_clip_threshold
|
231 |
+
)
|
232 |
+
logging.info("grad norm={}".format(grad_norm))
|
233 |
+
if math.isnan(grad_norm):
|
234 |
+
logging.warning("grad norm is nan. Do not update model.")
|
235 |
+
else:
|
236 |
+
optimizer.step()
|
237 |
+
optimizer.zero_grad()
|
238 |
+
|
239 |
+
def update(self):
|
240 |
+
self.update_core()
|
241 |
+
# #iterations with accum_grad > 1
|
242 |
+
# Ref.: https://github.com/espnet/espnet/issues/777
|
243 |
+
if self.forward_count == 0:
|
244 |
+
self.iteration += 1
|
245 |
+
|
246 |
+
|
247 |
+
class CustomConverter(object):
|
248 |
+
"""Custom batch converter for Pytorch.
|
249 |
+
|
250 |
+
Args:
|
251 |
+
subsampling_factor (int): The subsampling factor.
|
252 |
+
dtype (torch.dtype): Data type to convert.
|
253 |
+
|
254 |
+
"""
|
255 |
+
|
256 |
+
def __init__(self, subsampling_factor=1, dtype=torch.float32):
|
257 |
+
"""Construct a CustomConverter object."""
|
258 |
+
self.subsampling_factor = subsampling_factor
|
259 |
+
self.ignore_id = -1
|
260 |
+
self.dtype = dtype
|
261 |
+
|
262 |
+
def __call__(self, batch, device=torch.device("cpu")):
|
263 |
+
"""Transform a batch and send it to a device.
|
264 |
+
|
265 |
+
Args:
|
266 |
+
batch (list): The batch to transform.
|
267 |
+
device (torch.device): The device to send to.
|
268 |
+
|
269 |
+
Returns:
|
270 |
+
tuple(torch.Tensor, torch.Tensor, torch.Tensor)
|
271 |
+
|
272 |
+
"""
|
273 |
+
# batch should be located in list
|
274 |
+
assert len(batch) == 1
|
275 |
+
xs, ys = batch[0]
|
276 |
+
|
277 |
+
# perform subsampling
|
278 |
+
if self.subsampling_factor > 1:
|
279 |
+
xs = [x[:: self.subsampling_factor, :] for x in xs]
|
280 |
+
|
281 |
+
# get batch of lengths of input sequences
|
282 |
+
ilens = np.array([x.shape[0] for x in xs])
|
283 |
+
|
284 |
+
# perform padding and convert to tensor
|
285 |
+
# currently only support real number
|
286 |
+
if xs[0].dtype.kind == "c":
|
287 |
+
xs_pad_real = pad_list(
|
288 |
+
[torch.from_numpy(x.real).float() for x in xs], 0
|
289 |
+
).to(device, dtype=self.dtype)
|
290 |
+
xs_pad_imag = pad_list(
|
291 |
+
[torch.from_numpy(x.imag).float() for x in xs], 0
|
292 |
+
).to(device, dtype=self.dtype)
|
293 |
+
# Note(kamo):
|
294 |
+
# {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
|
295 |
+
# Don't create ComplexTensor and give it E2E here
|
296 |
+
# because torch.nn.DataParallel can't handle it.
|
297 |
+
xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}
|
298 |
+
else:
|
299 |
+
xs_pad = pad_list([torch.from_numpy(x).float() for x in xs], 0).to(
|
300 |
+
device, dtype=self.dtype
|
301 |
+
)
|
302 |
+
|
303 |
+
ilens = torch.from_numpy(ilens).to(device)
|
304 |
+
# NOTE: this is for multi-output (e.g., speech translation)
|
305 |
+
ys_pad = pad_list(
|
306 |
+
[
|
307 |
+
torch.from_numpy(
|
308 |
+
np.array(y[0][:]) if isinstance(y, tuple) else y
|
309 |
+
).long()
|
310 |
+
for y in ys
|
311 |
+
],
|
312 |
+
self.ignore_id,
|
313 |
+
).to(device)
|
314 |
+
|
315 |
+
return xs_pad, ilens, ys_pad
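Illustrative sketch (not part of the committed file): the converter applied to a toy two-utterance batch (shapes invented), showing the zero-padded features, the length tensor, and the targets padded with ignore_id=-1:

import numpy as np
import torch

from espnet.asr.pytorch_backend.asr import CustomConverter

xs = [np.random.randn(7, 5).astype(np.float32),   # 7 frames of 5-dim features
      np.random.randn(4, 5).astype(np.float32)]   # 4 frames
ys = [np.array([1, 2, 3]), np.array([4, 5])]      # token id targets

converter = CustomConverter(subsampling_factor=1, dtype=torch.float32)
xs_pad, ilens, ys_pad = converter([(xs, ys)], device=torch.device("cpu"))
print(xs_pad.shape)  # torch.Size([2, 7, 5]) - zero padded to the longest utterance
print(ilens)         # tensor([7, 4])
print(ys_pad)        # second row padded with -1 (ignore_id)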
|
316 |
+
|
317 |
+
|
318 |
+
class CustomConverterMulEnc(object):
|
319 |
+
"""Custom batch converter for Pytorch in multi-encoder case.
|
320 |
+
|
321 |
+
Args:
|
322 |
+
subsampling_factors (list): List of subsampling factors for each encoder.
|
323 |
+
dtype (torch.dtype): Data type to convert.
|
324 |
+
|
325 |
+
"""
|
326 |
+
|
327 |
+
def __init__(self, subsamping_factors=[1, 1], dtype=torch.float32):
|
328 |
+
"""Initialize the converter."""
|
329 |
+
self.subsamping_factors = subsamping_factors
|
330 |
+
self.ignore_id = -1
|
331 |
+
self.dtype = dtype
|
332 |
+
self.num_encs = len(subsamping_factors)
|
333 |
+
|
334 |
+
def __call__(self, batch, device=torch.device("cpu")):
|
335 |
+
"""Transform a batch and send it to a device.
|
336 |
+
|
337 |
+
Args:
|
338 |
+
batch (list): The batch to transform.
|
339 |
+
device (torch.device): The device to send to.
|
340 |
+
|
341 |
+
Returns:
|
342 |
+
tuple( list(torch.Tensor), list(torch.Tensor), torch.Tensor)
|
343 |
+
|
344 |
+
"""
|
345 |
+
# batch should be located in list
|
346 |
+
assert len(batch) == 1
|
347 |
+
xs_list = batch[0][: self.num_encs]
|
348 |
+
ys = batch[0][-1]
|
349 |
+
|
350 |
+
# perform subsampling
|
351 |
+
if np.sum(self.subsampling_factors) > self.num_encs:
|
352 |
+
xs_list = [
|
353 |
+
[x[:: self.subsampling_factors[i], :] for x in xs_list[i]]
|
354 |
+
for i in range(self.num_encs)
|
355 |
+
]
|
356 |
+
|
357 |
+
# get batch of lengths of input sequences
|
358 |
+
ilens_list = [
|
359 |
+
np.array([x.shape[0] for x in xs_list[i]]) for i in range(self.num_encs)
|
360 |
+
]
|
361 |
+
|
362 |
+
# perform padding and convert to tensor
|
363 |
+
# currently only real numbers are supported
|
364 |
+
xs_list_pad = [
|
365 |
+
pad_list([torch.from_numpy(x).float() for x in xs_list[i]], 0).to(
|
366 |
+
device, dtype=self.dtype
|
367 |
+
)
|
368 |
+
for i in range(self.num_encs)
|
369 |
+
]
|
370 |
+
|
371 |
+
ilens_list = [
|
372 |
+
torch.from_numpy(ilens_list[i]).to(device) for i in range(self.num_encs)
|
373 |
+
]
|
374 |
+
# NOTE: this is for multi-task learning (e.g., speech translation)
|
375 |
+
ys_pad = pad_list(
|
376 |
+
[
|
377 |
+
torch.from_numpy(np.array(y[0]) if isinstance(y, tuple) else y).long()
|
378 |
+
for y in ys
|
379 |
+
],
|
380 |
+
self.ignore_id,
|
381 |
+
).to(device)
|
382 |
+
|
383 |
+
return xs_list_pad, ilens_list, ys_pad
|
384 |
+
|
385 |
+
|
386 |
+
def train(args):
|
387 |
+
"""Train with the given args.
|
388 |
+
|
389 |
+
Args:
|
390 |
+
args (namespace): The program arguments.
|
391 |
+
|
392 |
+
"""
|
393 |
+
set_deterministic_pytorch(args)
|
394 |
+
if args.num_encs > 1:
|
395 |
+
args = format_mulenc_args(args)
|
396 |
+
|
397 |
+
# check cuda availability
|
398 |
+
if not torch.cuda.is_available():
|
399 |
+
logging.warning("cuda is not available")
|
400 |
+
|
401 |
+
# get input and output dimension info
|
402 |
+
with open(args.valid_json, "rb") as f:
|
403 |
+
valid_json = json.load(f)["utts"]
|
404 |
+
utts = list(valid_json.keys())
|
405 |
+
idim_list = [
|
406 |
+
int(valid_json[utts[0]]["input"][i]["shape"][-1]) for i in range(args.num_encs)
|
407 |
+
]
|
408 |
+
odim = int(valid_json[utts[0]]["output"][0]["shape"][-1])
|
409 |
+
for i in range(args.num_encs):
|
410 |
+
logging.info("stream{}: input dims : {}".format(i + 1, idim_list[i]))
|
411 |
+
logging.info("#output dims: " + str(odim))
|
412 |
+
|
413 |
+
# specify attention, CTC, hybrid mode
|
414 |
+
if "transducer" in args.model_module:
|
415 |
+
if (
|
416 |
+
getattr(args, "etype", False) == "custom"
|
417 |
+
or getattr(args, "dtype", False) == "custom"
|
418 |
+
):
|
419 |
+
mtl_mode = "custom_transducer"
|
420 |
+
else:
|
421 |
+
mtl_mode = "transducer"
|
422 |
+
logging.info("Pure transducer mode")
|
423 |
+
elif args.mtlalpha == 1.0:
|
424 |
+
mtl_mode = "ctc"
|
425 |
+
logging.info("Pure CTC mode")
|
426 |
+
elif args.mtlalpha == 0.0:
|
427 |
+
mtl_mode = "att"
|
428 |
+
logging.info("Pure attention mode")
|
429 |
+
else:
|
430 |
+
mtl_mode = "mtl"
|
431 |
+
logging.info("Multitask learning mode")
|
432 |
+
|
433 |
+
if (args.enc_init is not None or args.dec_init is not None) and args.num_encs == 1:
|
434 |
+
model = load_trained_modules(idim_list[0], odim, args)
|
435 |
+
else:
|
436 |
+
model_class = dynamic_import(args.model_module)
|
437 |
+
model = model_class(
|
438 |
+
idim_list[0] if args.num_encs == 1 else idim_list, odim, args
|
439 |
+
)
|
440 |
+
assert isinstance(model, ASRInterface)
|
441 |
+
total_subsampling_factor = model.get_total_subsampling_factor()
|
442 |
+
|
443 |
+
logging.info(
|
444 |
+
" Total parameter of the model = "
|
445 |
+
+ str(sum(p.numel() for p in model.parameters()))
|
446 |
+
)
|
447 |
+
|
448 |
+
if args.rnnlm is not None:
|
449 |
+
rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
|
450 |
+
rnnlm = lm_pytorch.ClassifierWithState(
|
451 |
+
lm_pytorch.RNNLM(len(args.char_list), rnnlm_args.layer, rnnlm_args.unit)
|
452 |
+
)
|
453 |
+
torch_load(args.rnnlm, rnnlm)
|
454 |
+
model.rnnlm = rnnlm
|
455 |
+
|
456 |
+
# write model config
|
457 |
+
if not os.path.exists(args.outdir):
|
458 |
+
os.makedirs(args.outdir)
|
459 |
+
model_conf = args.outdir + "/model.json"
|
460 |
+
with open(model_conf, "wb") as f:
|
461 |
+
logging.info("writing a model config file to " + model_conf)
|
462 |
+
f.write(
|
463 |
+
json.dumps(
|
464 |
+
(idim_list[0] if args.num_encs == 1 else idim_list, odim, vars(args)),
|
465 |
+
indent=4,
|
466 |
+
ensure_ascii=False,
|
467 |
+
sort_keys=True,
|
468 |
+
).encode("utf_8")
|
469 |
+
)
|
470 |
+
for key in sorted(vars(args).keys()):
|
471 |
+
logging.info("ARGS: " + key + ": " + str(vars(args)[key]))
|
472 |
+
|
473 |
+
reporter = model.reporter
|
474 |
+
|
475 |
+
# check the use of multi-gpu
|
476 |
+
if args.ngpu > 1:
|
477 |
+
if args.batch_size != 0:
|
478 |
+
logging.warning(
|
479 |
+
"batch size is automatically increased (%d -> %d)"
|
480 |
+
% (args.batch_size, args.batch_size * args.ngpu)
|
481 |
+
)
|
482 |
+
args.batch_size *= args.ngpu
|
483 |
+
if args.num_encs > 1:
|
484 |
+
# TODO(ruizhili): implement data parallel for multi-encoder setup.
|
485 |
+
raise NotImplementedError(
|
486 |
+
"Data parallel is not supported for multi-encoder setup."
|
487 |
+
)
|
488 |
+
|
489 |
+
# set torch device
|
490 |
+
device = torch.device("cuda" if args.ngpu > 0 else "cpu")
|
491 |
+
if args.train_dtype in ("float16", "float32", "float64"):
|
492 |
+
dtype = getattr(torch, args.train_dtype)
|
493 |
+
else:
|
494 |
+
dtype = torch.float32
|
495 |
+
model = model.to(device=device, dtype=dtype)
|
496 |
+
|
497 |
+
if args.freeze_mods:
|
498 |
+
model, model_params = freeze_modules(model, args.freeze_mods)
|
499 |
+
else:
|
500 |
+
model_params = model.parameters()
|
501 |
+
|
502 |
+
logging.warning(
|
503 |
+
"num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
|
504 |
+
sum(p.numel() for p in model.parameters()),
|
505 |
+
sum(p.numel() for p in model.parameters() if p.requires_grad),
|
506 |
+
sum(p.numel() for p in model.parameters() if p.requires_grad)
|
507 |
+
* 100.0
|
508 |
+
/ sum(p.numel() for p in model.parameters()),
|
509 |
+
)
|
510 |
+
)
|
511 |
+
|
512 |
+
# Setup an optimizer
|
513 |
+
if args.opt == "adadelta":
|
514 |
+
optimizer = torch.optim.Adadelta(
|
515 |
+
model_params, rho=0.95, eps=args.eps, weight_decay=args.weight_decay
|
516 |
+
)
|
517 |
+
elif args.opt == "adam":
|
518 |
+
optimizer = torch.optim.Adam(model_params, weight_decay=args.weight_decay)
|
519 |
+
elif args.opt == "noam":
|
520 |
+
from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt
|
521 |
+
|
522 |
+
# For transformer-transducer, adim declaration is within the block definition.
|
523 |
+
# Thus, we need to retrieve the most dominant value (d_hidden) for the Noam scheduler.
|
524 |
+
if hasattr(args, "enc_block_arch") or hasattr(args, "dec_block_arch"):
|
525 |
+
adim = model.most_dom_dim
|
526 |
+
else:
|
527 |
+
adim = args.adim
|
528 |
+
|
529 |
+
optimizer = get_std_opt(
|
530 |
+
model_params, adim, args.transformer_warmup_steps, args.transformer_lr
|
531 |
+
)
|
532 |
+
else:
|
533 |
+
raise NotImplementedError("unknown optimizer: " + args.opt)
|
534 |
+
|
535 |
+
# setup apex.amp
|
536 |
+
if args.train_dtype in ("O0", "O1", "O2", "O3"):
|
537 |
+
try:
|
538 |
+
from apex import amp
|
539 |
+
except ImportError as e:
|
540 |
+
logging.error(
|
541 |
+
f"You need to install apex for --train-dtype {args.train_dtype}. "
|
542 |
+
"See https://github.com/NVIDIA/apex#linux"
|
543 |
+
)
|
544 |
+
raise e
|
545 |
+
if args.opt == "noam":
|
546 |
+
model, optimizer.optimizer = amp.initialize(
|
547 |
+
model, optimizer.optimizer, opt_level=args.train_dtype
|
548 |
+
)
|
549 |
+
else:
|
550 |
+
model, optimizer = amp.initialize(
|
551 |
+
model, optimizer, opt_level=args.train_dtype
|
552 |
+
)
|
553 |
+
use_apex = True
|
554 |
+
|
555 |
+
from espnet.nets.pytorch_backend.ctc import CTC
|
556 |
+
|
557 |
+
amp.register_float_function(CTC, "loss_fn")
|
558 |
+
amp.init()
|
559 |
+
logging.warning("register ctc as float function")
|
560 |
+
else:
|
561 |
+
use_apex = False
|
562 |
+
|
563 |
+
# FIXME: TOO DIRTY HACK
|
564 |
+
setattr(optimizer, "target", reporter)
|
565 |
+
setattr(optimizer, "serialize", lambda s: reporter.serialize(s))
|
566 |
+
|
567 |
+
# Setup a converter
|
568 |
+
if args.num_encs == 1:
|
569 |
+
converter = CustomConverter(subsampling_factor=model.subsample[0], dtype=dtype)
|
570 |
+
else:
|
571 |
+
converter = CustomConverterMulEnc(
|
572 |
+
[i[0] for i in model.subsample_list], dtype=dtype
|
573 |
+
)
|
574 |
+
|
575 |
+
# read json data
|
576 |
+
with open(args.train_json, "rb") as f:
|
577 |
+
train_json = json.load(f)["utts"]
|
578 |
+
with open(args.valid_json, "rb") as f:
|
579 |
+
valid_json = json.load(f)["utts"]
|
580 |
+
|
581 |
+
use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
|
582 |
+
# make minibatch list (variable length)
|
583 |
+
train = make_batchset(
|
584 |
+
train_json,
|
585 |
+
args.batch_size,
|
586 |
+
args.maxlen_in,
|
587 |
+
args.maxlen_out,
|
588 |
+
args.minibatches,
|
589 |
+
min_batch_size=args.ngpu if args.ngpu > 1 else 1,
|
590 |
+
shortest_first=use_sortagrad,
|
591 |
+
count=args.batch_count,
|
592 |
+
batch_bins=args.batch_bins,
|
593 |
+
batch_frames_in=args.batch_frames_in,
|
594 |
+
batch_frames_out=args.batch_frames_out,
|
595 |
+
batch_frames_inout=args.batch_frames_inout,
|
596 |
+
iaxis=0,
|
597 |
+
oaxis=0,
|
598 |
+
)
|
599 |
+
valid = make_batchset(
|
600 |
+
valid_json,
|
601 |
+
args.batch_size,
|
602 |
+
args.maxlen_in,
|
603 |
+
args.maxlen_out,
|
604 |
+
args.minibatches,
|
605 |
+
min_batch_size=args.ngpu if args.ngpu > 1 else 1,
|
606 |
+
count=args.batch_count,
|
607 |
+
batch_bins=args.batch_bins,
|
608 |
+
batch_frames_in=args.batch_frames_in,
|
609 |
+
batch_frames_out=args.batch_frames_out,
|
610 |
+
batch_frames_inout=args.batch_frames_inout,
|
611 |
+
iaxis=0,
|
612 |
+
oaxis=0,
|
613 |
+
)
|
614 |
+
|
615 |
+
load_tr = LoadInputsAndTargets(
|
616 |
+
mode="asr",
|
617 |
+
load_output=True,
|
618 |
+
preprocess_conf=args.preprocess_conf,
|
619 |
+
preprocess_args={"train": True}, # Switch the mode of preprocessing
|
620 |
+
)
|
621 |
+
load_cv = LoadInputsAndTargets(
|
622 |
+
mode="asr",
|
623 |
+
load_output=True,
|
624 |
+
preprocess_conf=args.preprocess_conf,
|
625 |
+
preprocess_args={"train": False}, # Switch the mode of preprocessing
|
626 |
+
)
|
627 |
+
# hack to make batchsize argument as 1
|
628 |
+
# actual bathsize is included in a list
|
629 |
+
# default collate function converts numpy array to pytorch tensor
|
630 |
+
# we used an empty collate function instead which returns list
|
631 |
+
train_iter = ChainerDataLoader(
|
632 |
+
dataset=TransformDataset(train, lambda data: converter([load_tr(data)])),
|
633 |
+
batch_size=1,
|
634 |
+
num_workers=args.n_iter_processes,
|
635 |
+
shuffle=not use_sortagrad,
|
636 |
+
collate_fn=lambda x: x[0],
|
637 |
+
)
|
638 |
+
valid_iter = ChainerDataLoader(
|
639 |
+
dataset=TransformDataset(valid, lambda data: converter([load_cv(data)])),
|
640 |
+
batch_size=1,
|
641 |
+
shuffle=False,
|
642 |
+
collate_fn=lambda x: x[0],
|
643 |
+
num_workers=args.n_iter_processes,
|
644 |
+
)
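# --- Illustrative sketch (assumption: shown with a plain torch DataLoader for
# comparison; the code above wraps the same idea in ChainerDataLoader) ---
# Every dataset item is already a complete minibatch, so the loader runs with
# batch_size=1 and an identity-like collate_fn that simply unwraps the list.
from torch.utils.data import DataLoader

def _make_batch_loader(dataset, num_workers=0, shuffle=True):
    return DataLoader(
        dataset,
        batch_size=1,               # one pre-built minibatch per iteration
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=lambda x: x[0],  # unwrap [minibatch] -> minibatch
    )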
|
645 |
+
|
646 |
+
# Set up a trainer
|
647 |
+
updater = CustomUpdater(
|
648 |
+
model,
|
649 |
+
args.grad_clip,
|
650 |
+
{"main": train_iter},
|
651 |
+
optimizer,
|
652 |
+
device,
|
653 |
+
args.ngpu,
|
654 |
+
args.grad_noise,
|
655 |
+
args.accum_grad,
|
656 |
+
use_apex=use_apex,
|
657 |
+
)
|
658 |
+
trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir)
|
659 |
+
|
660 |
+
if use_sortagrad:
|
661 |
+
trainer.extend(
|
662 |
+
ShufflingEnabler([train_iter]),
|
663 |
+
trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"),
|
664 |
+
)
|
665 |
+
|
666 |
+
# Resume from a snapshot
|
667 |
+
if args.resume:
|
668 |
+
logging.info("resumed from %s" % args.resume)
|
669 |
+
torch_resume(args.resume, trainer)
|
670 |
+
|
671 |
+
# Evaluate the model with the test dataset for each epoch
|
672 |
+
if args.save_interval_iters > 0:
|
673 |
+
trainer.extend(
|
674 |
+
CustomEvaluator(model, {"main": valid_iter}, reporter, device, args.ngpu),
|
675 |
+
trigger=(args.save_interval_iters, "iteration"),
|
676 |
+
)
|
677 |
+
else:
|
678 |
+
trainer.extend(
|
679 |
+
CustomEvaluator(model, {"main": valid_iter}, reporter, device, args.ngpu)
|
680 |
+
)
|
681 |
+
|
682 |
+
# Save attention weight each epoch
|
683 |
+
is_attn_plot = (
|
684 |
+
"transformer" in args.model_module
|
685 |
+
or "conformer" in args.model_module
|
686 |
+
or mtl_mode in ["att", "mtl", "custom_transducer"]
|
687 |
+
)
|
688 |
+
|
689 |
+
if args.num_save_attention > 0 and is_attn_plot:
|
690 |
+
data = sorted(
|
691 |
+
list(valid_json.items())[: args.num_save_attention],
|
692 |
+
key=lambda x: int(x[1]["input"][0]["shape"][1]),
|
693 |
+
reverse=True,
|
694 |
+
)
|
695 |
+
if hasattr(model, "module"):
|
696 |
+
att_vis_fn = model.module.calculate_all_attentions
|
697 |
+
plot_class = model.module.attention_plot_class
|
698 |
+
else:
|
699 |
+
att_vis_fn = model.calculate_all_attentions
|
700 |
+
plot_class = model.attention_plot_class
|
701 |
+
att_reporter = plot_class(
|
702 |
+
att_vis_fn,
|
703 |
+
data,
|
704 |
+
args.outdir + "/att_ws",
|
705 |
+
converter=converter,
|
706 |
+
transform=load_cv,
|
707 |
+
device=device,
|
708 |
+
subsampling_factor=total_subsampling_factor,
|
709 |
+
)
|
710 |
+
trainer.extend(att_reporter, trigger=(1, "epoch"))
|
711 |
+
else:
|
712 |
+
att_reporter = None
|
713 |
+
|
714 |
+
# Save CTC prob at each epoch
|
715 |
+
if mtl_mode in ["ctc", "mtl"] and args.num_save_ctc > 0:
|
716 |
+
# NOTE: sort it by output lengths
|
717 |
+
data = sorted(
|
718 |
+
list(valid_json.items())[: args.num_save_ctc],
|
719 |
+
key=lambda x: int(x[1]["output"][0]["shape"][0]),
|
720 |
+
reverse=True,
|
721 |
+
)
|
722 |
+
if hasattr(model, "module"):
|
723 |
+
ctc_vis_fn = model.module.calculate_all_ctc_probs
|
724 |
+
plot_class = model.module.ctc_plot_class
|
725 |
+
else:
|
726 |
+
ctc_vis_fn = model.calculate_all_ctc_probs
|
727 |
+
plot_class = model.ctc_plot_class
|
728 |
+
ctc_reporter = plot_class(
|
729 |
+
ctc_vis_fn,
|
730 |
+
data,
|
731 |
+
args.outdir + "/ctc_prob",
|
732 |
+
converter=converter,
|
733 |
+
transform=load_cv,
|
734 |
+
device=device,
|
735 |
+
subsampling_factor=total_subsampling_factor,
|
736 |
+
)
|
737 |
+
trainer.extend(ctc_reporter, trigger=(1, "epoch"))
|
738 |
+
else:
|
739 |
+
ctc_reporter = None
|
740 |
+
|
741 |
+
# Make a plot for training and validation values
|
742 |
+
if args.num_encs > 1:
|
743 |
+
report_keys_loss_ctc = [
|
744 |
+
"main/loss_ctc{}".format(i + 1) for i in range(model.num_encs)
|
745 |
+
] + ["validation/main/loss_ctc{}".format(i + 1) for i in range(model.num_encs)]
|
746 |
+
report_keys_cer_ctc = [
|
747 |
+
"main/cer_ctc{}".format(i + 1) for i in range(model.num_encs)
|
748 |
+
] + ["validation/main/cer_ctc{}".format(i + 1) for i in range(model.num_encs)]
|
749 |
+
|
750 |
+
if hasattr(model, "is_rnnt"):
|
751 |
+
trainer.extend(
|
752 |
+
extensions.PlotReport(
|
753 |
+
[
|
754 |
+
"main/loss",
|
755 |
+
"validation/main/loss",
|
756 |
+
"main/loss_trans",
|
757 |
+
"validation/main/loss_trans",
|
758 |
+
"main/loss_ctc",
|
759 |
+
"validation/main/loss_ctc",
|
760 |
+
"main/loss_lm",
|
761 |
+
"validation/main/loss_lm",
|
762 |
+
"main/loss_aux_trans",
|
763 |
+
"validation/main/loss_aux_trans",
|
764 |
+
"main/loss_aux_symm_kl",
|
765 |
+
"validation/main/loss_aux_symm_kl",
|
766 |
+
],
|
767 |
+
"epoch",
|
768 |
+
file_name="loss.png",
|
769 |
+
)
|
770 |
+
)
|
771 |
+
else:
|
772 |
+
trainer.extend(
|
773 |
+
extensions.PlotReport(
|
774 |
+
[
|
775 |
+
"main/loss",
|
776 |
+
"validation/main/loss",
|
777 |
+
"main/loss_ctc",
|
778 |
+
"validation/main/loss_ctc",
|
779 |
+
"main/loss_att",
|
780 |
+
"validation/main/loss_att",
|
781 |
+
]
|
782 |
+
+ ([] if args.num_encs == 1 else report_keys_loss_ctc),
|
783 |
+
"epoch",
|
784 |
+
file_name="loss.png",
|
785 |
+
)
|
786 |
+
)
|
787 |
+
|
788 |
+
trainer.extend(
|
789 |
+
extensions.PlotReport(
|
790 |
+
["main/acc", "validation/main/acc"], "epoch", file_name="acc.png"
|
791 |
+
)
|
792 |
+
)
|
793 |
+
trainer.extend(
|
794 |
+
extensions.PlotReport(
|
795 |
+
["main/cer_ctc", "validation/main/cer_ctc"]
|
796 |
+
+ ([] if args.num_encs == 1 else report_keys_loss_ctc),
|
797 |
+
"epoch",
|
798 |
+
file_name="cer.png",
|
799 |
+
)
|
800 |
+
)
|
801 |
+
|
802 |
+
# Save best models
|
803 |
+
trainer.extend(
|
804 |
+
snapshot_object(model, "model.loss.best"),
|
805 |
+
trigger=training.triggers.MinValueTrigger("validation/main/loss"),
|
806 |
+
)
|
807 |
+
if mtl_mode not in ["ctc", "transducer", "custom_transducer"]:
|
808 |
+
trainer.extend(
|
809 |
+
snapshot_object(model, "model.acc.best"),
|
810 |
+
trigger=training.triggers.MaxValueTrigger("validation/main/acc"),
|
811 |
+
)
|
812 |
+
|
813 |
+
# save snapshot which contains model and optimizer states
|
814 |
+
if args.save_interval_iters > 0:
|
815 |
+
trainer.extend(
|
816 |
+
torch_snapshot(filename="snapshot.iter.{.updater.iteration}"),
|
817 |
+
trigger=(args.save_interval_iters, "iteration"),
|
818 |
+
)
|
819 |
+
|
820 |
+
# save snapshot at every epoch - for model averaging
|
821 |
+
trainer.extend(torch_snapshot(), trigger=(1, "epoch"))
|
822 |
+
|
823 |
+
# epsilon decay in the optimizer
|
824 |
+
if args.opt == "adadelta":
|
825 |
+
if args.criterion == "acc" and mtl_mode != "ctc":
|
826 |
+
trainer.extend(
|
827 |
+
restore_snapshot(
|
828 |
+
model, args.outdir + "/model.acc.best", load_fn=torch_load
|
829 |
+
),
|
830 |
+
trigger=CompareValueTrigger(
|
831 |
+
"validation/main/acc",
|
832 |
+
lambda best_value, current_value: best_value > current_value,
|
833 |
+
),
|
834 |
+
)
|
835 |
+
trainer.extend(
|
836 |
+
adadelta_eps_decay(args.eps_decay),
|
837 |
+
trigger=CompareValueTrigger(
|
838 |
+
"validation/main/acc",
|
839 |
+
lambda best_value, current_value: best_value > current_value,
|
840 |
+
),
|
841 |
+
)
|
842 |
+
elif args.criterion == "loss":
|
843 |
+
trainer.extend(
|
844 |
+
restore_snapshot(
|
845 |
+
model, args.outdir + "/model.loss.best", load_fn=torch_load
|
846 |
+
),
|
847 |
+
trigger=CompareValueTrigger(
|
848 |
+
"validation/main/loss",
|
849 |
+
lambda best_value, current_value: best_value < current_value,
|
850 |
+
),
|
851 |
+
)
|
852 |
+
trainer.extend(
|
853 |
+
adadelta_eps_decay(args.eps_decay),
|
854 |
+
trigger=CompareValueTrigger(
|
855 |
+
"validation/main/loss",
|
856 |
+
lambda best_value, current_value: best_value < current_value,
|
857 |
+
),
|
858 |
+
)
|
859 |
+
# NOTE: In some cases, it may take more than one epoch for the model's loss
|
860 |
+
# to escape from a local minimum.
|
861 |
+
# Thus, restore_snapshot extension is not used here.
|
862 |
+
# see details in https://github.com/espnet/espnet/pull/2171
|
863 |
+
elif args.criterion == "loss_eps_decay_only":
|
864 |
+
trainer.extend(
|
865 |
+
adadelta_eps_decay(args.eps_decay),
|
866 |
+
trigger=CompareValueTrigger(
|
867 |
+
"validation/main/loss",
|
868 |
+
lambda best_value, current_value: best_value < current_value,
|
869 |
+
),
|
870 |
+
)
|
871 |
+
|
872 |
+
# Write a log of evaluation statistics for each epoch
|
873 |
+
trainer.extend(
|
874 |
+
extensions.LogReport(trigger=(args.report_interval_iters, "iteration"))
|
875 |
+
)
|
876 |
+
|
877 |
+
if hasattr(model, "is_rnnt"):
|
878 |
+
report_keys = [
|
879 |
+
"epoch",
|
880 |
+
"iteration",
|
881 |
+
"main/loss",
|
882 |
+
"main/loss_trans",
|
883 |
+
"main/loss_ctc",
|
884 |
+
"main/loss_lm",
|
885 |
+
"main/loss_aux_trans",
|
886 |
+
"main/loss_aux_symm_kl",
|
887 |
+
"validation/main/loss",
|
888 |
+
"validation/main/loss_trans",
|
889 |
+
"validation/main/loss_ctc",
|
890 |
+
"validation/main/loss_lm",
|
891 |
+
"validation/main/loss_aux_trans",
|
892 |
+
"validation/main/loss_aux_symm_kl",
|
893 |
+
"elapsed_time",
|
894 |
+
]
|
895 |
+
else:
|
896 |
+
report_keys = [
|
897 |
+
"epoch",
|
898 |
+
"iteration",
|
899 |
+
"main/loss",
|
900 |
+
"main/loss_ctc",
|
901 |
+
"main/loss_att",
|
902 |
+
"validation/main/loss",
|
903 |
+
"validation/main/loss_ctc",
|
904 |
+
"validation/main/loss_att",
|
905 |
+
"main/acc",
|
906 |
+
"validation/main/acc",
|
907 |
+
"main/cer_ctc",
|
908 |
+
"validation/main/cer_ctc",
|
909 |
+
"elapsed_time",
|
910 |
+
] + ([] if args.num_encs == 1 else report_keys_cer_ctc + report_keys_loss_ctc)
|
911 |
+
|
912 |
+
if args.opt == "adadelta":
|
913 |
+
trainer.extend(
|
914 |
+
extensions.observe_value(
|
915 |
+
"eps",
|
916 |
+
lambda trainer: trainer.updater.get_optimizer("main").param_groups[0][
|
917 |
+
"eps"
|
918 |
+
],
|
919 |
+
),
|
920 |
+
trigger=(args.report_interval_iters, "iteration"),
|
921 |
+
)
|
922 |
+
report_keys.append("eps")
|
923 |
+
if args.report_cer:
|
924 |
+
report_keys.append("validation/main/cer")
|
925 |
+
if args.report_wer:
|
926 |
+
report_keys.append("validation/main/wer")
|
927 |
+
trainer.extend(
|
928 |
+
extensions.PrintReport(report_keys),
|
929 |
+
trigger=(args.report_interval_iters, "iteration"),
|
930 |
+
)
|
931 |
+
|
932 |
+
trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
|
933 |
+
set_early_stop(trainer, args)
|
934 |
+
|
935 |
+
if args.tensorboard_dir is not None and args.tensorboard_dir != "":
|
936 |
+
trainer.extend(
|
937 |
+
TensorboardLogger(
|
938 |
+
SummaryWriter(args.tensorboard_dir),
|
939 |
+
att_reporter=att_reporter,
|
940 |
+
ctc_reporter=ctc_reporter,
|
941 |
+
),
|
942 |
+
trigger=(args.report_interval_iters, "iteration"),
|
943 |
+
)
|
944 |
+
# Run the training
|
945 |
+
trainer.run()
|
946 |
+
check_early_stop(trainer, args.epochs)
|
947 |
+
|
948 |
+
|
949 |
+
def recog(args):
|
950 |
+
"""Decode with the given args.
|
951 |
+
|
952 |
+
Args:
|
953 |
+
args (namespace): The program arguments.
|
954 |
+
|
955 |
+
"""
|
956 |
+
set_deterministic_pytorch(args)
|
957 |
+
model, train_args = load_trained_model(args.model, training=False)
|
958 |
+
assert isinstance(model, ASRInterface)
|
959 |
+
model.recog_args = args
|
960 |
+
|
961 |
+
if args.streaming_mode and "transformer" in train_args.model_module:
|
962 |
+
raise NotImplementedError("streaming mode for transformer is not implemented")
|
963 |
+
logging.info(
|
964 |
+
" Total parameter of the model = "
|
965 |
+
+ str(sum(p.numel() for p in model.parameters()))
|
966 |
+
)
|
967 |
+
|
968 |
+
# read rnnlm
|
969 |
+
if args.rnnlm:
|
970 |
+
rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
|
971 |
+
if getattr(rnnlm_args, "model_module", "default") != "default":
|
972 |
+
raise ValueError(
|
973 |
+
"use '--api v2' option to decode with non-default language model"
|
974 |
+
)
|
975 |
+
rnnlm = lm_pytorch.ClassifierWithState(
|
976 |
+
lm_pytorch.RNNLM(
|
977 |
+
len(train_args.char_list),
|
978 |
+
rnnlm_args.layer,
|
979 |
+
rnnlm_args.unit,
|
980 |
+
getattr(rnnlm_args, "embed_unit", None), # for backward compatibility
|
981 |
+
)
|
982 |
+
)
|
983 |
+
torch_load(args.rnnlm, rnnlm)
|
984 |
+
rnnlm.eval()
|
985 |
+
else:
|
986 |
+
rnnlm = None
|
987 |
+
|
988 |
+
if args.word_rnnlm:
|
989 |
+
rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf)
|
990 |
+
word_dict = rnnlm_args.char_list_dict
|
991 |
+
char_dict = {x: i for i, x in enumerate(train_args.char_list)}
|
992 |
+
word_rnnlm = lm_pytorch.ClassifierWithState(
|
993 |
+
lm_pytorch.RNNLM(
|
994 |
+
len(word_dict),
|
995 |
+
rnnlm_args.layer,
|
996 |
+
rnnlm_args.unit,
|
997 |
+
getattr(rnnlm_args, "embed_unit", None), # for backward compatibility
|
998 |
+
)
|
999 |
+
)
|
1000 |
+
torch_load(args.word_rnnlm, word_rnnlm)
|
1001 |
+
word_rnnlm.eval()
|
1002 |
+
|
1003 |
+
if rnnlm is not None:
|
1004 |
+
rnnlm = lm_pytorch.ClassifierWithState(
|
1005 |
+
extlm_pytorch.MultiLevelLM(
|
1006 |
+
word_rnnlm.predictor, rnnlm.predictor, word_dict, char_dict
|
1007 |
+
)
|
1008 |
+
)
|
1009 |
+
else:
|
1010 |
+
rnnlm = lm_pytorch.ClassifierWithState(
|
1011 |
+
extlm_pytorch.LookAheadWordLM(
|
1012 |
+
word_rnnlm.predictor, word_dict, char_dict
|
1013 |
+
)
|
1014 |
+
)
|
1015 |
+
|
1016 |
+
# gpu
|
1017 |
+
if args.ngpu == 1:
|
1018 |
+
gpu_id = list(range(args.ngpu))
|
1019 |
+
logging.info("gpu id: " + str(gpu_id))
|
1020 |
+
model.cuda()
|
1021 |
+
if rnnlm:
|
1022 |
+
rnnlm.cuda()
|
1023 |
+
|
1024 |
+
# read json data
|
1025 |
+
with open(args.recog_json, "rb") as f:
|
1026 |
+
js = json.load(f)["utts"]
|
1027 |
+
new_js = {}
|
1028 |
+
|
1029 |
+
load_inputs_and_targets = LoadInputsAndTargets(
|
1030 |
+
mode="asr",
|
1031 |
+
load_output=False,
|
1032 |
+
sort_in_input_length=False,
|
1033 |
+
preprocess_conf=train_args.preprocess_conf
|
1034 |
+
if args.preprocess_conf is None
|
1035 |
+
else args.preprocess_conf,
|
1036 |
+
preprocess_args={"train": False},
|
1037 |
+
)
|
1038 |
+
|
1039 |
+
# load transducer beam search
|
1040 |
+
if hasattr(model, "is_rnnt"):
|
1041 |
+
if hasattr(model, "dec"):
|
1042 |
+
trans_decoder = model.dec
|
1043 |
+
else:
|
1044 |
+
trans_decoder = model.decoder
|
1045 |
+
joint_network = model.joint_network
|
1046 |
+
|
1047 |
+
beam_search_transducer = BeamSearchTransducer(
|
1048 |
+
decoder=trans_decoder,
|
1049 |
+
joint_network=joint_network,
|
1050 |
+
beam_size=args.beam_size,
|
1051 |
+
nbest=args.nbest,
|
1052 |
+
lm=rnnlm,
|
1053 |
+
lm_weight=args.lm_weight,
|
1054 |
+
search_type=args.search_type,
|
1055 |
+
max_sym_exp=args.max_sym_exp,
|
1056 |
+
u_max=args.u_max,
|
1057 |
+
nstep=args.nstep,
|
1058 |
+
prefix_alpha=args.prefix_alpha,
|
1059 |
+
score_norm=args.score_norm,
|
1060 |
+
)
|
1061 |
+
|
1062 |
+
if args.batchsize == 0:
|
1063 |
+
with torch.no_grad():
|
1064 |
+
for idx, name in enumerate(js.keys(), 1):
|
1065 |
+
logging.info("(%d/%d) decoding " + name, idx, len(js.keys()))
|
1066 |
+
batch = [(name, js[name])]
|
1067 |
+
feat = load_inputs_and_targets(batch)
|
1068 |
+
feat = (
|
1069 |
+
feat[0][0]
|
1070 |
+
if args.num_encs == 1
|
1071 |
+
else [feat[idx][0] for idx in range(model.num_encs)]
|
1072 |
+
)
|
1073 |
+
if args.streaming_mode == "window" and args.num_encs == 1:
|
1074 |
+
logging.info(
|
1075 |
+
"Using streaming recognizer with window size %d frames",
|
1076 |
+
args.streaming_window,
|
1077 |
+
)
|
1078 |
+
se2e = WindowStreamingE2E(e2e=model, recog_args=args, rnnlm=rnnlm)
|
1079 |
+
for i in range(0, feat.shape[0], args.streaming_window):
|
1080 |
+
logging.info(
|
1081 |
+
"Feeding frames %d - %d", i, i + args.streaming_window
|
1082 |
+
)
|
1083 |
+
se2e.accept_input(feat[i : i + args.streaming_window])
|
1084 |
+
logging.info("Running offline attention decoder")
|
1085 |
+
se2e.decode_with_attention_offline()
|
1086 |
+
logging.info("Offline attention decoder finished")
|
1087 |
+
nbest_hyps = se2e.retrieve_recognition()
|
1088 |
+
elif args.streaming_mode == "segment" and args.num_encs == 1:
|
1089 |
+
logging.info(
|
1090 |
+
"Using streaming recognizer with threshold value %d",
|
1091 |
+
args.streaming_min_blank_dur,
|
1092 |
+
)
|
1093 |
+
nbest_hyps = []
|
1094 |
+
for n in range(args.nbest):
|
1095 |
+
nbest_hyps.append({"yseq": [], "score": 0.0})
|
1096 |
+
se2e = SegmentStreamingE2E(e2e=model, recog_args=args, rnnlm=rnnlm)
|
1097 |
+
r = np.prod(model.subsample)
|
1098 |
+
for i in range(0, feat.shape[0], r):
|
1099 |
+
hyps = se2e.accept_input(feat[i : i + r])
|
1100 |
+
if hyps is not None:
|
1101 |
+
text = "".join(
|
1102 |
+
[
|
1103 |
+
train_args.char_list[int(x)]
|
1104 |
+
for x in hyps[0]["yseq"][1:-1]
|
1105 |
+
if int(x) != -1
|
1106 |
+
]
|
1107 |
+
)
|
1108 |
+
text = text.replace(
|
1109 |
+
"\u2581", " "
|
1110 |
+
).strip() # for SentencePiece
|
1111 |
+
text = text.replace(model.space, " ")
|
1112 |
+
text = text.replace(model.blank, "")
|
1113 |
+
logging.info(text)
|
1114 |
+
for n in range(args.nbest):
|
1115 |
+
nbest_hyps[n]["yseq"].extend(hyps[n]["yseq"])
|
1116 |
+
nbest_hyps[n]["score"] += hyps[n]["score"]
|
1117 |
+
elif hasattr(model, "is_rnnt"):
|
1118 |
+
nbest_hyps = model.recognize(feat, beam_search_transducer)
|
1119 |
+
else:
|
1120 |
+
nbest_hyps = model.recognize(
|
1121 |
+
feat, args, train_args.char_list, rnnlm
|
1122 |
+
)
|
1123 |
+
new_js[name] = add_results_to_json(
|
1124 |
+
js[name], nbest_hyps, train_args.char_list
|
1125 |
+
)
|
1126 |
+
|
1127 |
+
else:
|
1128 |
+
|
1129 |
+
def grouper(n, iterable, fillvalue=None):
|
1130 |
+
kargs = [iter(iterable)] * n
|
1131 |
+
return zip_longest(*kargs, fillvalue=fillvalue)
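# --- Illustrative example (hypothetical utterance ids) ---
# list(grouper(2, ["utt1", "utt2", "utt3"]))
#   -> [("utt1", "utt2"), ("utt3", None)]
# The trailing None added by the fill value is dropped just below via
# `names = [name for name in names if name]`.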
|
1132 |
+
|
1133 |
+
# sort data if batchsize > 1
|
1134 |
+
keys = list(js.keys())
|
1135 |
+
if args.batchsize > 1:
|
1136 |
+
feat_lens = [js[key]["input"][0]["shape"][0] for key in keys]
|
1137 |
+
sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i])
|
1138 |
+
keys = [keys[i] for i in sorted_index]
|
1139 |
+
|
1140 |
+
with torch.no_grad():
|
1141 |
+
for names in grouper(args.batchsize, keys, None):
|
1142 |
+
names = [name for name in names if name]
|
1143 |
+
batch = [(name, js[name]) for name in names]
|
1144 |
+
feats = (
|
1145 |
+
load_inputs_and_targets(batch)[0]
|
1146 |
+
if args.num_encs == 1
|
1147 |
+
else load_inputs_and_targets(batch)
|
1148 |
+
)
|
1149 |
+
if args.streaming_mode == "window" and args.num_encs == 1:
|
1150 |
+
raise NotImplementedError
|
1151 |
+
elif args.streaming_mode == "segment" and args.num_encs == 1:
|
1152 |
+
if args.batchsize > 1:
|
1153 |
+
raise NotImplementedError
|
1154 |
+
feat = feats[0]
|
1155 |
+
nbest_hyps = []
|
1156 |
+
for n in range(args.nbest):
|
1157 |
+
nbest_hyps.append({"yseq": [], "score": 0.0})
|
1158 |
+
se2e = SegmentStreamingE2E(e2e=model, recog_args=args, rnnlm=rnnlm)
|
1159 |
+
r = np.prod(model.subsample)
|
1160 |
+
for i in range(0, feat.shape[0], r):
|
1161 |
+
hyps = se2e.accept_input(feat[i : i + r])
|
1162 |
+
if hyps is not None:
|
1163 |
+
text = "".join(
|
1164 |
+
[
|
1165 |
+
train_args.char_list[int(x)]
|
1166 |
+
for x in hyps[0]["yseq"][1:-1]
|
1167 |
+
if int(x) != -1
|
1168 |
+
]
|
1169 |
+
)
|
1170 |
+
text = text.replace(
|
1171 |
+
"\u2581", " "
|
1172 |
+
).strip() # for SentencePiece
|
1173 |
+
text = text.replace(model.space, " ")
|
1174 |
+
text = text.replace(model.blank, "")
|
1175 |
+
logging.info(text)
|
1176 |
+
for n in range(args.nbest):
|
1177 |
+
nbest_hyps[n]["yseq"].extend(hyps[n]["yseq"])
|
1178 |
+
nbest_hyps[n]["score"] += hyps[n]["score"]
|
1179 |
+
nbest_hyps = [nbest_hyps]
|
1180 |
+
else:
|
1181 |
+
nbest_hyps = model.recognize_batch(
|
1182 |
+
feats, args, train_args.char_list, rnnlm=rnnlm
|
1183 |
+
)
|
1184 |
+
|
1185 |
+
for i, nbest_hyp in enumerate(nbest_hyps):
|
1186 |
+
name = names[i]
|
1187 |
+
new_js[name] = add_results_to_json(
|
1188 |
+
js[name], nbest_hyp, train_args.char_list
|
1189 |
+
)
|
1190 |
+
|
1191 |
+
with open(args.result_label, "wb") as f:
|
1192 |
+
f.write(
|
1193 |
+
json.dumps(
|
1194 |
+
{"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
|
1195 |
+
).encode("utf_8")
|
1196 |
+
)
|
1197 |
+
|
1198 |
+
|
1199 |
+
def enhance(args):
|
1200 |
+
"""Dumping enhanced speech and mask.
|
1201 |
+
|
1202 |
+
Args:
|
1203 |
+
args (namespace): The program arguments.
|
1204 |
+
"""
|
1205 |
+
set_deterministic_pytorch(args)
|
1206 |
+
# read training config
|
1207 |
+
idim, odim, train_args = get_model_conf(args.model, args.model_conf)
|
1208 |
+
|
1209 |
+
# TODO(ruizhili): implement enhance for multi-encoder model
|
1210 |
+
assert args.num_encs == 1, "number of encoders should be 1 ({} is given)".format(
|
1211 |
+
args.num_encs
|
1212 |
+
)
|
1213 |
+
|
1214 |
+
# load trained model parameters
|
1215 |
+
logging.info("reading model parameters from " + args.model)
|
1216 |
+
model_class = dynamic_import(train_args.model_module)
|
1217 |
+
model = model_class(idim, odim, train_args)
|
1218 |
+
assert isinstance(model, ASRInterface)
|
1219 |
+
torch_load(args.model, model)
|
1220 |
+
model.recog_args = args
|
1221 |
+
|
1222 |
+
# gpu
|
1223 |
+
if args.ngpu == 1:
|
1224 |
+
gpu_id = list(range(args.ngpu))
|
1225 |
+
logging.info("gpu id: " + str(gpu_id))
|
1226 |
+
model.cuda()
|
1227 |
+
|
1228 |
+
# read json data
|
1229 |
+
with open(args.recog_json, "rb") as f:
|
1230 |
+
js = json.load(f)["utts"]
|
1231 |
+
|
1232 |
+
load_inputs_and_targets = LoadInputsAndTargets(
|
1233 |
+
mode="asr",
|
1234 |
+
load_output=False,
|
1235 |
+
sort_in_input_length=False,
|
1236 |
+
preprocess_conf=None, # Apply pre_process in outer func
|
1237 |
+
)
|
1238 |
+
if args.batchsize == 0:
|
1239 |
+
args.batchsize = 1
|
1240 |
+
|
1241 |
+
# Creates writers for outputs from the network
|
1242 |
+
if args.enh_wspecifier is not None:
|
1243 |
+
enh_writer = file_writer_helper(args.enh_wspecifier, filetype=args.enh_filetype)
|
1244 |
+
else:
|
1245 |
+
enh_writer = None
|
1246 |
+
|
1247 |
+
# Creates a Transformation instance
|
1248 |
+
preprocess_conf = (
|
1249 |
+
train_args.preprocess_conf
|
1250 |
+
if args.preprocess_conf is None
|
1251 |
+
else args.preprocess_conf
|
1252 |
+
)
|
1253 |
+
if preprocess_conf is not None:
|
1254 |
+
logging.info(f"Use preprocessing: {preprocess_conf}")
|
1255 |
+
transform = Transformation(preprocess_conf)
|
1256 |
+
else:
|
1257 |
+
transform = None
|
1258 |
+
|
1259 |
+
# Creates a IStft instance
|
1260 |
+
istft = None
|
1261 |
+
frame_shift = args.istft_n_shift # Used for plot the spectrogram
|
1262 |
+
if args.apply_istft:
|
1263 |
+
if preprocess_conf is not None:
|
1264 |
+
# Read the config file and find the stft setting
|
1265 |
+
with open(preprocess_conf) as f:
|
1266 |
+
# Json format: e.g.
|
1267 |
+
# {"process": [{"type": "stft",
|
1268 |
+
# "win_length": 400,
|
1269 |
+
# "n_fft": 512, "n_shift": 160,
|
1270 |
+
# "window": "han"},
|
1271 |
+
# {"type": "foo", ...}, ...]}
|
1272 |
+
conf = json.load(f)
|
1273 |
+
assert "process" in conf, conf
|
1274 |
+
# Find stft setting
|
1275 |
+
for p in conf["process"]:
|
1276 |
+
if p["type"] == "stft":
|
1277 |
+
istft = IStft(
|
1278 |
+
win_length=p["win_length"],
|
1279 |
+
n_shift=p["n_shift"],
|
1280 |
+
window=p.get("window", "hann"),
|
1281 |
+
)
|
1282 |
+
logging.info(
|
1283 |
+
"stft is found in {}. "
|
1284 |
+
"Setting istft config from it\n{}".format(
|
1285 |
+
preprocess_conf, istft
|
1286 |
+
)
|
1287 |
+
)
|
1288 |
+
frame_shift = p["n_shift"]
|
1289 |
+
break
|
1290 |
+
if istft is None:
|
1291 |
+
# Set from command line arguments
|
1292 |
+
istft = IStft(
|
1293 |
+
win_length=args.istft_win_length,
|
1294 |
+
n_shift=args.istft_n_shift,
|
1295 |
+
window=args.istft_window,
|
1296 |
+
)
|
1297 |
+
logging.info(
|
1298 |
+
"Setting istft config from the command line args\n{}".format(istft)
|
1299 |
+
)
|
1300 |
+
|
1301 |
+
# sort data
|
1302 |
+
keys = list(js.keys())
|
1303 |
+
feat_lens = [js[key]["input"][0]["shape"][0] for key in keys]
|
1304 |
+
sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i])
|
1305 |
+
keys = [keys[i] for i in sorted_index]
|
1306 |
+
|
1307 |
+
def grouper(n, iterable, fillvalue=None):
|
1308 |
+
kargs = [iter(iterable)] * n
|
1309 |
+
return zip_longest(*kargs, fillvalue=fillvalue)
|
1310 |
+
|
1311 |
+
num_images = 0
|
1312 |
+
if not os.path.exists(args.image_dir):
|
1313 |
+
os.makedirs(args.image_dir)
|
1314 |
+
|
1315 |
+
for names in grouper(args.batchsize, keys, None):
|
1316 |
+
batch = [(name, js[name]) for name in names]
|
1317 |
+
|
1318 |
+
# May be in time region: (Batch, [Time, Channel])
|
1319 |
+
org_feats = load_inputs_and_targets(batch)[0]
|
1320 |
+
if transform is not None:
|
1321 |
+
# May be in time-freq region: (Batch, [Time, Channel, Freq])
|
1322 |
+
feats = transform(org_feats, train=False)
|
1323 |
+
else:
|
1324 |
+
feats = org_feats
|
1325 |
+
|
1326 |
+
with torch.no_grad():
|
1327 |
+
enhanced, mask, ilens = model.enhance(feats)
|
1328 |
+
|
1329 |
+
for idx, name in enumerate(names):
|
1330 |
+
# Assuming mask, feats: [Batch, Time, Channel, Freq]
|
1331 |
+
# enhanced : [Batch, Time, Freq]
|
1332 |
+
enh = enhanced[idx][: ilens[idx]]
|
1333 |
+
mas = mask[idx][: ilens[idx]]
|
1334 |
+
feat = feats[idx]
|
1335 |
+
|
1336 |
+
# Plot spectrogram
|
1337 |
+
if args.image_dir is not None and num_images < args.num_images:
|
1338 |
+
import matplotlib.pyplot as plt
|
1339 |
+
|
1340 |
+
num_images += 1
|
1341 |
+
ref_ch = 0
|
1342 |
+
|
1343 |
+
plt.figure(figsize=(20, 10))
|
1344 |
+
plt.subplot(4, 1, 1)
|
1345 |
+
plt.title("Mask [ref={}ch]".format(ref_ch))
|
1346 |
+
plot_spectrogram(
|
1347 |
+
plt,
|
1348 |
+
mas[:, ref_ch].T,
|
1349 |
+
fs=args.fs,
|
1350 |
+
mode="linear",
|
1351 |
+
frame_shift=frame_shift,
|
1352 |
+
bottom=False,
|
1353 |
+
labelbottom=False,
|
1354 |
+
)
|
1355 |
+
|
1356 |
+
plt.subplot(4, 1, 2)
|
1357 |
+
plt.title("Noisy speech [ref={}ch]".format(ref_ch))
|
1358 |
+
plot_spectrogram(
|
1359 |
+
plt,
|
1360 |
+
feat[:, ref_ch].T,
|
1361 |
+
fs=args.fs,
|
1362 |
+
mode="db",
|
1363 |
+
frame_shift=frame_shift,
|
1364 |
+
bottom=False,
|
1365 |
+
labelbottom=False,
|
1366 |
+
)
|
1367 |
+
|
1368 |
+
plt.subplot(4, 1, 3)
|
1369 |
+
plt.title("Masked speech [ref={}ch]".format(ref_ch))
|
1370 |
+
plot_spectrogram(
|
1371 |
+
plt,
|
1372 |
+
(feat[:, ref_ch] * mas[:, ref_ch]).T,
|
1373 |
+
frame_shift=frame_shift,
|
1374 |
+
fs=args.fs,
|
1375 |
+
mode="db",
|
1376 |
+
bottom=False,
|
1377 |
+
labelbottom=False,
|
1378 |
+
)
|
1379 |
+
|
1380 |
+
plt.subplot(4, 1, 4)
|
1381 |
+
plt.title("Enhanced speech")
|
1382 |
+
plot_spectrogram(
|
1383 |
+
plt, enh.T, fs=args.fs, mode="db", frame_shift=frame_shift
|
1384 |
+
)
|
1385 |
+
|
1386 |
+
plt.savefig(os.path.join(args.image_dir, name + ".png"))
|
1387 |
+
plt.clf()
|
1388 |
+
|
1389 |
+
# Write enhanced wave files
|
1390 |
+
if enh_writer is not None:
|
1391 |
+
if istft is not None:
|
1392 |
+
enh = istft(enh)
|
1393 |
+
else:
|
1394 |
+
enh = enh
|
1395 |
+
|
1396 |
+
if args.keep_length:
|
1397 |
+
if len(org_feats[idx]) < len(enh):
|
1398 |
+
# Truncate the frames added by stft padding
|
1399 |
+
enh = enh[: len(org_feats[idx])]
|
1400 |
+
elif len(org_feats[idx]) > len(enh):
|
1401 |
+
padwidth = [(0, (len(org_feats[idx]) - len(enh)))] + [
|
1402 |
+
(0, 0)
|
1403 |
+
] * (enh.ndim - 1)
|
1404 |
+
enh = np.pad(enh, padwidth, mode="constant")
|
1405 |
+
|
1406 |
+
if args.enh_filetype in ("sound", "sound.hdf5"):
|
1407 |
+
enh_writer[name] = (args.fs, enh)
|
1408 |
+
else:
|
1409 |
+
# Hint: To dump stft_signal, mask or etc,
|
1410 |
+
# enh_filetype='hdf5' might be convenient.
|
1411 |
+
enh_writer[name] = enh
|
1412 |
+
|
1413 |
+
if num_images >= args.num_images and enh_writer is None:
|
1414 |
+
logging.info("Breaking the process.")
|
1415 |
+
break
|
1416 |
+
|
1417 |
+
|
1418 |
+
def ctc_align(args):
|
1419 |
+
"""CTC forced alignments with the given args.
|
1420 |
+
|
1421 |
+
Args:
|
1422 |
+
args (namespace): The program arguments.
|
1423 |
+
"""
|
1424 |
+
|
1425 |
+
def add_alignment_to_json(js, alignment, char_list):
|
1426 |
+
"""Add N-best results to json.
|
1427 |
+
|
1428 |
+
Args:
|
1429 |
+
js (dict[str, Any]): Groundtruth utterance dict.
|
1430 |
+
alignment (list[int]): List of alignment.
|
1431 |
+
char_list (list[str]): List of characters.
|
1432 |
+
|
1433 |
+
Returns:
|
1434 |
+
dict[str, Any]: Utterance dict with the alignment added.
|
1435 |
+
|
1436 |
+
"""
|
1437 |
+
# create a new dict for the alignment info
|
1438 |
+
new_js = dict()
|
1439 |
+
new_js["ctc_alignment"] = []
|
1440 |
+
|
1441 |
+
alignment_tokens = []
|
1442 |
+
for idx, a in enumerate(alignment):
|
1443 |
+
alignment_tokens.append(char_list[a])
|
1444 |
+
alignment_tokens = " ".join(alignment_tokens)
|
1445 |
+
|
1446 |
+
new_js["ctc_alignment"] = alignment_tokens
|
1447 |
+
|
1448 |
+
return new_js
|
1449 |
+
|
1450 |
+
set_deterministic_pytorch(args)
|
1451 |
+
model, train_args = load_trained_model(args.model)
|
1452 |
+
assert isinstance(model, ASRInterface)
|
1453 |
+
model.eval()
|
1454 |
+
|
1455 |
+
load_inputs_and_targets = LoadInputsAndTargets(
|
1456 |
+
mode="asr",
|
1457 |
+
load_output=True,
|
1458 |
+
sort_in_input_length=False,
|
1459 |
+
preprocess_conf=train_args.preprocess_conf
|
1460 |
+
if args.preprocess_conf is None
|
1461 |
+
else args.preprocess_conf,
|
1462 |
+
preprocess_args={"train": False},
|
1463 |
+
)
|
1464 |
+
|
1465 |
+
if args.ngpu > 1:
|
1466 |
+
raise NotImplementedError("only single GPU decoding is supported")
|
1467 |
+
if args.ngpu == 1:
|
1468 |
+
device = "cuda"
|
1469 |
+
else:
|
1470 |
+
device = "cpu"
|
1471 |
+
dtype = getattr(torch, args.dtype)
|
1472 |
+
logging.info(f"Decoding device={device}, dtype={dtype}")
|
1473 |
+
model.to(device=device, dtype=dtype).eval()
|
1474 |
+
|
1475 |
+
# read json data
|
1476 |
+
with open(args.align_json, "rb") as f:
|
1477 |
+
js = json.load(f)["utts"]
|
1478 |
+
new_js = {}
|
1479 |
+
if args.batchsize == 0:
|
1480 |
+
with torch.no_grad():
|
1481 |
+
for idx, name in enumerate(js.keys(), 1):
|
1482 |
+
logging.info("(%d/%d) aligning " + name, idx, len(js.keys()))
|
1483 |
+
batch = [(name, js[name])]
|
1484 |
+
feat, label = load_inputs_and_targets(batch)
|
1485 |
+
feat = feat[0]
|
1486 |
+
label = label[0]
|
1487 |
+
enc = model.encode(torch.as_tensor(feat).to(device)).unsqueeze(0)
|
1488 |
+
alignment = model.ctc.forced_align(enc, label)
|
1489 |
+
new_js[name] = add_alignment_to_json(
|
1490 |
+
js[name], alignment, train_args.char_list
|
1491 |
+
)
|
1492 |
+
else:
|
1493 |
+
raise NotImplementedError("Align_batch is not implemented.")
|
1494 |
+
|
1495 |
+
with open(args.result_label, "wb") as f:
|
1496 |
+
f.write(
|
1497 |
+
json.dumps(
|
1498 |
+
{"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
|
1499 |
+
).encode("utf_8")
|
1500 |
+
)
|
espnet/asr/pytorch_backend/asr_init.py
ADDED
@@ -0,0 +1,282 @@
1 |
+
"""Finetuning methods."""
|
2 |
+
|
3 |
+
import logging
|
4 |
+
import os
|
5 |
+
import torch
|
6 |
+
|
7 |
+
from collections import OrderedDict
|
8 |
+
|
9 |
+
from espnet.asr.asr_utils import get_model_conf
|
10 |
+
from espnet.asr.asr_utils import torch_load
|
11 |
+
from espnet.nets.asr_interface import ASRInterface
|
12 |
+
from espnet.nets.mt_interface import MTInterface
|
13 |
+
from espnet.nets.pytorch_backend.transducer.utils import custom_torch_load
|
14 |
+
from espnet.nets.tts_interface import TTSInterface
|
15 |
+
from espnet.utils.dynamic_import import dynamic_import
|
16 |
+
|
17 |
+
|
18 |
+
def freeze_modules(model, modules):
|
19 |
+
"""Freeze model parameters according to modules list.
|
20 |
+
|
21 |
+
Args:
|
22 |
+
model (torch.nn.Module): main model to update
|
23 |
+
modules (list): specified module list for freezing
|
24 |
+
|
25 |
+
Return:
|
26 |
+
model (torch.nn.Module): updated model
|
27 |
+
model_params (filter): filtered model parameters
|
28 |
+
|
29 |
+
"""
|
30 |
+
for mod, param in model.named_parameters():
|
31 |
+
if any(mod.startswith(m) for m in modules):
|
32 |
+
logging.info(f"freezing {mod}, it will not be updated.")
|
33 |
+
param.requires_grad = False
|
34 |
+
|
35 |
+
model_params = filter(lambda x: x.requires_grad, model.parameters())
|
36 |
+
|
37 |
+
return model, model_params
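# --- Illustrative usage sketch (the module prefix "enc." is hypothetical) ---
#   model, model_params = freeze_modules(model, ["enc."])
#   optimizer = torch.optim.Adam(model_params, lr=1e-3)
# Note that model_params is a filter object, so it should be consumed once
# (e.g. handed directly to the optimizer) rather than iterated repeatedly.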
|
38 |
+
|
39 |
+
|
40 |
+
def transfer_verification(model_state_dict, partial_state_dict, modules):
|
41 |
+
"""Verify tuples (key, shape) for input model modules match specified modules.
|
42 |
+
|
43 |
+
Args:
|
44 |
+
model_state_dict (OrderedDict): the initial model state_dict
|
45 |
+
partial_state_dict (OrderedDict): the trained model state_dict
|
46 |
+
modules (list): specified module list for transfer
|
47 |
+
|
48 |
+
Return:
|
49 |
+
(boolean): allow transfer
|
50 |
+
|
51 |
+
"""
|
52 |
+
modules_model = []
|
53 |
+
partial_modules = []
|
54 |
+
|
55 |
+
for key_p, value_p in partial_state_dict.items():
|
56 |
+
if any(key_p.startswith(m) for m in modules):
|
57 |
+
partial_modules += [(key_p, value_p.shape)]
|
58 |
+
|
59 |
+
for key_m, value_m in model_state_dict.items():
|
60 |
+
if any(key_m.startswith(m) for m in modules):
|
61 |
+
modules_model += [(key_m, value_m.shape)]
|
62 |
+
|
63 |
+
len_match = len(modules_model) == len(partial_modules)
|
64 |
+
|
65 |
+
module_match = sorted(modules_model, key=lambda x: (x[0], x[1])) == sorted(
|
66 |
+
partial_modules, key=lambda x: (x[0], x[1])
|
67 |
+
)
|
68 |
+
|
69 |
+
return len_match and module_match
|
70 |
+
|
71 |
+
|
72 |
+
def get_partial_state_dict(model_state_dict, modules):
|
73 |
+
"""Create state_dict with specified modules matching input model modules.
|
74 |
+
|
75 |
+
Note that get_partial_lm_state_dict is used if an LM is specified.
|
76 |
+
|
77 |
+
Args:
|
78 |
+
model_state_dict (OrderedDict): trained model state_dict
|
79 |
+
modules (list): specified module list for transfer
|
80 |
+
|
81 |
+
Return:
|
82 |
+
new_state_dict (OrderedDict): the updated state_dict
|
83 |
+
|
84 |
+
"""
|
85 |
+
new_state_dict = OrderedDict()
|
86 |
+
|
87 |
+
for key, value in model_state_dict.items():
|
88 |
+
if any(key.startswith(m) for m in modules):
|
89 |
+
new_state_dict[key] = value
|
90 |
+
|
91 |
+
return new_state_dict
|
92 |
+
|
93 |
+
|
94 |
+
def get_lm_state_dict(lm_state_dict):
|
95 |
+
"""Create compatible ASR decoder state dict from LM state dict.
|
96 |
+
|
97 |
+
Args:
|
98 |
+
lm_state_dict (OrderedDict): pre-trained LM state_dict
|
99 |
+
|
100 |
+
Return:
|
101 |
+
new_state_dict (OrderedDict): LM state_dict with updated keys
|
102 |
+
|
103 |
+
"""
|
104 |
+
new_state_dict = OrderedDict()
|
105 |
+
|
106 |
+
for key, value in list(lm_state_dict.items()):
|
107 |
+
if key == "predictor.embed.weight":
|
108 |
+
new_state_dict["dec.embed.weight"] = value
|
109 |
+
elif key.startswith("predictor.rnn."):
|
110 |
+
_split = key.split(".")
|
111 |
+
|
112 |
+
new_key = "dec.decoder." + _split[2] + "." + _split[3] + "_l0"
|
113 |
+
new_state_dict[new_key] = value
|
114 |
+
|
115 |
+
return new_state_dict
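# --- Illustrative key mapping (hypothetical RNN layer index) ---
#   "predictor.embed.weight"     -> "dec.embed.weight"
#   "predictor.rnn.0.weight_ih"  -> "dec.decoder.0.weight_ih_l0"
# i.e. the pre-trained LM embedding/RNN weights are renamed so they can be
# loaded into the attention decoder of the ASR model.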
|
116 |
+
|
117 |
+
|
118 |
+
def filter_modules(model_state_dict, modules):
|
119 |
+
"""Filter non-matched modules in module_state_dict.
|
120 |
+
|
121 |
+
Args:
|
122 |
+
model_state_dict (OrderedDict): trained model state_dict
|
123 |
+
modules (list): specified module list for transfer
|
124 |
+
|
125 |
+
Return:
|
126 |
+
new_mods (list): the updated module list
|
127 |
+
|
128 |
+
"""
|
129 |
+
new_mods = []
|
130 |
+
incorrect_mods = []
|
131 |
+
|
132 |
+
mods_model = list(model_state_dict.keys())
|
133 |
+
for mod in modules:
|
134 |
+
if any(key.startswith(mod) for key in mods_model):
|
135 |
+
new_mods += [mod]
|
136 |
+
else:
|
137 |
+
incorrect_mods += [mod]
|
138 |
+
|
139 |
+
if incorrect_mods:
|
140 |
+
logging.warning(
|
141 |
+
"module(s) %s don't match or (partially match) "
|
142 |
+
"available modules in model.",
|
143 |
+
incorrect_mods,
|
144 |
+
)
|
145 |
+
logging.warning("for information, the existing modules in model are:")
|
146 |
+
logging.warning("%s", mods_model)
|
147 |
+
|
148 |
+
return new_mods
|
149 |
+
|
150 |
+
|
151 |
+
def load_trained_model(model_path, training=True):
|
152 |
+
"""Load the trained model for recognition.
|
153 |
+
|
154 |
+
Args:
|
155 |
+
model_path (str): Path to model.***.best
|
156 |
+
|
157 |
+
"""
|
158 |
+
idim, odim, train_args = get_model_conf(
|
159 |
+
model_path, os.path.join(os.path.dirname(model_path), "model.json")
|
160 |
+
)
|
161 |
+
|
162 |
+
logging.warning("reading model parameters from " + model_path)
|
163 |
+
|
164 |
+
if hasattr(train_args, "model_module"):
|
165 |
+
model_module = train_args.model_module
|
166 |
+
else:
|
167 |
+
model_module = "espnet.nets.pytorch_backend.e2e_asr:E2E"
|
168 |
+
# CTC Loss is not needed, default to builtin to prevent import errors
|
169 |
+
if hasattr(train_args, "ctc_type"):
|
170 |
+
train_args.ctc_type = "builtin"
|
171 |
+
|
172 |
+
model_class = dynamic_import(model_module)
|
173 |
+
|
174 |
+
if "transducer" in model_module:
|
175 |
+
model = model_class(idim, odim, train_args, training=training)
|
176 |
+
custom_torch_load(model_path, model, training=training)
|
177 |
+
else:
|
178 |
+
model = model_class(idim, odim, train_args)
|
179 |
+
torch_load(model_path, model)
|
180 |
+
|
181 |
+
return model, train_args
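# --- Illustrative usage sketch (the model path is hypothetical) ---
#   model, train_args = load_trained_model("exp/train/results/model.acc.best")
#   model.eval()
# The companion model.json is located automatically in the same directory as
# the snapshot via get_model_conf.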
|
182 |
+
|
183 |
+
|
184 |
+
def get_trained_model_state_dict(model_path):
|
185 |
+
"""Extract the trained model state dict for pre-initialization.
|
186 |
+
|
187 |
+
Args:
|
188 |
+
model_path (str): Path to model.***.best
|
189 |
+
|
190 |
+
Return:
|
191 |
+
model.state_dict() (OrderedDict): the loaded model state_dict
|
192 |
+
(bool): Boolean defining whether the model is an LM
|
193 |
+
|
194 |
+
"""
|
195 |
+
conf_path = os.path.join(os.path.dirname(model_path), "model.json")
|
196 |
+
if "rnnlm" in model_path:
|
197 |
+
logging.warning("reading model parameters from %s", model_path)
|
198 |
+
|
199 |
+
return get_lm_state_dict(torch.load(model_path))
|
200 |
+
|
201 |
+
idim, odim, args = get_model_conf(model_path, conf_path)
|
202 |
+
|
203 |
+
logging.warning("reading model parameters from " + model_path)
|
204 |
+
|
205 |
+
if hasattr(args, "model_module"):
|
206 |
+
model_module = args.model_module
|
207 |
+
else:
|
208 |
+
model_module = "espnet.nets.pytorch_backend.e2e_asr:E2E"
|
209 |
+
|
210 |
+
model_class = dynamic_import(model_module)
|
211 |
+
model = model_class(idim, odim, args)
|
212 |
+
torch_load(model_path, model)
|
213 |
+
assert (
|
214 |
+
isinstance(model, MTInterface)
|
215 |
+
or isinstance(model, ASRInterface)
|
216 |
+
or isinstance(model, TTSInterface)
|
217 |
+
)
|
218 |
+
|
219 |
+
return model.state_dict()
|
220 |
+
|
221 |
+
|
222 |
+
def load_trained_modules(idim, odim, args, interface=ASRInterface):
|
223 |
+
"""Load model encoder or/and decoder modules with ESPNET pre-trained model(s).
|
224 |
+
|
225 |
+
Args:
|
226 |
+
idim (int): initial input dimension.
|
227 |
+
odim (int): initial output dimension.
|
228 |
+
args (Namespace): The initial model arguments.
|
229 |
+
interface (Interface): ASRInterface or STInterface or TTSInterface.
|
230 |
+
|
231 |
+
Return:
|
232 |
+
model (torch.nn.Module): The model with pretrained modules.
|
233 |
+
|
234 |
+
"""
|
235 |
+
|
236 |
+
def print_new_keys(state_dict, modules, model_path):
|
237 |
+
logging.warning("loading %s from model: %s", modules, model_path)
|
238 |
+
|
239 |
+
for k in state_dict.keys():
|
240 |
+
logging.warning("override %s" % k)
|
241 |
+
|
242 |
+
enc_model_path = args.enc_init
|
243 |
+
dec_model_path = args.dec_init
|
244 |
+
enc_modules = args.enc_init_mods
|
245 |
+
dec_modules = args.dec_init_mods
|
246 |
+
|
247 |
+
model_class = dynamic_import(args.model_module)
|
248 |
+
main_model = model_class(idim, odim, args)
|
249 |
+
assert isinstance(main_model, interface)
|
250 |
+
|
251 |
+
main_state_dict = main_model.state_dict()
|
252 |
+
|
253 |
+
logging.warning("model(s) found for pre-initialization")
|
254 |
+
for model_path, modules in [
|
255 |
+
(enc_model_path, enc_modules),
|
256 |
+
(dec_model_path, dec_modules),
|
257 |
+
]:
|
258 |
+
if model_path is not None:
|
259 |
+
if os.path.isfile(model_path):
|
260 |
+
model_state_dict = get_trained_model_state_dict(model_path)
|
261 |
+
|
262 |
+
modules = filter_modules(model_state_dict, modules)
|
263 |
+
|
264 |
+
partial_state_dict = get_partial_state_dict(model_state_dict, modules)
|
265 |
+
|
266 |
+
if partial_state_dict:
|
267 |
+
if transfer_verification(
|
268 |
+
main_state_dict, partial_state_dict, modules
|
269 |
+
):
|
270 |
+
print_new_keys(partial_state_dict, modules, model_path)
|
271 |
+
main_state_dict.update(partial_state_dict)
|
272 |
+
else:
|
273 |
+
logging.warning(
|
274 |
+
f"modules {modules} in model {model_path} "
|
275 |
+
f"don't match your training config",
|
276 |
+
)
|
277 |
+
else:
|
278 |
+
logging.warning("model was not found : %s", model_path)
|
279 |
+
|
280 |
+
main_model.load_state_dict(main_state_dict)
|
281 |
+
|
282 |
+
return main_model
|
espnet/asr/pytorch_backend/asr_mix.py
ADDED
@@ -0,0 +1,654 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
"""
|
4 |
+
This script is used for multi-speaker speech recognition.
|
5 |
+
|
6 |
+
Copyright 2017 Johns Hopkins University (Shinji Watanabe)
|
7 |
+
Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
8 |
+
"""
|
9 |
+
import json
|
10 |
+
import logging
|
11 |
+
import os
|
12 |
+
|
13 |
+
# chainer related
|
14 |
+
from chainer import training
|
15 |
+
from chainer.training import extensions
|
16 |
+
from itertools import zip_longest as zip_longest
|
17 |
+
import numpy as np
|
18 |
+
from tensorboardX import SummaryWriter
|
19 |
+
import torch
|
20 |
+
|
21 |
+
from espnet.asr.asr_mix_utils import add_results_to_json
|
22 |
+
from espnet.asr.asr_utils import adadelta_eps_decay
|
23 |
+
|
24 |
+
from espnet.asr.asr_utils import CompareValueTrigger
|
25 |
+
from espnet.asr.asr_utils import get_model_conf
|
26 |
+
from espnet.asr.asr_utils import restore_snapshot
|
27 |
+
from espnet.asr.asr_utils import snapshot_object
|
28 |
+
from espnet.asr.asr_utils import torch_load
|
29 |
+
from espnet.asr.asr_utils import torch_resume
|
30 |
+
from espnet.asr.asr_utils import torch_snapshot
|
31 |
+
from espnet.asr.pytorch_backend.asr import CustomEvaluator
|
32 |
+
from espnet.asr.pytorch_backend.asr import CustomUpdater
|
33 |
+
from espnet.asr.pytorch_backend.asr import load_trained_model
|
34 |
+
import espnet.lm.pytorch_backend.extlm as extlm_pytorch
|
35 |
+
from espnet.nets.asr_interface import ASRInterface
|
36 |
+
from espnet.nets.pytorch_backend.e2e_asr_mix import pad_list
|
37 |
+
import espnet.nets.pytorch_backend.lm.default as lm_pytorch
|
38 |
+
from espnet.utils.dataset import ChainerDataLoader
|
39 |
+
from espnet.utils.dataset import TransformDataset
|
40 |
+
from espnet.utils.deterministic_utils import set_deterministic_pytorch
|
41 |
+
from espnet.utils.dynamic_import import dynamic_import
|
42 |
+
from espnet.utils.io_utils import LoadInputsAndTargets
|
43 |
+
from espnet.utils.training.batchfy import make_batchset
|
44 |
+
from espnet.utils.training.iterators import ShufflingEnabler
|
45 |
+
from espnet.utils.training.tensorboard_logger import TensorboardLogger
|
46 |
+
from espnet.utils.training.train_utils import check_early_stop
|
47 |
+
from espnet.utils.training.train_utils import set_early_stop
|
48 |
+
|
49 |
+
import matplotlib
|
50 |
+
|
51 |
+
matplotlib.use("Agg")
|
52 |
+
|
53 |
+
|
54 |
+
class CustomConverter(object):
|
55 |
+
"""Custom batch converter for Pytorch.
|
56 |
+
|
57 |
+
Args:
|
58 |
+
subsampling_factor (int): The subsampling factor.
|
59 |
+
dtype (torch.dtype): Data type to convert.
|
60 |
+
|
61 |
+
"""
|
62 |
+
|
63 |
+
def __init__(self, subsampling_factor=1, dtype=torch.float32, num_spkrs=2):
|
64 |
+
"""Initialize the converter."""
|
65 |
+
self.subsampling_factor = subsampling_factor
|
66 |
+
self.ignore_id = -1
|
67 |
+
self.dtype = dtype
|
68 |
+
self.num_spkrs = num_spkrs
|
69 |
+
|
70 |
+
def __call__(self, batch, device=torch.device("cpu")):
|
71 |
+
"""Transform a batch and send it to a device.
|
72 |
+
|
73 |
+
Args:
|
74 |
+
batch (list(tuple(str, dict[str, dict[str, Any]]))): The batch to transform.
|
75 |
+
device (torch.device): The device to send to.
|
76 |
+
|
77 |
+
Returns:
|
78 |
+
tuple(torch.Tensor, torch.Tensor, torch.Tensor): Transformed batch.
|
79 |
+
|
80 |
+
"""
|
81 |
+
# batch should be located in list
|
82 |
+
assert len(batch) == 1
|
83 |
+
xs, ys = batch[0][0], batch[0][-self.num_spkrs :]
|
84 |
+
|
85 |
+
# perform subsampling
|
86 |
+
if self.subsampling_factor > 1:
|
87 |
+
xs = [x[:: self.subsampling_factor, :] for x in xs]
|
88 |
+
|
89 |
+
# get batch of lengths of input sequences
|
90 |
+
ilens = np.array([x.shape[0] for x in xs])
|
91 |
+
|
92 |
+
# perform padding and convert to tensor
|
93 |
+
# currently only real numbers are supported directly; complex inputs are split below
|
94 |
+
if xs[0].dtype.kind == "c":
|
95 |
+
xs_pad_real = pad_list(
|
96 |
+
[torch.from_numpy(x.real).float() for x in xs], 0
|
97 |
+
).to(device, dtype=self.dtype)
|
98 |
+
xs_pad_imag = pad_list(
|
99 |
+
[torch.from_numpy(x.imag).float() for x in xs], 0
|
100 |
+
).to(device, dtype=self.dtype)
|
101 |
+
# Note(kamo):
|
102 |
+
# {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
|
103 |
+
# Don't create ComplexTensor and give it to E2E here
|
104 |
+
# because torch.nn.DataParallel can't handle it.
|
105 |
+
xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}
|
106 |
+
else:
|
107 |
+
xs_pad = pad_list([torch.from_numpy(x).float() for x in xs], 0).to(
|
108 |
+
device, dtype=self.dtype
|
109 |
+
)
|
110 |
+
|
111 |
+
ilens = torch.from_numpy(ilens).to(device)
|
112 |
+
if not isinstance(ys[0], np.ndarray):
|
113 |
+
ys_pad = []
|
114 |
+
for i in range(len(ys)): # speakers
|
115 |
+
ys_pad += [torch.from_numpy(y).long() for y in ys[i]]
|
116 |
+
ys_pad = pad_list(ys_pad, self.ignore_id)
|
117 |
+
ys_pad = (
|
118 |
+
ys_pad.view(self.num_spkrs, -1, ys_pad.size(1))
|
119 |
+
.transpose(0, 1)
|
120 |
+
.to(device)
|
121 |
+
) # (B, num_spkrs, Tmax)
|
122 |
+
else:
|
123 |
+
ys_pad = pad_list(
|
124 |
+
[torch.from_numpy(y).long() for y in ys], self.ignore_id
|
125 |
+
).to(device)
|
126 |
+
|
127 |
+
return xs_pad, ilens, ys_pad
|
128 |
+
|
129 |
+
|
130 |
+
def train(args):
|
131 |
+
"""Train with the given args.
|
132 |
+
|
133 |
+
Args:
|
134 |
+
args (namespace): The program arguments.
|
135 |
+
|
136 |
+
"""
|
137 |
+
set_deterministic_pytorch(args)
|
138 |
+
|
139 |
+
# check cuda availability
|
140 |
+
if not torch.cuda.is_available():
|
141 |
+
logging.warning("cuda is not available")
|
142 |
+
|
143 |
+
# get input and output dimension info
|
144 |
+
with open(args.valid_json, "rb") as f:
|
145 |
+
valid_json = json.load(f)["utts"]
|
146 |
+
utts = list(valid_json.keys())
|
147 |
+
idim = int(valid_json[utts[0]]["input"][0]["shape"][-1])
|
148 |
+
odim = int(valid_json[utts[0]]["output"][0]["shape"][-1])
|
149 |
+
logging.info("#input dims : " + str(idim))
|
150 |
+
logging.info("#output dims: " + str(odim))
|
151 |
+
|
152 |
+
# specify attention, CTC, hybrid mode
|
153 |
+
if args.mtlalpha == 1.0:
|
154 |
+
mtl_mode = "ctc"
|
155 |
+
logging.info("Pure CTC mode")
|
156 |
+
elif args.mtlalpha == 0.0:
|
157 |
+
mtl_mode = "att"
|
158 |
+
logging.info("Pure attention mode")
|
159 |
+
else:
|
160 |
+
mtl_mode = "mtl"
|
161 |
+
logging.info("Multitask learning mode")
|
162 |
+
|
163 |
+
# specify model architecture
|
164 |
+
model_class = dynamic_import(args.model_module)
|
165 |
+
model = model_class(idim, odim, args)
|
166 |
+
assert isinstance(model, ASRInterface)
|
167 |
+
subsampling_factor = model.subsample[0]
|
168 |
+
|
169 |
+
if args.rnnlm is not None:
|
170 |
+
rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
|
171 |
+
rnnlm = lm_pytorch.ClassifierWithState(
|
172 |
+
lm_pytorch.RNNLM(
|
173 |
+
len(args.char_list),
|
174 |
+
rnnlm_args.layer,
|
175 |
+
rnnlm_args.unit,
|
176 |
+
getattr(rnnlm_args, "embed_unit", None), # for backward compatibility
|
177 |
+
)
|
178 |
+
)
|
179 |
+
torch_load(args.rnnlm, rnnlm)
|
180 |
+
model.rnnlm = rnnlm
|
181 |
+
|
182 |
+
# write model config
|
183 |
+
if not os.path.exists(args.outdir):
|
184 |
+
os.makedirs(args.outdir)
|
185 |
+
model_conf = args.outdir + "/model.json"
|
186 |
+
with open(model_conf, "wb") as f:
|
187 |
+
logging.info("writing a model config file to " + model_conf)
|
188 |
+
f.write(
|
189 |
+
json.dumps(
|
190 |
+
(idim, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True
|
191 |
+
).encode("utf_8")
|
192 |
+
)
|
193 |
+
for key in sorted(vars(args).keys()):
|
194 |
+
logging.info("ARGS: " + key + ": " + str(vars(args)[key]))
|
195 |
+
|
196 |
+
reporter = model.reporter
|
197 |
+
|
198 |
+
# check the use of multi-gpu
|
199 |
+
if args.ngpu > 1:
|
200 |
+
if args.batch_size != 0:
|
201 |
+
logging.warning(
|
202 |
+
"batch size is automatically increased (%d -> %d)"
|
203 |
+
% (args.batch_size, args.batch_size * args.ngpu)
|
204 |
+
)
|
205 |
+
args.batch_size *= args.ngpu
|
206 |
+
|
207 |
+
# set torch device
|
208 |
+
device = torch.device("cuda" if args.ngpu > 0 else "cpu")
|
209 |
+
if args.train_dtype in ("float16", "float32", "float64"):
|
210 |
+
dtype = getattr(torch, args.train_dtype)
|
211 |
+
else:
|
212 |
+
dtype = torch.float32
|
213 |
+
model = model.to(device=device, dtype=dtype)
|
214 |
+
|
215 |
+
logging.warning(
|
216 |
+
"num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
|
217 |
+
sum(p.numel() for p in model.parameters()),
|
218 |
+
sum(p.numel() for p in model.parameters() if p.requires_grad),
|
219 |
+
sum(p.numel() for p in model.parameters() if p.requires_grad)
|
220 |
+
* 100.0
|
221 |
+
/ sum(p.numel() for p in model.parameters()),
|
222 |
+
)
|
223 |
+
)
|
224 |
+
|
225 |
+
# Setup an optimizer
|
226 |
+
if args.opt == "adadelta":
|
227 |
+
optimizer = torch.optim.Adadelta(
|
228 |
+
model.parameters(), rho=0.95, eps=args.eps, weight_decay=args.weight_decay
|
229 |
+
)
|
230 |
+
elif args.opt == "adam":
|
231 |
+
optimizer = torch.optim.Adam(model.parameters(), weight_decay=args.weight_decay)
|
232 |
+
elif args.opt == "noam":
|
233 |
+
from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt
|
234 |
+
|
235 |
+
optimizer = get_std_opt(
|
236 |
+
model.parameters(),
|
237 |
+
args.adim,
|
238 |
+
args.transformer_warmup_steps,
|
239 |
+
args.transformer_lr,
|
240 |
+
)
|
241 |
+
else:
|
242 |
+
raise NotImplementedError("unknown optimizer: " + args.opt)
|
243 |
+
|
244 |
+
# setup apex.amp
|
245 |
+
if args.train_dtype in ("O0", "O1", "O2", "O3"):
|
246 |
+
try:
|
247 |
+
from apex import amp
|
248 |
+
except ImportError as e:
|
249 |
+
logging.error(
|
250 |
+
f"You need to install apex for --train-dtype {args.train_dtype}. "
|
251 |
+
"See https://github.com/NVIDIA/apex#linux"
|
252 |
+
)
|
253 |
+
raise e
|
254 |
+
if args.opt == "noam":
|
255 |
+
model, optimizer.optimizer = amp.initialize(
|
256 |
+
model, optimizer.optimizer, opt_level=args.train_dtype
|
257 |
+
)
|
258 |
+
else:
|
259 |
+
model, optimizer = amp.initialize(
|
260 |
+
model, optimizer, opt_level=args.train_dtype
|
261 |
+
)
|
262 |
+
use_apex = True
|
263 |
+
else:
|
264 |
+
use_apex = False
|
265 |
+
|
266 |
+
# FIXME: TOO DIRTY HACK
|
267 |
+
setattr(optimizer, "target", reporter)
|
268 |
+
setattr(optimizer, "serialize", lambda s: reporter.serialize(s))
|
269 |
+
|
270 |
+
# Setup a converter
|
271 |
+
converter = CustomConverter(
|
272 |
+
subsampling_factor=subsampling_factor, dtype=dtype, num_spkrs=args.num_spkrs
|
273 |
+
)
|
274 |
+
|
275 |
+
# read json data
|
276 |
+
with open(args.train_json, "rb") as f:
|
277 |
+
train_json = json.load(f)["utts"]
|
278 |
+
with open(args.valid_json, "rb") as f:
|
279 |
+
valid_json = json.load(f)["utts"]
|
280 |
+
|
281 |
+
use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
|
282 |
+
# make minibatch list (variable length)
|
283 |
+
train = make_batchset(
|
284 |
+
train_json,
|
285 |
+
args.batch_size,
|
286 |
+
args.maxlen_in,
|
287 |
+
args.maxlen_out,
|
288 |
+
args.minibatches,
|
289 |
+
min_batch_size=args.ngpu if args.ngpu > 1 else 1,
|
290 |
+
shortest_first=use_sortagrad,
|
291 |
+
count=args.batch_count,
|
292 |
+
batch_bins=args.batch_bins,
|
293 |
+
batch_frames_in=args.batch_frames_in,
|
294 |
+
batch_frames_out=args.batch_frames_out,
|
295 |
+
batch_frames_inout=args.batch_frames_inout,
|
296 |
+
iaxis=0,
|
297 |
+
oaxis=-1,
|
298 |
+
)
|
299 |
+
valid = make_batchset(
|
300 |
+
valid_json,
|
301 |
+
args.batch_size,
|
302 |
+
args.maxlen_in,
|
303 |
+
args.maxlen_out,
|
304 |
+
args.minibatches,
|
305 |
+
min_batch_size=args.ngpu if args.ngpu > 1 else 1,
|
306 |
+
count=args.batch_count,
|
307 |
+
batch_bins=args.batch_bins,
|
308 |
+
batch_frames_in=args.batch_frames_in,
|
309 |
+
batch_frames_out=args.batch_frames_out,
|
310 |
+
batch_frames_inout=args.batch_frames_inout,
|
311 |
+
iaxis=0,
|
312 |
+
oaxis=-1,
|
313 |
+
)
|
314 |
+
|
315 |
+
load_tr = LoadInputsAndTargets(
|
316 |
+
mode="asr",
|
317 |
+
load_output=True,
|
318 |
+
preprocess_conf=args.preprocess_conf,
|
319 |
+
preprocess_args={"train": True}, # Switch the mode of preprocessing
|
320 |
+
)
|
321 |
+
load_cv = LoadInputsAndTargets(
|
322 |
+
mode="asr",
|
323 |
+
load_output=True,
|
324 |
+
preprocess_conf=args.preprocess_conf,
|
325 |
+
preprocess_args={"train": False}, # Switch the mode of preprocessing
|
326 |
+
)
|
327 |
+
# hack to make the batchsize argument 1
|
328 |
+
# the actual batch size is included in a list
|
329 |
+
# the default collate function converts numpy arrays to pytorch tensors
|
330 |
+
# we use an empty collate function instead, which returns a list
|
331 |
+
train_iter = {
|
332 |
+
"main": ChainerDataLoader(
|
333 |
+
dataset=TransformDataset(train, lambda data: converter([load_tr(data)])),
|
334 |
+
batch_size=1,
|
335 |
+
num_workers=args.n_iter_processes,
|
336 |
+
shuffle=True,
|
337 |
+
collate_fn=lambda x: x[0],
|
338 |
+
)
|
339 |
+
}
|
340 |
+
valid_iter = {
|
341 |
+
"main": ChainerDataLoader(
|
342 |
+
dataset=TransformDataset(valid, lambda data: converter([load_cv(data)])),
|
343 |
+
batch_size=1,
|
344 |
+
shuffle=False,
|
345 |
+
collate_fn=lambda x: x[0],
|
346 |
+
num_workers=args.n_iter_processes,
|
347 |
+
)
|
348 |
+
}
|
349 |
+
|
350 |
+
# Set up a trainer
|
351 |
+
updater = CustomUpdater(
|
352 |
+
model,
|
353 |
+
args.grad_clip,
|
354 |
+
train_iter,
|
355 |
+
optimizer,
|
356 |
+
device,
|
357 |
+
args.ngpu,
|
358 |
+
args.grad_noise,
|
359 |
+
args.accum_grad,
|
360 |
+
use_apex=use_apex,
|
361 |
+
)
|
362 |
+
trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir)
|
363 |
+
|
364 |
+
if use_sortagrad:
|
365 |
+
trainer.extend(
|
366 |
+
ShufflingEnabler([train_iter]),
|
367 |
+
trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"),
|
368 |
+
)
|
369 |
+
|
370 |
+
# Resume from a snapshot
|
371 |
+
if args.resume:
|
372 |
+
logging.info("resumed from %s" % args.resume)
|
373 |
+
torch_resume(args.resume, trainer)
|
374 |
+
|
375 |
+
# Evaluate the model with the test dataset for each epoch
|
376 |
+
trainer.extend(CustomEvaluator(model, valid_iter, reporter, device, args.ngpu))
|
377 |
+
|
378 |
+
# Save attention weight each epoch
|
379 |
+
if args.num_save_attention > 0 and args.mtlalpha != 1.0:
|
380 |
+
data = sorted(
|
381 |
+
list(valid_json.items())[: args.num_save_attention],
|
382 |
+
key=lambda x: int(x[1]["input"][0]["shape"][1]),
|
383 |
+
reverse=True,
|
384 |
+
)
|
385 |
+
if hasattr(model, "module"):
|
386 |
+
att_vis_fn = model.module.calculate_all_attentions
|
387 |
+
plot_class = model.module.attention_plot_class
|
388 |
+
else:
|
389 |
+
att_vis_fn = model.calculate_all_attentions
|
390 |
+
plot_class = model.attention_plot_class
|
391 |
+
att_reporter = plot_class(
|
392 |
+
att_vis_fn,
|
393 |
+
data,
|
394 |
+
args.outdir + "/att_ws",
|
395 |
+
converter=converter,
|
396 |
+
transform=load_cv,
|
397 |
+
device=device,
|
398 |
+
)
|
399 |
+
trainer.extend(att_reporter, trigger=(1, "epoch"))
|
400 |
+
else:
|
401 |
+
att_reporter = None
|
402 |
+
|
403 |
+
# Make a plot for training and validation values
|
404 |
+
trainer.extend(
|
405 |
+
extensions.PlotReport(
|
406 |
+
[
|
407 |
+
"main/loss",
|
408 |
+
"validation/main/loss",
|
409 |
+
"main/loss_ctc",
|
410 |
+
"validation/main/loss_ctc",
|
411 |
+
"main/loss_att",
|
412 |
+
"validation/main/loss_att",
|
413 |
+
],
|
414 |
+
"epoch",
|
415 |
+
file_name="loss.png",
|
416 |
+
)
|
417 |
+
)
|
418 |
+
trainer.extend(
|
419 |
+
extensions.PlotReport(
|
420 |
+
["main/acc", "validation/main/acc"], "epoch", file_name="acc.png"
|
421 |
+
)
|
422 |
+
)
|
423 |
+
trainer.extend(
|
424 |
+
extensions.PlotReport(
|
425 |
+
["main/cer_ctc", "validation/main/cer_ctc"], "epoch", file_name="cer.png"
|
426 |
+
)
|
427 |
+
)
|
428 |
+
|
429 |
+
# Save best models
|
430 |
+
trainer.extend(
|
431 |
+
snapshot_object(model, "model.loss.best"),
|
432 |
+
trigger=training.triggers.MinValueTrigger("validation/main/loss"),
|
433 |
+
)
|
434 |
+
if mtl_mode != "ctc":
|
435 |
+
trainer.extend(
|
436 |
+
snapshot_object(model, "model.acc.best"),
|
437 |
+
trigger=training.triggers.MaxValueTrigger("validation/main/acc"),
|
438 |
+
)
|
439 |
+
|
440 |
+
# save snapshot which contains model and optimizer states
|
441 |
+
trainer.extend(torch_snapshot(), trigger=(1, "epoch"))
|
442 |
+
|
443 |
+
# epsilon decay in the optimizer
|
444 |
+
if args.opt == "adadelta":
|
445 |
+
if args.criterion == "acc" and mtl_mode != "ctc":
|
446 |
+
trainer.extend(
|
447 |
+
restore_snapshot(
|
448 |
+
model, args.outdir + "/model.acc.best", load_fn=torch_load
|
449 |
+
),
|
450 |
+
trigger=CompareValueTrigger(
|
451 |
+
"validation/main/acc",
|
452 |
+
lambda best_value, current_value: best_value > current_value,
|
453 |
+
),
|
454 |
+
)
|
455 |
+
trainer.extend(
|
456 |
+
adadelta_eps_decay(args.eps_decay),
|
457 |
+
trigger=CompareValueTrigger(
|
458 |
+
"validation/main/acc",
|
459 |
+
lambda best_value, current_value: best_value > current_value,
|
460 |
+
),
|
461 |
+
)
|
462 |
+
elif args.criterion == "loss":
|
463 |
+
trainer.extend(
|
464 |
+
restore_snapshot(
|
465 |
+
model, args.outdir + "/model.loss.best", load_fn=torch_load
|
466 |
+
),
|
467 |
+
trigger=CompareValueTrigger(
|
468 |
+
"validation/main/loss",
|
469 |
+
lambda best_value, current_value: best_value < current_value,
|
470 |
+
),
|
471 |
+
)
|
472 |
+
trainer.extend(
|
473 |
+
adadelta_eps_decay(args.eps_decay),
|
474 |
+
trigger=CompareValueTrigger(
|
475 |
+
"validation/main/loss",
|
476 |
+
lambda best_value, current_value: best_value < current_value,
|
477 |
+
),
|
478 |
+
)
|
479 |
+
|
480 |
+
# Write a log of evaluation statistics for each epoch
|
481 |
+
trainer.extend(
|
482 |
+
extensions.LogReport(trigger=(args.report_interval_iters, "iteration"))
|
483 |
+
)
|
484 |
+
report_keys = [
|
485 |
+
"epoch",
|
486 |
+
"iteration",
|
487 |
+
"main/loss",
|
488 |
+
"main/loss_ctc",
|
489 |
+
"main/loss_att",
|
490 |
+
"validation/main/loss",
|
491 |
+
"validation/main/loss_ctc",
|
492 |
+
"validation/main/loss_att",
|
493 |
+
"main/acc",
|
494 |
+
"validation/main/acc",
|
495 |
+
"main/cer_ctc",
|
496 |
+
"validation/main/cer_ctc",
|
497 |
+
"elapsed_time",
|
498 |
+
]
|
499 |
+
if args.opt == "adadelta":
|
500 |
+
trainer.extend(
|
501 |
+
extensions.observe_value(
|
502 |
+
"eps",
|
503 |
+
lambda trainer: trainer.updater.get_optimizer("main").param_groups[0][
|
504 |
+
"eps"
|
505 |
+
],
|
506 |
+
),
|
507 |
+
trigger=(args.report_interval_iters, "iteration"),
|
508 |
+
)
|
509 |
+
report_keys.append("eps")
|
510 |
+
if args.report_cer:
|
511 |
+
report_keys.append("validation/main/cer")
|
512 |
+
if args.report_wer:
|
513 |
+
report_keys.append("validation/main/wer")
|
514 |
+
trainer.extend(
|
515 |
+
extensions.PrintReport(report_keys),
|
516 |
+
trigger=(args.report_interval_iters, "iteration"),
|
517 |
+
)
|
518 |
+
|
519 |
+
trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
|
520 |
+
set_early_stop(trainer, args)
|
521 |
+
|
522 |
+
if args.tensorboard_dir is not None and args.tensorboard_dir != "":
|
523 |
+
trainer.extend(
|
524 |
+
TensorboardLogger(SummaryWriter(args.tensorboard_dir), att_reporter),
|
525 |
+
trigger=(args.report_interval_iters, "iteration"),
|
526 |
+
)
|
527 |
+
# Run the training
|
528 |
+
trainer.run()
|
529 |
+
check_early_stop(trainer, args.epochs)
|
530 |
+
|
531 |
+
|
532 |
+
def recog(args):
|
533 |
+
"""Decode with the given args.
|
534 |
+
|
535 |
+
Args:
|
536 |
+
args (namespace): The program arguments.
|
537 |
+
|
538 |
+
"""
|
539 |
+
set_deterministic_pytorch(args)
|
540 |
+
model, train_args = load_trained_model(args.model)
|
541 |
+
assert isinstance(model, ASRInterface)
|
542 |
+
model.recog_args = args
|
543 |
+
|
544 |
+
# read rnnlm
|
545 |
+
if args.rnnlm:
|
546 |
+
rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
|
547 |
+
if getattr(rnnlm_args, "model_module", "default") != "default":
|
548 |
+
raise ValueError(
|
549 |
+
"use '--api v2' option to decode with non-default language model"
|
550 |
+
)
|
551 |
+
rnnlm = lm_pytorch.ClassifierWithState(
|
552 |
+
lm_pytorch.RNNLM(
|
553 |
+
len(train_args.char_list),
|
554 |
+
rnnlm_args.layer,
|
555 |
+
rnnlm_args.unit,
|
556 |
+
getattr(rnnlm_args, "embed_unit", None), # for backward compatibility
|
557 |
+
)
|
558 |
+
)
|
559 |
+
torch_load(args.rnnlm, rnnlm)
|
560 |
+
rnnlm.eval()
|
561 |
+
else:
|
562 |
+
rnnlm = None
|
563 |
+
|
564 |
+
if args.word_rnnlm:
|
565 |
+
rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf)
|
566 |
+
word_dict = rnnlm_args.char_list_dict
|
567 |
+
char_dict = {x: i for i, x in enumerate(train_args.char_list)}
|
568 |
+
word_rnnlm = lm_pytorch.ClassifierWithState(
|
569 |
+
lm_pytorch.RNNLM(len(word_dict), rnnlm_args.layer, rnnlm_args.unit)
|
570 |
+
)
|
571 |
+
torch_load(args.word_rnnlm, word_rnnlm)
|
572 |
+
word_rnnlm.eval()
|
573 |
+
|
574 |
+
if rnnlm is not None:
|
575 |
+
rnnlm = lm_pytorch.ClassifierWithState(
|
576 |
+
extlm_pytorch.MultiLevelLM(
|
577 |
+
word_rnnlm.predictor, rnnlm.predictor, word_dict, char_dict
|
578 |
+
)
|
579 |
+
)
|
580 |
+
else:
|
581 |
+
rnnlm = lm_pytorch.ClassifierWithState(
|
582 |
+
extlm_pytorch.LookAheadWordLM(
|
583 |
+
word_rnnlm.predictor, word_dict, char_dict
|
584 |
+
)
|
585 |
+
)
|
586 |
+
|
587 |
+
# gpu
|
588 |
+
if args.ngpu == 1:
|
589 |
+
gpu_id = list(range(args.ngpu))
|
590 |
+
logging.info("gpu id: " + str(gpu_id))
|
591 |
+
model.cuda()
|
592 |
+
if rnnlm:
|
593 |
+
rnnlm.cuda()
|
594 |
+
|
595 |
+
# read json data
|
596 |
+
with open(args.recog_json, "rb") as f:
|
597 |
+
js = json.load(f)["utts"]
|
598 |
+
new_js = {}
|
599 |
+
|
600 |
+
load_inputs_and_targets = LoadInputsAndTargets(
|
601 |
+
mode="asr",
|
602 |
+
load_output=False,
|
603 |
+
sort_in_input_length=False,
|
604 |
+
preprocess_conf=train_args.preprocess_conf
|
605 |
+
if args.preprocess_conf is None
|
606 |
+
else args.preprocess_conf,
|
607 |
+
preprocess_args={"train": False},
|
608 |
+
)
|
609 |
+
|
610 |
+
if args.batchsize == 0:
|
611 |
+
with torch.no_grad():
|
612 |
+
for idx, name in enumerate(js.keys(), 1):
|
613 |
+
logging.info("(%d/%d) decoding " + name, idx, len(js.keys()))
|
614 |
+
batch = [(name, js[name])]
|
615 |
+
feat = load_inputs_and_targets(batch)[0][0]
|
616 |
+
nbest_hyps = model.recognize(feat, args, train_args.char_list, rnnlm)
|
617 |
+
new_js[name] = add_results_to_json(
|
618 |
+
js[name], nbest_hyps, train_args.char_list
|
619 |
+
)
|
620 |
+
|
621 |
+
else:
|
622 |
+
|
623 |
+
def grouper(n, iterable, fillvalue=None):
|
624 |
+
kargs = [iter(iterable)] * n
|
625 |
+
return zip_longest(*kargs, fillvalue=fillvalue)
|
626 |
+
|
627 |
+
# sort data if batchsize > 1
|
628 |
+
keys = list(js.keys())
|
629 |
+
if args.batchsize > 1:
|
630 |
+
feat_lens = [js[key]["input"][0]["shape"][0] for key in keys]
|
631 |
+
sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i])
|
632 |
+
keys = [keys[i] for i in sorted_index]
|
633 |
+
|
634 |
+
with torch.no_grad():
|
635 |
+
for names in grouper(args.batchsize, keys, None):
|
636 |
+
names = [name for name in names if name]
|
637 |
+
batch = [(name, js[name]) for name in names]
|
638 |
+
feats = load_inputs_and_targets(batch)[0]
|
639 |
+
nbest_hyps = model.recognize_batch(
|
640 |
+
feats, args, train_args.char_list, rnnlm=rnnlm
|
641 |
+
)
|
642 |
+
|
643 |
+
for i, name in enumerate(names):
|
644 |
+
nbest_hyp = [hyp[i] for hyp in nbest_hyps]
|
645 |
+
new_js[name] = add_results_to_json(
|
646 |
+
js[name], nbest_hyp, train_args.char_list
|
647 |
+
)
|
648 |
+
|
649 |
+
with open(args.result_label, "wb") as f:
|
650 |
+
f.write(
|
651 |
+
json.dumps(
|
652 |
+
{"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
|
653 |
+
).encode("utf_8")
|
654 |
+
)
|
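To make the batch conversion in asr_mix.py above concrete, the following standalone sketch (not part of this commit) reproduces the padding that CustomConverter applies to a two-speaker toy batch. The local pad_list helper and the dummy shapes are assumptions for illustration only, not the espnet implementation.

# Standalone sketch only: mimics CustomConverter's padding for num_spkrs=2.
# pad_list here is a local stand-in, not espnet.nets.pytorch_backend.e2e_asr_mix.pad_list.
import numpy as np
import torch

def pad_list(xs, pad_value):
    """Right-pad a list of tensors along dim 0 to the longest one."""
    n_batch = len(xs)
    max_len = max(x.size(0) for x in xs)
    pad = xs[0].new_full((n_batch, max_len, *xs[0].size()[1:]), pad_value)
    for i, x in enumerate(xs):
        pad[i, : x.size(0)] = x
    return pad

# Toy batch: two utterances of 83-dim features, one transcript per speaker.
xs = [np.random.randn(120, 83).astype(np.float32),
      np.random.randn(90, 83).astype(np.float32)]
ys_spk1 = [np.array([1, 2, 3]), np.array([4, 5])]
ys_spk2 = [np.array([6, 7]), np.array([8, 9, 10])]

xs_pad = pad_list([torch.from_numpy(x) for x in xs], 0)      # (B, Tmax, D)
ilens = torch.tensor([x.shape[0] for x in xs])               # (B,)
ys_all = [torch.from_numpy(y).long() for y in ys_spk1 + ys_spk2]
ys_pad = pad_list(ys_all, -1)                                 # (num_spkrs * B, Lmax)
ys_pad = ys_pad.view(2, -1, ys_pad.size(1)).transpose(0, 1)   # (B, num_spkrs, Lmax)
print(xs_pad.shape, ilens.tolist(), ys_pad.shape)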
espnet/asr/pytorch_backend/recog.py
ADDED
@@ -0,0 +1,152 @@
1 |
+
"""V2 backend for `asr_recog.py` using py:class:`espnet.nets.beam_search.BeamSearch`."""
|
2 |
+
|
3 |
+
import json
|
4 |
+
import logging
|
5 |
+
|
6 |
+
import torch
|
7 |
+
|
8 |
+
from espnet.asr.asr_utils import add_results_to_json
|
9 |
+
from espnet.asr.asr_utils import get_model_conf
|
10 |
+
from espnet.asr.asr_utils import torch_load
|
11 |
+
from espnet.asr.pytorch_backend.asr import load_trained_model
|
12 |
+
from espnet.nets.asr_interface import ASRInterface
|
13 |
+
from espnet.nets.batch_beam_search import BatchBeamSearch
|
14 |
+
from espnet.nets.beam_search import BeamSearch
|
15 |
+
from espnet.nets.lm_interface import dynamic_import_lm
|
16 |
+
from espnet.nets.scorer_interface import BatchScorerInterface
|
17 |
+
from espnet.nets.scorers.length_bonus import LengthBonus
|
18 |
+
from espnet.utils.deterministic_utils import set_deterministic_pytorch
|
19 |
+
from espnet.utils.io_utils import LoadInputsAndTargets
|
20 |
+
|
21 |
+
|
22 |
+
def recog_v2(args):
|
23 |
+
"""Decode with custom models that implements ScorerInterface.
|
24 |
+
|
25 |
+
Notes:
|
26 |
+
The previous backend espnet.asr.pytorch_backend.asr.recog
|
27 |
+
only supports E2E and RNNLM
|
28 |
+
|
29 |
+
Args:
|
30 |
+
args (namespace): The program arguments.
|
31 |
+
See py:func:`espnet.bin.asr_recog.get_parser` for details
|
32 |
+
|
33 |
+
"""
|
34 |
+
logging.warning("experimental API for custom LMs is selected by --api v2")
|
35 |
+
if args.batchsize > 1:
|
36 |
+
raise NotImplementedError("multi-utt batch decoding is not implemented")
|
37 |
+
if args.streaming_mode is not None:
|
38 |
+
raise NotImplementedError("streaming mode is not implemented")
|
39 |
+
if args.word_rnnlm:
|
40 |
+
raise NotImplementedError("word LM is not implemented")
|
41 |
+
|
42 |
+
set_deterministic_pytorch(args)
|
43 |
+
model, train_args = load_trained_model(args.model)
|
44 |
+
assert isinstance(model, ASRInterface)
|
45 |
+
model.eval()
|
46 |
+
|
47 |
+
load_inputs_and_targets = LoadInputsAndTargets(
|
48 |
+
mode="asr",
|
49 |
+
load_output=False,
|
50 |
+
sort_in_input_length=False,
|
51 |
+
preprocess_conf=train_args.preprocess_conf
|
52 |
+
if args.preprocess_conf is None
|
53 |
+
else args.preprocess_conf,
|
54 |
+
preprocess_args={"train": False},
|
55 |
+
)
|
56 |
+
|
57 |
+
if args.rnnlm:
|
58 |
+
lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
|
59 |
+
# NOTE: for a compatibility with less than 0.5.0 version models
|
60 |
+
lm_model_module = getattr(lm_args, "model_module", "default")
|
61 |
+
lm_class = dynamic_import_lm(lm_model_module, lm_args.backend)
|
62 |
+
lm = lm_class(len(train_args.char_list), lm_args)
|
63 |
+
torch_load(args.rnnlm, lm)
|
64 |
+
lm.eval()
|
65 |
+
else:
|
66 |
+
lm = None
|
67 |
+
|
68 |
+
if args.ngram_model:
|
69 |
+
from espnet.nets.scorers.ngram import NgramFullScorer
|
70 |
+
from espnet.nets.scorers.ngram import NgramPartScorer
|
71 |
+
|
72 |
+
if args.ngram_scorer == "full":
|
73 |
+
ngram = NgramFullScorer(args.ngram_model, train_args.char_list)
|
74 |
+
else:
|
75 |
+
ngram = NgramPartScorer(args.ngram_model, train_args.char_list)
|
76 |
+
else:
|
77 |
+
ngram = None
|
78 |
+
|
79 |
+
scorers = model.scorers()
|
80 |
+
scorers["lm"] = lm
|
81 |
+
scorers["ngram"] = ngram
|
82 |
+
scorers["length_bonus"] = LengthBonus(len(train_args.char_list))
|
83 |
+
weights = dict(
|
84 |
+
decoder=1.0 - args.ctc_weight,
|
85 |
+
ctc=args.ctc_weight,
|
86 |
+
lm=args.lm_weight,
|
87 |
+
ngram=args.ngram_weight,
|
88 |
+
length_bonus=args.penalty,
|
89 |
+
)
|
90 |
+
beam_search = BeamSearch(
|
91 |
+
beam_size=args.beam_size,
|
92 |
+
vocab_size=len(train_args.char_list),
|
93 |
+
weights=weights,
|
94 |
+
scorers=scorers,
|
95 |
+
sos=model.sos,
|
96 |
+
eos=model.eos,
|
97 |
+
token_list=train_args.char_list,
|
98 |
+
pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
|
99 |
+
)
|
100 |
+
# TODO(karita): make all scorers batchfied
|
101 |
+
if args.batchsize == 1:
|
102 |
+
non_batch = [
|
103 |
+
k
|
104 |
+
for k, v in beam_search.full_scorers.items()
|
105 |
+
if not isinstance(v, BatchScorerInterface)
|
106 |
+
]
|
107 |
+
if len(non_batch) == 0:
|
108 |
+
beam_search.__class__ = BatchBeamSearch
|
109 |
+
logging.info("BatchBeamSearch implementation is selected.")
|
110 |
+
else:
|
111 |
+
logging.warning(
|
112 |
+
f"As non-batch scorers {non_batch} are found, "
|
113 |
+
f"fall back to non-batch implementation."
|
114 |
+
)
|
115 |
+
|
116 |
+
if args.ngpu > 1:
|
117 |
+
raise NotImplementedError("only single GPU decoding is supported")
|
118 |
+
if args.ngpu == 1:
|
119 |
+
device = "cuda"
|
120 |
+
else:
|
121 |
+
device = "cpu"
|
122 |
+
dtype = getattr(torch, args.dtype)
|
123 |
+
logging.info(f"Decoding device={device}, dtype={dtype}")
|
124 |
+
model.to(device=device, dtype=dtype).eval()
|
125 |
+
beam_search.to(device=device, dtype=dtype).eval()
|
126 |
+
|
127 |
+
# read json data
|
128 |
+
with open(args.recog_json, "rb") as f:
|
129 |
+
js = json.load(f)["utts"]
|
130 |
+
new_js = {}
|
131 |
+
with torch.no_grad():
|
132 |
+
for idx, name in enumerate(js.keys(), 1):
|
133 |
+
logging.info("(%d/%d) decoding " + name, idx, len(js.keys()))
|
134 |
+
batch = [(name, js[name])]
|
135 |
+
feat = load_inputs_and_targets(batch)[0][0]
|
136 |
+
enc = model.encode(torch.as_tensor(feat).to(device=device, dtype=dtype))
|
137 |
+
nbest_hyps = beam_search(
|
138 |
+
x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio
|
139 |
+
)
|
140 |
+
nbest_hyps = [
|
141 |
+
h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), args.nbest)]
|
142 |
+
]
|
143 |
+
new_js[name] = add_results_to_json(
|
144 |
+
js[name], nbest_hyps, train_args.char_list
|
145 |
+
)
|
146 |
+
|
147 |
+
with open(args.result_label, "wb") as f:
|
148 |
+
f.write(
|
149 |
+
json.dumps(
|
150 |
+
{"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
|
151 |
+
).encode("utf_8")
|
152 |
+
)
|
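The weights dict built in recog_v2 above follows a standard log-linear combination of scorer outputs. The following standalone sketch shows that general idea with dummy scores; it is an illustration of the weighting scheme only, not the actual espnet BeamSearch internals.

# Illustrative sketch of a log-linear score combination, in the spirit of the
# weights dict built in recog_v2 (decoder/ctc/lm/length_bonus). This is NOT
# the BeamSearch implementation, just the underlying idea.
import torch

vocab = 50
weights = {"decoder": 0.7, "ctc": 0.3, "lm": 0.5, "length_bonus": 0.1}

# Dummy per-token log-probabilities from each scorer for one hypothesis.
scores = {
    "decoder": torch.log_softmax(torch.randn(vocab), dim=-1),
    "ctc": torch.log_softmax(torch.randn(vocab), dim=-1),
    "lm": torch.log_softmax(torch.randn(vocab), dim=-1),
    "length_bonus": torch.ones(vocab),  # constant bonus per emitted token
}

combined = sum(weights[k] * scores[k] for k in weights)
best_token = int(combined.argmax())
print("best next token id:", best_token)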
espnet/bin/__init__.py
ADDED
@@ -0,0 +1 @@
1 |
+
"""Initialize sub package."""
|
espnet/bin/asr_align.py
ADDED
@@ -0,0 +1,348 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
# encoding: utf-8
|
3 |
+
|
4 |
+
# Copyright 2020 Johns Hopkins University (Xuankai Chang)
|
5 |
+
# 2020, Technische Universität München; Dominik Winkelbauer, Ludwig Kürzinger
|
6 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
7 |
+
|
8 |
+
"""
|
9 |
+
This program performs CTC segmentation to align utterances within audio files.
|
10 |
+
|
11 |
+
Inputs:
|
12 |
+
`--data-json`:
|
13 |
+
A json containing list of utterances and audio files
|
14 |
+
`--model`:
|
15 |
+
An already trained ASR model
|
16 |
+
|
17 |
+
Output:
|
18 |
+
`--output`:
|
19 |
+
A plain `segments` file with utterance positions in the audio files.
|
20 |
+
|
21 |
+
Selected parameters:
|
22 |
+
`--min-window-size`:
|
23 |
+
Minimum window size considered for a single utterance. The current default value
|
24 |
+
should be OK in most cases. Larger values might give better results; too large
|
25 |
+
values cause IndexErrors.
|
26 |
+
`--subsampling-factor`:
|
27 |
+
If the encoder sub-samples its input, the number of frames at the CTC layer is
|
28 |
+
reduced by this factor.
|
29 |
+
`--frame-duration`:
|
30 |
+
This is the non-overlapping duration of a single frame in milliseconds (the
|
31 |
+
inverse of frames per millisecond).
|
32 |
+
`--set-blank`:
|
33 |
+
In the rare case that the blank token does not have index 0 in the character
|
34 |
+
dictionary, this parameter sets the index of the blank token.
|
35 |
+
`--gratis-blank`:
|
36 |
+
Sets the transition cost for blank tokens to zero. Useful if there are long
|
37 |
+
unrelated audio sections between the utterances to align.
|
38 |
+
`--replace-spaces-with-blanks`:
|
39 |
+
Spaces are replaced with blanks. Helps to model pauses between words. May
|
40 |
+
increase length of ground truth. May lead to misaligned segments when combined
|
41 |
+
with the option `--gratis-blank`.
|
42 |
+
"""
|
43 |
+
|
44 |
+
import configargparse
|
45 |
+
import logging
|
46 |
+
import os
|
47 |
+
import sys
|
48 |
+
|
49 |
+
# imports for inference
|
50 |
+
from espnet.asr.pytorch_backend.asr_init import load_trained_model
|
51 |
+
from espnet.nets.asr_interface import ASRInterface
|
52 |
+
from espnet.utils.io_utils import LoadInputsAndTargets
|
53 |
+
import json
|
54 |
+
import torch
|
55 |
+
|
56 |
+
# imports for CTC segmentation
|
57 |
+
from ctc_segmentation import ctc_segmentation
|
58 |
+
from ctc_segmentation import CtcSegmentationParameters
|
59 |
+
from ctc_segmentation import determine_utterance_segments
|
60 |
+
from ctc_segmentation import prepare_text
|
61 |
+
|
62 |
+
|
63 |
+
# NOTE: you need this func to generate our sphinx doc
|
64 |
+
def get_parser():
|
65 |
+
"""Get default arguments."""
|
66 |
+
parser = configargparse.ArgumentParser(
|
67 |
+
description="Align text to audio using CTC segmentation."
|
68 |
+
"using a pre-trained speech recognition model.",
|
69 |
+
config_file_parser_class=configargparse.YAMLConfigFileParser,
|
70 |
+
formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
|
71 |
+
)
|
72 |
+
# general configuration
|
73 |
+
parser.add("--config", is_config_file=True, help="Decoding config file path.")
|
74 |
+
parser.add_argument(
|
75 |
+
"--ngpu", type=int, default=0, help="Number of GPUs (max. 1 is supported)"
|
76 |
+
)
|
77 |
+
parser.add_argument(
|
78 |
+
"--dtype",
|
79 |
+
choices=("float16", "float32", "float64"),
|
80 |
+
default="float32",
|
81 |
+
help="Float precision (only available in --api v2)",
|
82 |
+
)
|
83 |
+
parser.add_argument(
|
84 |
+
"--backend",
|
85 |
+
type=str,
|
86 |
+
default="pytorch",
|
87 |
+
choices=["pytorch"],
|
88 |
+
help="Backend library",
|
89 |
+
)
|
90 |
+
parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
|
91 |
+
parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option")
|
92 |
+
parser.add_argument(
|
93 |
+
"--preprocess-conf",
|
94 |
+
type=str,
|
95 |
+
default=None,
|
96 |
+
help="The configuration file for the pre-processing",
|
97 |
+
)
|
98 |
+
# task related
|
99 |
+
parser.add_argument(
|
100 |
+
"--data-json", type=str, help="Json of recognition data for audio and text"
|
101 |
+
)
|
102 |
+
parser.add_argument("--utt-text", type=str, help="Text separated into utterances")
|
103 |
+
# model (parameter) related
|
104 |
+
parser.add_argument(
|
105 |
+
"--model", type=str, required=True, help="Model file parameters to read"
|
106 |
+
)
|
107 |
+
parser.add_argument(
|
108 |
+
"--model-conf", type=str, default=None, help="Model config file"
|
109 |
+
)
|
110 |
+
parser.add_argument(
|
111 |
+
"--num-encs", default=1, type=int, help="Number of encoders in the model."
|
112 |
+
)
|
113 |
+
# ctc-segmentation related
|
114 |
+
parser.add_argument(
|
115 |
+
"--subsampling-factor",
|
116 |
+
type=int,
|
117 |
+
default=None,
|
118 |
+
help="Subsampling factor."
|
119 |
+
" If the encoder sub-samples its input, the number of frames at the CTC layer"
|
120 |
+
" is reduced by this factor. For example, a BLSTMP with subsampling 1_2_2_1_1"
|
121 |
+
" has a subsampling factor of 4.",
|
122 |
+
)
|
123 |
+
parser.add_argument(
|
124 |
+
"--frame-duration",
|
125 |
+
type=int,
|
126 |
+
default=None,
|
127 |
+
help="Non-overlapping duration of a single frame in milliseconds.",
|
128 |
+
)
|
129 |
+
parser.add_argument(
|
130 |
+
"--min-window-size",
|
131 |
+
type=int,
|
132 |
+
default=None,
|
133 |
+
help="Minimum window size considered for utterance.",
|
134 |
+
)
|
135 |
+
parser.add_argument(
|
136 |
+
"--max-window-size",
|
137 |
+
type=int,
|
138 |
+
default=None,
|
139 |
+
help="Maximum window size considered for utterance.",
|
140 |
+
)
|
141 |
+
parser.add_argument(
|
142 |
+
"--use-dict-blank",
|
143 |
+
type=int,
|
144 |
+
default=None,
|
145 |
+
help="DEPRECATED.",
|
146 |
+
)
|
147 |
+
parser.add_argument(
|
148 |
+
"--set-blank",
|
149 |
+
type=int,
|
150 |
+
default=None,
|
151 |
+
help="Index of model dictionary for blank token (default: 0).",
|
152 |
+
)
|
153 |
+
parser.add_argument(
|
154 |
+
"--gratis-blank",
|
155 |
+
type=int,
|
156 |
+
default=None,
|
157 |
+
help="Set the transition cost of the blank token to zero. Audio sections"
|
158 |
+
" labeled with blank tokens can then be skipped without penalty. Useful"
|
159 |
+
" if there are unrelated audio segments between utterances.",
|
160 |
+
)
|
161 |
+
parser.add_argument(
|
162 |
+
"--replace-spaces-with-blanks",
|
163 |
+
type=int,
|
164 |
+
default=None,
|
165 |
+
help="Fill blanks in between words to better model pauses between words."
|
166 |
+
" Segments can be misaligned if this option is combined with --gratis-blank."
|
167 |
+
" May increase length of ground truth.",
|
168 |
+
)
|
169 |
+
parser.add_argument(
|
170 |
+
"--scoring-length",
|
171 |
+
type=int,
|
172 |
+
default=None,
|
173 |
+
help="Changes partitioning length L for calculation of the confidence score.",
|
174 |
+
)
|
175 |
+
parser.add_argument(
|
176 |
+
"--output",
|
177 |
+
type=configargparse.FileType("w"),
|
178 |
+
required=True,
|
179 |
+
help="Output segments file",
|
180 |
+
)
|
181 |
+
return parser
|
182 |
+
|
183 |
+
|
184 |
+
def main(args):
|
185 |
+
"""Run the main decoding function."""
|
186 |
+
parser = get_parser()
|
187 |
+
args, extra = parser.parse_known_args(args)
|
188 |
+
# logging info
|
189 |
+
if args.verbose == 1:
|
190 |
+
logging.basicConfig(
|
191 |
+
level=logging.INFO,
|
192 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
193 |
+
)
|
194 |
+
elif args.verbose == 2:
|
195 |
+
logging.basicConfig(
|
196 |
+
level=logging.DEBUG,
|
197 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
198 |
+
)
|
199 |
+
else:
|
200 |
+
logging.basicConfig(
|
201 |
+
level=logging.WARN,
|
202 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
203 |
+
)
|
204 |
+
logging.warning("Skip DEBUG/INFO messages")
|
205 |
+
if args.ngpu == 0 and args.dtype == "float16":
|
206 |
+
raise ValueError(f"--dtype {args.dtype} does not support the CPU backend.")
|
207 |
+
# check CUDA_VISIBLE_DEVICES
|
208 |
+
device = "cpu"
|
209 |
+
if args.ngpu == 1:
|
210 |
+
device = "cuda"
|
211 |
+
cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
|
212 |
+
if cvd is None:
|
213 |
+
logging.warning("CUDA_VISIBLE_DEVICES is not set.")
|
214 |
+
elif args.ngpu > 1:
|
215 |
+
logging.error("Decoding only supports ngpu=1.")
|
216 |
+
sys.exit(1)
|
217 |
+
# display PYTHONPATH
|
218 |
+
logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
|
219 |
+
# recog
|
220 |
+
logging.info("backend = " + args.backend)
|
221 |
+
if args.backend == "pytorch":
|
222 |
+
ctc_align(args, device)
|
223 |
+
else:
|
224 |
+
raise ValueError("Only pytorch is supported.")
|
225 |
+
sys.exit(0)
|
226 |
+
|
227 |
+
|
228 |
+
def ctc_align(args, device):
|
229 |
+
"""ESPnet-specific interface for CTC segmentation.
|
230 |
+
|
231 |
+
Parses configuration, infers the CTC posterior probabilities,
|
232 |
+
and then aligns start and end of utterances using CTC segmentation.
|
233 |
+
Results are written to the output file given in the args.
|
234 |
+
|
235 |
+
:param args: given configuration
|
236 |
+
:param device: for inference; one of ['cuda', 'cpu']
|
237 |
+
:return: 0 on success
|
238 |
+
"""
|
239 |
+
model, train_args = load_trained_model(args.model)
|
240 |
+
assert isinstance(model, ASRInterface)
|
241 |
+
load_inputs_and_targets = LoadInputsAndTargets(
|
242 |
+
mode="asr",
|
243 |
+
load_output=True,
|
244 |
+
sort_in_input_length=False,
|
245 |
+
preprocess_conf=train_args.preprocess_conf
|
246 |
+
if args.preprocess_conf is None
|
247 |
+
else args.preprocess_conf,
|
248 |
+
preprocess_args={"train": False},
|
249 |
+
)
|
250 |
+
logging.info(f"Decoding device={device}")
|
251 |
+
# Warn for nets with high memory consumption on long audio files
|
252 |
+
if hasattr(model, "enc"):
|
253 |
+
encoder_module = model.enc.__class__.__module__
|
254 |
+
elif hasattr(model, "encoder"):
|
255 |
+
encoder_module = model.encoder.__class__.__module__
|
256 |
+
else:
|
257 |
+
encoder_module = "Unknown"
|
258 |
+
logging.info(f"Encoder module: {encoder_module}")
|
259 |
+
logging.info(f"CTC module: {model.ctc.__class__.__module__}")
|
260 |
+
if "rnn" not in encoder_module:
|
261 |
+
logging.warning("No BLSTM model detected; memory consumption may be high.")
|
262 |
+
model.to(device=device).eval()
|
263 |
+
# read audio and text json data
|
264 |
+
with open(args.data_json, "rb") as f:
|
265 |
+
js = json.load(f)["utts"]
|
266 |
+
with open(args.utt_text, "r", encoding="utf-8") as f:
|
267 |
+
lines = f.readlines()
|
268 |
+
i = 0
|
269 |
+
text = {}
|
270 |
+
segment_names = {}
|
271 |
+
for name in js.keys():
|
272 |
+
text_per_audio = []
|
273 |
+
segment_names_per_audio = []
|
274 |
+
while i < len(lines) and lines[i].startswith(name):
|
275 |
+
text_per_audio.append(lines[i][lines[i].find(" ") + 1 :])
|
276 |
+
segment_names_per_audio.append(lines[i][: lines[i].find(" ")])
|
277 |
+
i += 1
|
278 |
+
text[name] = text_per_audio
|
279 |
+
segment_names[name] = segment_names_per_audio
|
280 |
+
# apply configuration
|
281 |
+
config = CtcSegmentationParameters()
|
282 |
+
if args.subsampling_factor is not None:
|
283 |
+
config.subsampling_factor = args.subsampling_factor
|
284 |
+
if args.frame_duration is not None:
|
285 |
+
config.frame_duration_ms = args.frame_duration
|
286 |
+
if args.min_window_size is not None:
|
287 |
+
config.min_window_size = args.min_window_size
|
288 |
+
if args.max_window_size is not None:
|
289 |
+
config.max_window_size = args.max_window_size
|
290 |
+
config.char_list = train_args.char_list
|
291 |
+
if args.use_dict_blank is not None:
|
292 |
+
logging.warning(
|
293 |
+
"The option --use-dict-blank is deprecated. If needed,"
|
294 |
+
" use --set-blank instead."
|
295 |
+
)
|
296 |
+
if args.set_blank is not None:
|
297 |
+
config.blank = args.set_blank
|
298 |
+
if args.replace_spaces_with_blanks is not None:
|
299 |
+
if args.replace_spaces_with_blanks:
|
300 |
+
config.replace_spaces_with_blanks = True
|
301 |
+
else:
|
302 |
+
config.replace_spaces_with_blanks = False
|
303 |
+
if args.gratis_blank:
|
304 |
+
config.blank_transition_cost_zero = True
|
305 |
+
if config.blank_transition_cost_zero and args.replace_spaces_with_blanks:
|
306 |
+
logging.error(
|
307 |
+
"Blanks are inserted between words, and also the transition cost of blank"
|
308 |
+
" is zero. This configuration may lead to misalignments!"
|
309 |
+
)
|
310 |
+
if args.scoring_length is not None:
|
311 |
+
config.score_min_mean_over_L = args.scoring_length
|
312 |
+
logging.info(
|
313 |
+
f"Frame timings: {config.frame_duration_ms}ms * {config.subsampling_factor}"
|
314 |
+
)
|
315 |
+
# Iterate over audio files to decode and align
|
316 |
+
for idx, name in enumerate(js.keys(), 1):
|
317 |
+
logging.info("(%d/%d) Aligning " + name, idx, len(js.keys()))
|
318 |
+
batch = [(name, js[name])]
|
319 |
+
feat, label = load_inputs_and_targets(batch)
|
320 |
+
feat = feat[0]
|
321 |
+
with torch.no_grad():
|
322 |
+
# Encode input frames
|
323 |
+
enc_output = model.encode(torch.as_tensor(feat).to(device)).unsqueeze(0)
|
324 |
+
# Apply ctc layer to obtain log character probabilities
|
325 |
+
lpz = model.ctc.log_softmax(enc_output)[0].cpu().numpy()
|
326 |
+
# Prepare the text for aligning
|
327 |
+
ground_truth_mat, utt_begin_indices = prepare_text(config, text[name])
|
328 |
+
# Align using CTC segmentation
|
329 |
+
timings, char_probs, state_list = ctc_segmentation(
|
330 |
+
config, lpz, ground_truth_mat
|
331 |
+
)
|
332 |
+
logging.debug(f"state_list = {state_list}")
|
333 |
+
# Obtain list of utterances with time intervals and confidence score
|
334 |
+
segments = determine_utterance_segments(
|
335 |
+
config, utt_begin_indices, char_probs, timings, text[name]
|
336 |
+
)
|
337 |
+
# Write to "segments" file
|
338 |
+
for i, boundary in enumerate(segments):
|
339 |
+
utt_segment = (
|
340 |
+
f"{segment_names[name][i]} {name} {boundary[0]:.2f}"
|
341 |
+
f" {boundary[1]:.2f} {boundary[2]:.9f}\n"
|
342 |
+
)
|
343 |
+
args.output.write(utt_segment)
|
344 |
+
return 0
|
345 |
+
|
346 |
+
|
347 |
+
if __name__ == "__main__":
|
348 |
+
main(sys.argv[1:])
|
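The asr_align.py options above tie CTC output indices to wall-clock time through --frame-duration and --subsampling-factor. The minimal function below is an illustrative sketch of that mapping (the real conversion happens inside the ctc_segmentation package); the default values are assumptions for the example, not the package defaults.

# Illustrative only: how a CTC output frame index maps to seconds, given the
# --frame-duration (ms per encoder input frame) and --subsampling-factor
# options described above. ctc_segmentation performs this internally.
def ctc_index_to_seconds(index: int, frame_duration_ms: int = 10,
                         subsampling_factor: int = 4) -> float:
    return index * frame_duration_ms * subsampling_factor / 1000.0


# Example: CTC frame 250 with 10 ms frames and 4x subsampling -> 10 seconds.
print(ctc_index_to_seconds(250))  # 10.0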
espnet/bin/asr_enhance.py
ADDED
@@ -0,0 +1,191 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
import configargparse
|
3 |
+
from distutils.util import strtobool
|
4 |
+
import logging
|
5 |
+
import os
|
6 |
+
import random
|
7 |
+
import sys
|
8 |
+
|
9 |
+
import numpy as np
|
10 |
+
|
11 |
+
from espnet.asr.pytorch_backend.asr import enhance
|
12 |
+
|
13 |
+
|
14 |
+
# NOTE: you need this func to generate our sphinx doc
|
15 |
+
def get_parser():
|
16 |
+
parser = configargparse.ArgumentParser(
|
17 |
+
description="Enhance noisy speech for speech recognition",
|
18 |
+
config_file_parser_class=configargparse.YAMLConfigFileParser,
|
19 |
+
formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
|
20 |
+
)
|
21 |
+
# general configuration
|
22 |
+
parser.add("--config", is_config_file=True, help="config file path")
|
23 |
+
parser.add(
|
24 |
+
"--config2",
|
25 |
+
is_config_file=True,
|
26 |
+
help="second config file path that overwrites the settings in `--config`.",
|
27 |
+
)
|
28 |
+
parser.add(
|
29 |
+
"--config3",
|
30 |
+
is_config_file=True,
|
31 |
+
help="third config file path that overwrites the settings "
|
32 |
+
"in `--config` and `--config2`.",
|
33 |
+
)
|
34 |
+
|
35 |
+
parser.add_argument("--ngpu", default=0, type=int, help="Number of GPUs")
|
36 |
+
parser.add_argument(
|
37 |
+
"--backend",
|
38 |
+
default="chainer",
|
39 |
+
type=str,
|
40 |
+
choices=["chainer", "pytorch"],
|
41 |
+
help="Backend library",
|
42 |
+
)
|
43 |
+
parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
|
44 |
+
parser.add_argument("--seed", default=1, type=int, help="Random seed")
|
45 |
+
parser.add_argument("--verbose", "-V", default=1, type=int, help="Verbose option")
|
46 |
+
parser.add_argument(
|
47 |
+
"--batchsize",
|
48 |
+
default=1,
|
49 |
+
type=int,
|
50 |
+
help="Batch size for beam search (0: means no batch processing)",
|
51 |
+
)
|
52 |
+
parser.add_argument(
|
53 |
+
"--preprocess-conf",
|
54 |
+
type=str,
|
55 |
+
default=None,
|
56 |
+
help="The configuration file for the pre-processing",
|
57 |
+
)
|
58 |
+
# task related
|
59 |
+
parser.add_argument(
|
60 |
+
"--recog-json", type=str, help="Filename of recognition data (json)"
|
61 |
+
)
|
62 |
+
# model (parameter) related
|
63 |
+
parser.add_argument(
|
64 |
+
"--model", type=str, required=True, help="Model file parameters to read"
|
65 |
+
)
|
66 |
+
parser.add_argument(
|
67 |
+
"--model-conf", type=str, default=None, help="Model config file"
|
68 |
+
)
|
69 |
+
|
70 |
+
# Outputs configuration
|
71 |
+
parser.add_argument(
|
72 |
+
"--enh-wspecifier",
|
73 |
+
type=str,
|
74 |
+
default=None,
|
75 |
+
help="Specify the output way for enhanced speech."
|
76 |
+
"e.g. ark,scp:outdir,wav.scp",
|
77 |
+
)
|
78 |
+
parser.add_argument(
|
79 |
+
"--enh-filetype",
|
80 |
+
type=str,
|
81 |
+
default="sound",
|
82 |
+
choices=["mat", "hdf5", "sound.hdf5", "sound"],
|
83 |
+
help="Specify the file format for enhanced speech. "
|
84 |
+
'"mat" is the matrix format in kaldi',
|
85 |
+
)
|
86 |
+
parser.add_argument("--fs", type=int, default=16000, help="The sample frequency")
|
87 |
+
parser.add_argument(
|
88 |
+
"--keep-length",
|
89 |
+
type=strtobool,
|
90 |
+
default=True,
|
91 |
+
help="Adjust the output length to match " "with the input for enhanced speech",
|
92 |
+
)
|
93 |
+
parser.add_argument(
|
94 |
+
"--image-dir", type=str, default=None, help="The directory saving the images."
|
95 |
+
)
|
96 |
+
parser.add_argument(
|
97 |
+
"--num-images",
|
98 |
+
type=int,
|
99 |
+
default=20,
|
100 |
+
help="The number of images files to be saved. "
|
101 |
+
"If negative, all samples are to be saved.",
|
102 |
+
)
|
103 |
+
|
104 |
+
# IStft
|
105 |
+
parser.add_argument(
|
106 |
+
"--apply-istft",
|
107 |
+
type=strtobool,
|
108 |
+
default=True,
|
109 |
+
help="Apply istft to the output from the network",
|
110 |
+
)
|
111 |
+
parser.add_argument(
|
112 |
+
"--istft-win-length",
|
113 |
+
type=int,
|
114 |
+
default=512,
|
115 |
+
help="The window length for istft. "
|
116 |
+
"This option is ignored "
|
117 |
+
"if stft is found in the preprocess-conf",
|
118 |
+
)
|
119 |
+
parser.add_argument(
|
120 |
+
"--istft-n-shift",
|
121 |
+
type=str,
|
122 |
+
default=256,
|
123 |
+
help="The window type for istft. "
|
124 |
+
"This option is ignored "
|
125 |
+
"if stft is found in the preprocess-conf",
|
126 |
+
)
|
127 |
+
parser.add_argument(
|
128 |
+
"--istft-window",
|
129 |
+
type=str,
|
130 |
+
default="hann",
|
131 |
+
help="The window type for istft. "
|
132 |
+
"This option is ignored "
|
133 |
+
"if stft is found in the preprocess-conf",
|
134 |
+
)
|
135 |
+
return parser
|
136 |
+
|
137 |
+
|
138 |
+
def main(args):
|
139 |
+
parser = get_parser()
|
140 |
+
args = parser.parse_args(args)
|
141 |
+
|
142 |
+
# logging info
|
143 |
+
if args.verbose == 1:
|
144 |
+
logging.basicConfig(
|
145 |
+
level=logging.INFO,
|
146 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
147 |
+
)
|
148 |
+
elif args.verbose == 2:
|
149 |
+
logging.basicConfig(
|
150 |
+
level=logging.DEBUG,
|
151 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
152 |
+
)
|
153 |
+
else:
|
154 |
+
logging.basicConfig(
|
155 |
+
level=logging.WARN,
|
156 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
157 |
+
)
|
158 |
+
logging.warning("Skip DEBUG/INFO messages")
|
159 |
+
|
160 |
+
# check CUDA_VISIBLE_DEVICES
|
161 |
+
if args.ngpu > 0:
|
162 |
+
cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
|
163 |
+
if cvd is None:
|
164 |
+
logging.warning("CUDA_VISIBLE_DEVICES is not set.")
|
165 |
+
elif args.ngpu != len(cvd.split(",")):
|
166 |
+
logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
|
167 |
+
sys.exit(1)
|
168 |
+
|
169 |
+
# TODO(kamo): support of multiple GPUs
|
170 |
+
if args.ngpu > 1:
|
171 |
+
logging.error("The program only supports ngpu=1.")
|
172 |
+
sys.exit(1)
|
173 |
+
|
174 |
+
# display PYTHONPATH
|
175 |
+
logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
|
176 |
+
|
177 |
+
# seed setting
|
178 |
+
random.seed(args.seed)
|
179 |
+
np.random.seed(args.seed)
|
180 |
+
logging.info("set random seed = %d" % args.seed)
|
181 |
+
|
182 |
+
# recog
|
183 |
+
logging.info("backend = " + args.backend)
|
184 |
+
if args.backend == "pytorch":
|
185 |
+
enhance(args)
|
186 |
+
else:
|
187 |
+
raise ValueError("Only pytorch is supported.")
|
188 |
+
|
189 |
+
|
190 |
+
if __name__ == "__main__":
|
191 |
+
main(sys.argv[1:])
|
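For orientation, the --istft-win-length, --istft-n-shift and --istft-window options of asr_enhance.py correspond to the usual inverse-STFT parameters. The sketch below shows that correspondence using torch.stft/torch.istft; this is an assumed illustration of what the parameters control, not the transform path the enhance backend actually uses.

# Illustrative mapping of the --istft-* options onto torch.istft arguments.
# The enhance backend applies its own iSTFT transform; this sketch only shows
# what win-length / n-shift / window control.
import torch

win_length = 512   # --istft-win-length
n_shift = 256      # --istft-n-shift (hop size)
window = torch.hann_window(win_length)  # --istft-window "hann"

wav = torch.randn(16000)  # 1 second of fake 16 kHz audio
spec = torch.stft(wav, n_fft=win_length, hop_length=n_shift,
                  win_length=win_length, window=window, return_complex=True)
recon = torch.istft(spec, n_fft=win_length, hop_length=n_shift,
                    win_length=win_length, window=window, length=wav.numel())
print(recon.shape)  # torch.Size([16000])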
espnet/bin/asr_recog.py
ADDED
@@ -0,0 +1,363 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
# encoding: utf-8
|
3 |
+
|
4 |
+
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
|
5 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
6 |
+
|
7 |
+
"""End-to-end speech recognition model decoding script."""
|
8 |
+
|
9 |
+
import configargparse
|
10 |
+
import logging
|
11 |
+
import os
|
12 |
+
import random
|
13 |
+
import sys
|
14 |
+
|
15 |
+
import numpy as np
|
16 |
+
|
17 |
+
from espnet.utils.cli_utils import strtobool
|
18 |
+
|
19 |
+
# NOTE: you need this func to generate our sphinx doc
|
20 |
+
|
21 |
+
|
22 |
+
def get_parser():
|
23 |
+
"""Get default arguments."""
|
24 |
+
parser = configargparse.ArgumentParser(
|
25 |
+
description="Transcribe text from speech using "
|
26 |
+
"a speech recognition model on one CPU or GPU",
|
27 |
+
config_file_parser_class=configargparse.YAMLConfigFileParser,
|
28 |
+
formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
|
29 |
+
)
|
30 |
+
# general configuration
|
31 |
+
parser.add("--config", is_config_file=True, help="Config file path")
|
32 |
+
parser.add(
|
33 |
+
"--config2",
|
34 |
+
is_config_file=True,
|
35 |
+
help="Second config file path that overwrites the settings in `--config`",
|
36 |
+
)
|
37 |
+
parser.add(
|
38 |
+
"--config3",
|
39 |
+
is_config_file=True,
|
40 |
+
help="Third config file path that overwrites the settings "
|
41 |
+
"in `--config` and `--config2`",
|
42 |
+
)
|
43 |
+
|
44 |
+
parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs")
|
45 |
+
parser.add_argument(
|
46 |
+
"--dtype",
|
47 |
+
choices=("float16", "float32", "float64"),
|
48 |
+
default="float32",
|
49 |
+
help="Float precision (only available in --api v2)",
|
50 |
+
)
|
51 |
+
parser.add_argument(
|
52 |
+
"--backend",
|
53 |
+
type=str,
|
54 |
+
default="chainer",
|
55 |
+
choices=["chainer", "pytorch"],
|
56 |
+
help="Backend library",
|
57 |
+
)
|
58 |
+
parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
|
59 |
+
parser.add_argument("--seed", type=int, default=1, help="Random seed")
|
60 |
+
parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option")
|
61 |
+
parser.add_argument(
|
62 |
+
"--batchsize",
|
63 |
+
type=int,
|
64 |
+
default=1,
|
65 |
+
help="Batch size for beam search (0: means no batch processing)",
|
66 |
+
)
|
67 |
+
parser.add_argument(
|
68 |
+
"--preprocess-conf",
|
69 |
+
type=str,
|
70 |
+
default=None,
|
71 |
+
help="The configuration file for the pre-processing",
|
72 |
+
)
|
73 |
+
parser.add_argument(
|
74 |
+
"--api",
|
75 |
+
default="v1",
|
76 |
+
choices=["v1", "v2"],
|
77 |
+
help="Beam search APIs "
|
78 |
+
"v1: Default API. It only supports the ASRInterface.recognize method "
|
79 |
+
"and DefaultRNNLM. "
|
80 |
+
"v2: Experimental API. It supports any models that implements ScorerInterface.",
|
81 |
+
)
|
82 |
+
# task related
|
83 |
+
parser.add_argument(
|
84 |
+
"--recog-json", type=str, help="Filename of recognition data (json)"
|
85 |
+
)
|
86 |
+
parser.add_argument(
|
87 |
+
"--result-label",
|
88 |
+
type=str,
|
89 |
+
required=True,
|
90 |
+
help="Filename of result label data (json)",
|
91 |
+
)
|
92 |
+
# model (parameter) related
|
93 |
+
parser.add_argument(
|
94 |
+
"--model", type=str, required=True, help="Model file parameters to read"
|
95 |
+
)
|
96 |
+
parser.add_argument(
|
97 |
+
"--model-conf", type=str, default=None, help="Model config file"
|
98 |
+
)
|
99 |
+
parser.add_argument(
|
100 |
+
"--num-spkrs",
|
101 |
+
type=int,
|
102 |
+
default=1,
|
103 |
+
choices=[1, 2],
|
104 |
+
help="Number of speakers in the speech",
|
105 |
+
)
|
106 |
+
parser.add_argument(
|
107 |
+
"--num-encs", default=1, type=int, help="Number of encoders in the model."
|
108 |
+
)
|
109 |
+
# search related
|
110 |
+
parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
|
111 |
+
parser.add_argument("--beam-size", type=int, default=1, help="Beam size")
|
112 |
+
parser.add_argument("--penalty", type=float, default=0.0, help="Incertion penalty")
|
113 |
+
parser.add_argument(
|
114 |
+
"--maxlenratio",
|
115 |
+
type=float,
|
116 |
+
default=0.0,
|
117 |
+
help="""Input length ratio to obtain max output length.
|
118 |
+
If maxlenratio=0.0 (default), it uses an end-detect function
|
119 |
+
to automatically find maximum hypothesis lengths""",
|
120 |
+
)
|
121 |
+
parser.add_argument(
|
122 |
+
"--minlenratio",
|
123 |
+
type=float,
|
124 |
+
default=0.0,
|
125 |
+
help="Input length ratio to obtain min output length",
|
126 |
+
)
|
127 |
+
parser.add_argument(
|
128 |
+
"--ctc-weight", type=float, default=0.0, help="CTC weight in joint decoding"
|
129 |
+
)
|
130 |
+
parser.add_argument(
|
131 |
+
"--weights-ctc-dec",
|
132 |
+
type=float,
|
133 |
+
action="append",
|
134 |
+
help="ctc weight assigned to each encoder during decoding."
|
135 |
+
"[in multi-encoder mode only]",
|
136 |
+
)
|
137 |
+
parser.add_argument(
|
138 |
+
"--ctc-window-margin",
|
139 |
+
type=int,
|
140 |
+
default=0,
|
141 |
+
help="""Use CTC window with margin parameter to accelerate
|
142 |
+
CTC/attention decoding, especially on GPU. Smaller margin
|
143 |
+
makes decoding faster, but may increase search errors.
|
144 |
+
If margin=0 (default), this function is disabled""",
|
145 |
+
)
|
146 |
+
# transducer related
|
147 |
+
parser.add_argument(
|
148 |
+
"--search-type",
|
149 |
+
type=str,
|
150 |
+
default="default",
|
151 |
+
choices=["default", "nsc", "tsd", "alsd"],
|
152 |
+
help="""Type of beam search implementation to use during inference.
|
153 |
+
Can be either: default beam search, n-step constrained beam search ("nsc"),
|
154 |
+
time-synchronous decoding ("tsd") or alignment-length synchronous decoding
|
155 |
+
("alsd").
|
156 |
+
Additional associated parameters: "nstep" + "prefix-alpha" (for nsc),
|
157 |
+
"max-sym-exp" (for tsd) and "u-max" (for alsd)""",
|
158 |
+
)
|
159 |
+
parser.add_argument(
|
160 |
+
"--nstep",
|
161 |
+
type=int,
|
162 |
+
default=1,
|
163 |
+
help="Number of expansion steps allowed in NSC beam search.",
|
164 |
+
)
|
165 |
+
parser.add_argument(
|
166 |
+
"--prefix-alpha",
|
167 |
+
type=int,
|
168 |
+
default=2,
|
169 |
+
help="Length prefix difference allowed in NSC beam search.",
|
170 |
+
)
|
171 |
+
parser.add_argument(
|
172 |
+
"--max-sym-exp",
|
173 |
+
type=int,
|
174 |
+
default=2,
|
175 |
+
help="Number of symbol expansions allowed in TSD decoding.",
|
176 |
+
)
|
177 |
+
parser.add_argument(
|
178 |
+
"--u-max",
|
179 |
+
type=int,
|
180 |
+
default=400,
|
181 |
+
help="Length prefix difference allowed in ALSD beam search.",
|
182 |
+
)
|
183 |
+
parser.add_argument(
|
184 |
+
"--score-norm",
|
185 |
+
type=strtobool,
|
186 |
+
nargs="?",
|
187 |
+
default=True,
|
188 |
+
help="Normalize transducer scores by length",
|
189 |
+
)
|
190 |
+
# rnnlm related
|
191 |
+
parser.add_argument(
|
192 |
+
"--rnnlm", type=str, default=None, help="RNNLM model file to read"
|
193 |
+
)
|
194 |
+
parser.add_argument(
|
195 |
+
"--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
|
196 |
+
)
|
197 |
+
parser.add_argument(
|
198 |
+
"--word-rnnlm", type=str, default=None, help="Word RNNLM model file to read"
|
199 |
+
)
|
200 |
+
parser.add_argument(
|
201 |
+
"--word-rnnlm-conf",
|
202 |
+
type=str,
|
203 |
+
default=None,
|
204 |
+
help="Word RNNLM model config file to read",
|
205 |
+
)
|
206 |
+
parser.add_argument("--word-dict", type=str, default=None, help="Word list to read")
|
207 |
+
parser.add_argument("--lm-weight", type=float, default=0.1, help="RNNLM weight")
|
208 |
+
# ngram related
|
209 |
+
parser.add_argument(
|
210 |
+
"--ngram-model", type=str, default=None, help="ngram model file to read"
|
211 |
+
)
|
212 |
+
parser.add_argument("--ngram-weight", type=float, default=0.1, help="ngram weight")
|
213 |
+
parser.add_argument(
|
214 |
+
"--ngram-scorer",
|
215 |
+
type=str,
|
216 |
+
default="part",
|
217 |
+
choices=("full", "part"),
|
218 |
+
help="""if the ngram is set as a part scorer, similar with CTC scorer,
|
219 |
+
the ngram scorer only scores the top-K hypotheses.
|
220 |
+
If the ngram is set as a full scorer, the ngram scorer scores all hypotheses;
|
221 |
+
the decoding speed of the part scorer is much faster than the full one.""",
|
222 |
+
)
|
223 |
+
# streaming related
|
224 |
+
parser.add_argument(
|
225 |
+
"--streaming-mode",
|
226 |
+
type=str,
|
227 |
+
default=None,
|
228 |
+
choices=["window", "segment"],
|
229 |
+
help="""Use streaming recognizer for inference.
|
230 |
+
`--batchsize` must be set to 0 to enable this mode""",
|
231 |
+
)
|
232 |
+
parser.add_argument("--streaming-window", type=int, default=10, help="Window size")
|
233 |
+
parser.add_argument(
|
234 |
+
"--streaming-min-blank-dur",
|
235 |
+
type=int,
|
236 |
+
default=10,
|
237 |
+
help="Minimum blank duration threshold",
|
238 |
+
)
|
239 |
+
parser.add_argument(
|
240 |
+
"--streaming-onset-margin", type=int, default=1, help="Onset margin"
|
241 |
+
)
|
242 |
+
parser.add_argument(
|
243 |
+
"--streaming-offset-margin", type=int, default=1, help="Offset margin"
|
244 |
+
)
|
245 |
+
# non-autoregressive related
|
246 |
+
# Mask CTC related. See https://arxiv.org/abs/2005.08700 for the detail.
|
247 |
+
parser.add_argument(
|
248 |
+
"--maskctc-n-iterations",
|
249 |
+
type=int,
|
250 |
+
default=10,
|
251 |
+
help="Number of decoding iterations."
|
252 |
+
"For Mask CTC, set 0 to predict 1 mask/iter.",
|
253 |
+
)
|
254 |
+
parser.add_argument(
|
255 |
+
"--maskctc-probability-threshold",
|
256 |
+
type=float,
|
257 |
+
default=0.999,
|
258 |
+
help="Threshold probability for CTC output",
|
259 |
+
)
|
260 |
+
|
261 |
+
return parser
|
262 |
+
|
263 |
+
|
264 |
+
def main(args):
|
265 |
+
"""Run the main decoding function."""
|
266 |
+
parser = get_parser()
|
267 |
+
args = parser.parse_args(args)
|
268 |
+
|
269 |
+
if args.ngpu == 0 and args.dtype == "float16":
|
270 |
+
raise ValueError(f"--dtype {args.dtype} does not support the CPU backend.")
|
271 |
+
|
272 |
+
# logging info
|
273 |
+
if args.verbose == 1:
|
274 |
+
logging.basicConfig(
|
275 |
+
level=logging.INFO,
|
276 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
277 |
+
)
|
278 |
+
elif args.verbose == 2:
|
279 |
+
logging.basicConfig(
|
280 |
+
level=logging.DEBUG,
|
281 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
282 |
+
)
|
283 |
+
else:
|
284 |
+
logging.basicConfig(
|
285 |
+
level=logging.WARN,
|
286 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
287 |
+
)
|
288 |
+
logging.warning("Skip DEBUG/INFO messages")
|
289 |
+
|
290 |
+
# check CUDA_VISIBLE_DEVICES
|
291 |
+
if args.ngpu > 0:
|
292 |
+
cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
|
293 |
+
if cvd is None:
|
294 |
+
logging.warning("CUDA_VISIBLE_DEVICES is not set.")
|
295 |
+
elif args.ngpu != len(cvd.split(",")):
|
296 |
+
logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
|
297 |
+
sys.exit(1)
|
298 |
+
|
299 |
+
# TODO(mn5k): support of multiple GPUs
|
300 |
+
if args.ngpu > 1:
|
301 |
+
logging.error("The program only supports ngpu=1.")
|
302 |
+
sys.exit(1)
|
303 |
+
|
304 |
+
# display PYTHONPATH
|
305 |
+
logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
|
306 |
+
|
307 |
+
# seed setting
|
308 |
+
random.seed(args.seed)
|
309 |
+
np.random.seed(args.seed)
|
310 |
+
logging.info("set random seed = %d" % args.seed)
|
311 |
+
|
312 |
+
# validate rnn options
|
313 |
+
if args.rnnlm is not None and args.word_rnnlm is not None:
|
314 |
+
logging.error(
|
315 |
+
"It seems that both --rnnlm and --word-rnnlm are specified. "
|
316 |
+
"Please use either option."
|
317 |
+
)
|
318 |
+
sys.exit(1)
|
319 |
+
|
320 |
+
# recog
|
321 |
+
logging.info("backend = " + args.backend)
|
322 |
+
if args.num_spkrs == 1:
|
323 |
+
if args.backend == "chainer":
|
324 |
+
from espnet.asr.chainer_backend.asr import recog
|
325 |
+
|
326 |
+
recog(args)
|
327 |
+
elif args.backend == "pytorch":
|
328 |
+
if args.num_encs == 1:
|
329 |
+
# Experimental API that supports custom LMs
|
330 |
+
if args.api == "v2":
|
331 |
+
from espnet.asr.pytorch_backend.recog import recog_v2
|
332 |
+
|
333 |
+
recog_v2(args)
|
334 |
+
else:
|
335 |
+
from espnet.asr.pytorch_backend.asr import recog
|
336 |
+
|
337 |
+
if args.dtype != "float32":
|
338 |
+
raise NotImplementedError(
|
339 |
+
f"`--dtype {args.dtype}` is only available with `--api v2`"
|
340 |
+
)
|
341 |
+
recog(args)
|
342 |
+
else:
|
343 |
+
if args.api == "v2":
|
344 |
+
raise NotImplementedError(
|
345 |
+
f"--num-encs {args.num_encs} > 1 is not supported in --api v2"
|
346 |
+
)
|
347 |
+
else:
|
348 |
+
from espnet.asr.pytorch_backend.asr import recog
|
349 |
+
|
350 |
+
recog(args)
|
351 |
+
else:
|
352 |
+
raise ValueError("Only chainer and pytorch are supported.")
|
353 |
+
elif args.num_spkrs == 2:
|
354 |
+
if args.backend == "pytorch":
|
355 |
+
from espnet.asr.pytorch_backend.asr_mix import recog
|
356 |
+
|
357 |
+
recog(args)
|
358 |
+
else:
|
359 |
+
raise ValueError("Only pytorch is supported.")
|
360 |
+
|
361 |
+
|
362 |
+
if __name__ == "__main__":
|
363 |
+
main(sys.argv[1:])
|
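As a quick sanity check of the argument surface defined above, the parser can be built and exercised directly from Python. This is only a minimal sketch: it assumes the espnet package from this repo is importable, and the file paths passed to the required options are made-up placeholders, not files shipped with the repo.

# Hypothetical usage sketch for espnet/bin/asr_recog.py (all paths are placeholders).
from espnet.bin.asr_recog import get_parser

parser = get_parser()
args = parser.parse_args(
    [
        "--recog-json", "dump/test/data.json",          # assumed path
        "--result-label", "exp/decode/result.json",     # assumed path
        "--model", "exp/train/results/model.acc.best",  # assumed path
        "--backend", "pytorch",
        "--beam-size", "20",
        "--ctc-weight", "0.3",
    ]
)
print(args.beam_size, args.api)  # -> 20 v1 (v1 is the default beam search API)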
espnet/bin/asr_train.py
ADDED
@@ -0,0 +1,644 @@
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# encoding: utf-8
|
3 |
+
|
4 |
+
# Copyright 2017 Tomoki Hayashi (Nagoya University)
|
5 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
6 |
+
|
7 |
+
"""Automatic speech recognition model training script."""
|
8 |
+
|
9 |
+
import logging
|
10 |
+
import os
|
11 |
+
import random
|
12 |
+
import subprocess
|
13 |
+
import sys
|
14 |
+
|
15 |
+
from distutils.version import LooseVersion
|
16 |
+
|
17 |
+
import configargparse
|
18 |
+
import numpy as np
|
19 |
+
import torch
|
20 |
+
|
21 |
+
from espnet import __version__
|
22 |
+
from espnet.utils.cli_utils import strtobool
|
23 |
+
from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
|
24 |
+
|
25 |
+
is_torch_1_2_plus = LooseVersion(torch.__version__) >= LooseVersion("1.2")
|
26 |
+
|
27 |
+
|
28 |
+
# NOTE: you need this func to generate our sphinx doc
|
29 |
+
def get_parser(parser=None, required=True):
|
30 |
+
"""Get default arguments."""
|
31 |
+
if parser is None:
|
32 |
+
parser = configargparse.ArgumentParser(
|
33 |
+
description="Train an automatic speech recognition (ASR) model on one CPU, "
|
34 |
+
"one or multiple GPUs",
|
35 |
+
config_file_parser_class=configargparse.YAMLConfigFileParser,
|
36 |
+
formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
|
37 |
+
)
|
38 |
+
# general configuration
|
39 |
+
parser.add("--config", is_config_file=True, help="config file path")
|
40 |
+
parser.add(
|
41 |
+
"--config2",
|
42 |
+
is_config_file=True,
|
43 |
+
help="second config file path that overwrites the settings in `--config`.",
|
44 |
+
)
|
45 |
+
parser.add(
|
46 |
+
"--config3",
|
47 |
+
is_config_file=True,
|
48 |
+
help="third config file path that overwrites the settings in "
|
49 |
+
"`--config` and `--config2`.",
|
50 |
+
)
|
51 |
+
|
52 |
+
parser.add_argument(
|
53 |
+
"--ngpu",
|
54 |
+
default=None,
|
55 |
+
type=int,
|
56 |
+
help="Number of GPUs. If not given, use all visible devices",
|
57 |
+
)
|
58 |
+
parser.add_argument(
|
59 |
+
"--train-dtype",
|
60 |
+
default="float32",
|
61 |
+
choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
|
62 |
+
help="Data type for training (only pytorch backend). "
|
63 |
+
"O0,O1,.. flags require apex. "
|
64 |
+
"See https://nvidia.github.io/apex/amp.html#opt-levels",
|
65 |
+
)
|
66 |
+
parser.add_argument(
|
67 |
+
"--backend",
|
68 |
+
default="chainer",
|
69 |
+
type=str,
|
70 |
+
choices=["chainer", "pytorch"],
|
71 |
+
help="Backend library",
|
72 |
+
)
|
73 |
+
parser.add_argument(
|
74 |
+
"--outdir", type=str, required=required, help="Output directory"
|
75 |
+
)
|
76 |
+
parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
|
77 |
+
parser.add_argument("--dict", required=required, help="Dictionary")
|
78 |
+
parser.add_argument("--seed", default=1, type=int, help="Random seed")
|
79 |
+
parser.add_argument("--debugdir", type=str, help="Output directory for debugging")
|
80 |
+
parser.add_argument(
|
81 |
+
"--resume",
|
82 |
+
"-r",
|
83 |
+
default="",
|
84 |
+
nargs="?",
|
85 |
+
help="Resume the training from snapshot",
|
86 |
+
)
|
87 |
+
parser.add_argument(
|
88 |
+
"--minibatches",
|
89 |
+
"-N",
|
90 |
+
type=int,
|
91 |
+
default="-1",
|
92 |
+
help="Process only N minibatches (for debug)",
|
93 |
+
)
|
94 |
+
parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
|
95 |
+
parser.add_argument(
|
96 |
+
"--tensorboard-dir",
|
97 |
+
default=None,
|
98 |
+
type=str,
|
99 |
+
nargs="?",
|
100 |
+
help="Tensorboard log dir path",
|
101 |
+
)
|
102 |
+
parser.add_argument(
|
103 |
+
"--report-interval-iters",
|
104 |
+
default=100,
|
105 |
+
type=int,
|
106 |
+
help="Report interval iterations",
|
107 |
+
)
|
108 |
+
parser.add_argument(
|
109 |
+
"--save-interval-iters",
|
110 |
+
default=0,
|
111 |
+
type=int,
|
112 |
+
help="Save snapshot interval iterations",
|
113 |
+
)
|
114 |
+
# task related
|
115 |
+
parser.add_argument(
|
116 |
+
"--train-json",
|
117 |
+
type=str,
|
118 |
+
default=None,
|
119 |
+
help="Filename of train label data (json)",
|
120 |
+
)
|
121 |
+
parser.add_argument(
|
122 |
+
"--valid-json",
|
123 |
+
type=str,
|
124 |
+
default=None,
|
125 |
+
help="Filename of validation label data (json)",
|
126 |
+
)
|
127 |
+
# network architecture
|
128 |
+
parser.add_argument(
|
129 |
+
"--model-module",
|
130 |
+
type=str,
|
131 |
+
default=None,
|
132 |
+
help="model defined module (default: espnet.nets.xxx_backend.e2e_asr:E2E)",
|
133 |
+
)
|
134 |
+
# encoder
|
135 |
+
parser.add_argument(
|
136 |
+
"--num-encs", default=1, type=int, help="Number of encoders in the model."
|
137 |
+
)
|
138 |
+
# loss related
|
139 |
+
parser.add_argument(
|
140 |
+
"--ctc_type",
|
141 |
+
default="warpctc",
|
142 |
+
type=str,
|
143 |
+
choices=["builtin", "warpctc", "gtnctc", "cudnnctc"],
|
144 |
+
help="Type of CTC implementation to calculate loss.",
|
145 |
+
)
|
146 |
+
parser.add_argument(
|
147 |
+
"--mtlalpha",
|
148 |
+
default=0.5,
|
149 |
+
type=float,
|
150 |
+
help="Multitask learning coefficient, "
|
151 |
+
"alpha: alpha*ctc_loss + (1-alpha)*att_loss ",
|
152 |
+
)
|
153 |
+
parser.add_argument(
|
154 |
+
"--lsm-weight", default=0.0, type=float, help="Label smoothing weight"
|
155 |
+
)
|
156 |
+
# recognition options to compute CER/WER
|
157 |
+
parser.add_argument(
|
158 |
+
"--report-cer",
|
159 |
+
default=False,
|
160 |
+
action="store_true",
|
161 |
+
help="Compute CER on development set",
|
162 |
+
)
|
163 |
+
parser.add_argument(
|
164 |
+
"--report-wer",
|
165 |
+
default=False,
|
166 |
+
action="store_true",
|
167 |
+
help="Compute WER on development set",
|
168 |
+
)
|
169 |
+
parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
|
170 |
+
parser.add_argument("--beam-size", type=int, default=4, help="Beam size")
|
171 |
+
parser.add_argument("--penalty", default=0.0, type=float, help="Incertion penalty")
|
172 |
+
parser.add_argument(
|
173 |
+
"--maxlenratio",
|
174 |
+
default=0.0,
|
175 |
+
type=float,
|
176 |
+
help="""Input length ratio to obtain max output length.
|
177 |
+
If maxlenratio=0.0 (default), it uses an end-detect function
|
178 |
+
to automatically find maximum hypothesis lengths""",
|
179 |
+
)
|
180 |
+
parser.add_argument(
|
181 |
+
"--minlenratio",
|
182 |
+
default=0.0,
|
183 |
+
type=float,
|
184 |
+
help="Input length ratio to obtain min output length",
|
185 |
+
)
|
186 |
+
parser.add_argument(
|
187 |
+
"--ctc-weight", default=0.3, type=float, help="CTC weight in joint decoding"
|
188 |
+
)
|
189 |
+
parser.add_argument(
|
190 |
+
"--rnnlm", type=str, default=None, help="RNNLM model file to read"
|
191 |
+
)
|
192 |
+
parser.add_argument(
|
193 |
+
"--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
|
194 |
+
)
|
195 |
+
parser.add_argument("--lm-weight", default=0.1, type=float, help="RNNLM weight.")
|
196 |
+
parser.add_argument("--sym-space", default="<space>", type=str, help="Space symbol")
|
197 |
+
parser.add_argument("--sym-blank", default="<blank>", type=str, help="Blank symbol")
|
198 |
+
# minibatch related
|
199 |
+
parser.add_argument(
|
200 |
+
"--sortagrad",
|
201 |
+
default=0,
|
202 |
+
type=int,
|
203 |
+
nargs="?",
|
204 |
+
help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
|
205 |
+
)
|
206 |
+
parser.add_argument(
|
207 |
+
"--batch-count",
|
208 |
+
default="auto",
|
209 |
+
choices=BATCH_COUNT_CHOICES,
|
210 |
+
help="How to count batch_size. "
|
211 |
+
"The default (auto) will find how to count by args.",
|
212 |
+
)
|
213 |
+
parser.add_argument(
|
214 |
+
"--batch-size",
|
215 |
+
"--batch-seqs",
|
216 |
+
"-b",
|
217 |
+
default=0,
|
218 |
+
type=int,
|
219 |
+
help="Maximum seqs in a minibatch (0 to disable)",
|
220 |
+
)
|
221 |
+
parser.add_argument(
|
222 |
+
"--batch-bins",
|
223 |
+
default=0,
|
224 |
+
type=int,
|
225 |
+
help="Maximum bins in a minibatch (0 to disable)",
|
226 |
+
)
|
227 |
+
parser.add_argument(
|
228 |
+
"--batch-frames-in",
|
229 |
+
default=0,
|
230 |
+
type=int,
|
231 |
+
help="Maximum input frames in a minibatch (0 to disable)",
|
232 |
+
)
|
233 |
+
parser.add_argument(
|
234 |
+
"--batch-frames-out",
|
235 |
+
default=0,
|
236 |
+
type=int,
|
237 |
+
help="Maximum output frames in a minibatch (0 to disable)",
|
238 |
+
)
|
239 |
+
parser.add_argument(
|
240 |
+
"--batch-frames-inout",
|
241 |
+
default=0,
|
242 |
+
type=int,
|
243 |
+
help="Maximum input+output frames in a minibatch (0 to disable)",
|
244 |
+
)
|
245 |
+
parser.add_argument(
|
246 |
+
"--maxlen-in",
|
247 |
+
"--batch-seq-maxlen-in",
|
248 |
+
default=800,
|
249 |
+
type=int,
|
250 |
+
metavar="ML",
|
251 |
+
help="When --batch-count=seq, "
|
252 |
+
"batch size is reduced if the input sequence length > ML.",
|
253 |
+
)
|
254 |
+
parser.add_argument(
|
255 |
+
"--maxlen-out",
|
256 |
+
"--batch-seq-maxlen-out",
|
257 |
+
default=150,
|
258 |
+
type=int,
|
259 |
+
metavar="ML",
|
260 |
+
help="When --batch-count=seq, "
|
261 |
+
"batch size is reduced if the output sequence length > ML",
|
262 |
+
)
|
263 |
+
parser.add_argument(
|
264 |
+
"--n-iter-processes",
|
265 |
+
default=0,
|
266 |
+
type=int,
|
267 |
+
help="Number of processes of iterator",
|
268 |
+
)
|
269 |
+
parser.add_argument(
|
270 |
+
"--preprocess-conf",
|
271 |
+
type=str,
|
272 |
+
default=None,
|
273 |
+
nargs="?",
|
274 |
+
help="The configuration file for the pre-processing",
|
275 |
+
)
|
276 |
+
# optimization related
|
277 |
+
parser.add_argument(
|
278 |
+
"--opt",
|
279 |
+
default="adadelta",
|
280 |
+
type=str,
|
281 |
+
choices=["adadelta", "adam", "noam"],
|
282 |
+
help="Optimizer",
|
283 |
+
)
|
284 |
+
parser.add_argument(
|
285 |
+
"--accum-grad", default=1, type=int, help="Number of gradient accumuration"
|
286 |
+
)
|
287 |
+
parser.add_argument(
|
288 |
+
"--eps", default=1e-8, type=float, help="Epsilon constant for optimizer"
|
289 |
+
)
|
290 |
+
parser.add_argument(
|
291 |
+
"--eps-decay", default=0.01, type=float, help="Decaying ratio of epsilon"
|
292 |
+
)
|
293 |
+
parser.add_argument(
|
294 |
+
"--weight-decay", default=0.0, type=float, help="Weight decay ratio"
|
295 |
+
)
|
296 |
+
parser.add_argument(
|
297 |
+
"--criterion",
|
298 |
+
default="acc",
|
299 |
+
type=str,
|
300 |
+
choices=["loss", "loss_eps_decay_only", "acc"],
|
301 |
+
help="Criterion to perform epsilon decay",
|
302 |
+
)
|
303 |
+
parser.add_argument(
|
304 |
+
"--threshold", default=1e-4, type=float, help="Threshold to stop iteration"
|
305 |
+
)
|
306 |
+
parser.add_argument(
|
307 |
+
"--epochs", "-e", default=30, type=int, help="Maximum number of epochs"
|
308 |
+
)
|
309 |
+
parser.add_argument(
|
310 |
+
"--early-stop-criterion",
|
311 |
+
default="validation/main/acc",
|
312 |
+
type=str,
|
313 |
+
nargs="?",
|
314 |
+
help="Value to monitor to trigger an early stopping of the training",
|
315 |
+
)
|
316 |
+
parser.add_argument(
|
317 |
+
"--patience",
|
318 |
+
default=3,
|
319 |
+
type=int,
|
320 |
+
nargs="?",
|
321 |
+
help="Number of epochs to wait without improvement "
|
322 |
+
"before stopping the training",
|
323 |
+
)
|
324 |
+
parser.add_argument(
|
325 |
+
"--grad-clip", default=5, type=float, help="Gradient norm threshold to clip"
|
326 |
+
)
|
327 |
+
parser.add_argument(
|
328 |
+
"--num-save-attention",
|
329 |
+
default=3,
|
330 |
+
type=int,
|
331 |
+
help="Number of samples of attention to be saved",
|
332 |
+
)
|
333 |
+
parser.add_argument(
|
334 |
+
"--num-save-ctc",
|
335 |
+
default=3,
|
336 |
+
type=int,
|
337 |
+
help="Number of samples of CTC probability to be saved",
|
338 |
+
)
|
339 |
+
parser.add_argument(
|
340 |
+
"--grad-noise",
|
341 |
+
type=strtobool,
|
342 |
+
default=False,
|
343 |
+
help="The flag to switch to use noise injection to gradients during training",
|
344 |
+
)
|
345 |
+
# asr_mix related
|
346 |
+
parser.add_argument(
|
347 |
+
"--num-spkrs",
|
348 |
+
default=1,
|
349 |
+
type=int,
|
350 |
+
choices=[1, 2],
|
351 |
+
help="Number of speakers in the speech.",
|
352 |
+
)
|
353 |
+
# decoder related
|
354 |
+
parser.add_argument(
|
355 |
+
"--context-residual",
|
356 |
+
default=False,
|
357 |
+
type=strtobool,
|
358 |
+
nargs="?",
|
359 |
+
help="The flag to switch to use context vector residual in the decoder network",
|
360 |
+
)
|
361 |
+
# finetuning related
|
362 |
+
parser.add_argument(
|
363 |
+
"--enc-init",
|
364 |
+
default=None,
|
365 |
+
type=str,
|
366 |
+
help="Pre-trained ASR model to initialize encoder.",
|
367 |
+
)
|
368 |
+
parser.add_argument(
|
369 |
+
"--enc-init-mods",
|
370 |
+
default="enc.enc.",
|
371 |
+
type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
|
372 |
+
help="List of encoder modules to initialize, separated by a comma.",
|
373 |
+
)
|
374 |
+
parser.add_argument(
|
375 |
+
"--dec-init",
|
376 |
+
default=None,
|
377 |
+
type=str,
|
378 |
+
help="Pre-trained ASR, MT or LM model to initialize decoder.",
|
379 |
+
)
|
380 |
+
parser.add_argument(
|
381 |
+
"--dec-init-mods",
|
382 |
+
default="att.,dec.",
|
383 |
+
type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
|
384 |
+
help="List of decoder modules to initialize, separated by a comma.",
|
385 |
+
)
|
386 |
+
parser.add_argument(
|
387 |
+
"--freeze-mods",
|
388 |
+
default=None,
|
389 |
+
type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
|
390 |
+
help="List of modules to freeze, separated by a comma.",
|
391 |
+
)
|
392 |
+
# front end related
|
393 |
+
parser.add_argument(
|
394 |
+
"--use-frontend",
|
395 |
+
type=strtobool,
|
396 |
+
default=False,
|
397 |
+
help="The flag to switch to use frontend system.",
|
398 |
+
)
|
399 |
+
|
400 |
+
# WPE related
|
401 |
+
parser.add_argument(
|
402 |
+
"--use-wpe",
|
403 |
+
type=strtobool,
|
404 |
+
default=False,
|
405 |
+
help="Apply Weighted Prediction Error",
|
406 |
+
)
|
407 |
+
parser.add_argument(
|
408 |
+
"--wtype",
|
409 |
+
default="blstmp",
|
410 |
+
type=str,
|
411 |
+
choices=[
|
412 |
+
"lstm",
|
413 |
+
"blstm",
|
414 |
+
"lstmp",
|
415 |
+
"blstmp",
|
416 |
+
"vgglstmp",
|
417 |
+
"vggblstmp",
|
418 |
+
"vgglstm",
|
419 |
+
"vggblstm",
|
420 |
+
"gru",
|
421 |
+
"bgru",
|
422 |
+
"grup",
|
423 |
+
"bgrup",
|
424 |
+
"vgggrup",
|
425 |
+
"vggbgrup",
|
426 |
+
"vgggru",
|
427 |
+
"vggbgru",
|
428 |
+
],
|
429 |
+
help="Type of encoder network architecture "
|
430 |
+
"of the mask estimator for WPE. "
|
431 |
+
"",
|
432 |
+
)
|
433 |
+
parser.add_argument("--wlayers", type=int, default=2, help="")
|
434 |
+
parser.add_argument("--wunits", type=int, default=300, help="")
|
435 |
+
parser.add_argument("--wprojs", type=int, default=300, help="")
|
436 |
+
parser.add_argument("--wdropout-rate", type=float, default=0.0, help="")
|
437 |
+
parser.add_argument("--wpe-taps", type=int, default=5, help="")
|
438 |
+
parser.add_argument("--wpe-delay", type=int, default=3, help="")
|
439 |
+
parser.add_argument(
|
440 |
+
"--use-dnn-mask-for-wpe",
|
441 |
+
type=strtobool,
|
442 |
+
default=False,
|
443 |
+
help="Use DNN to estimate the power spectrogram. "
|
444 |
+
"This option is experimental.",
|
445 |
+
)
|
446 |
+
# Beamformer related
|
447 |
+
parser.add_argument("--use-beamformer", type=strtobool, default=True, help="")
|
448 |
+
parser.add_argument(
|
449 |
+
"--btype",
|
450 |
+
default="blstmp",
|
451 |
+
type=str,
|
452 |
+
choices=[
|
453 |
+
"lstm",
|
454 |
+
"blstm",
|
455 |
+
"lstmp",
|
456 |
+
"blstmp",
|
457 |
+
"vgglstmp",
|
458 |
+
"vggblstmp",
|
459 |
+
"vgglstm",
|
460 |
+
"vggblstm",
|
461 |
+
"gru",
|
462 |
+
"bgru",
|
463 |
+
"grup",
|
464 |
+
"bgrup",
|
465 |
+
"vgggrup",
|
466 |
+
"vggbgrup",
|
467 |
+
"vgggru",
|
468 |
+
"vggbgru",
|
469 |
+
],
|
470 |
+
help="Type of encoder network architecture "
|
471 |
+
"of the mask estimator for Beamformer.",
|
472 |
+
)
|
473 |
+
parser.add_argument("--blayers", type=int, default=2, help="")
|
474 |
+
parser.add_argument("--bunits", type=int, default=300, help="")
|
475 |
+
parser.add_argument("--bprojs", type=int, default=300, help="")
|
476 |
+
parser.add_argument("--badim", type=int, default=320, help="")
|
477 |
+
parser.add_argument(
|
478 |
+
"--bnmask",
|
479 |
+
type=int,
|
480 |
+
default=2,
|
481 |
+
help="Number of beamforming masks, " "default is 2 for [speech, noise].",
|
482 |
+
)
|
483 |
+
parser.add_argument(
|
484 |
+
"--ref-channel",
|
485 |
+
type=int,
|
486 |
+
default=-1,
|
487 |
+
help="The reference channel used for beamformer. "
|
488 |
+
"By default, the channel is estimated by DNN.",
|
489 |
+
)
|
490 |
+
parser.add_argument("--bdropout-rate", type=float, default=0.0, help="")
|
491 |
+
# Feature transform: Normalization
|
492 |
+
parser.add_argument(
|
493 |
+
"--stats-file",
|
494 |
+
type=str,
|
495 |
+
default=None,
|
496 |
+
help="The stats file for the feature normalization",
|
497 |
+
)
|
498 |
+
parser.add_argument(
|
499 |
+
"--apply-uttmvn",
|
500 |
+
type=strtobool,
|
501 |
+
default=True,
|
502 |
+
help="Apply utterance level mean " "variance normalization.",
|
503 |
+
)
|
504 |
+
parser.add_argument("--uttmvn-norm-means", type=strtobool, default=True, help="")
|
505 |
+
parser.add_argument("--uttmvn-norm-vars", type=strtobool, default=False, help="")
|
506 |
+
# Feature transform: Fbank
|
507 |
+
parser.add_argument(
|
508 |
+
"--fbank-fs",
|
509 |
+
type=int,
|
510 |
+
default=16000,
|
511 |
+
help="The sample frequency used for " "the mel-fbank creation.",
|
512 |
+
)
|
513 |
+
parser.add_argument(
|
514 |
+
"--n-mels", type=int, default=80, help="The number of mel-frequency bins."
|
515 |
+
)
|
516 |
+
parser.add_argument("--fbank-fmin", type=float, default=0.0, help="")
|
517 |
+
parser.add_argument("--fbank-fmax", type=float, default=None, help="")
|
518 |
+
return parser
|
519 |
+
|
520 |
+
|
521 |
+
def main(cmd_args):
|
522 |
+
"""Run the main training function."""
|
523 |
+
parser = get_parser()
|
524 |
+
args, _ = parser.parse_known_args(cmd_args)
|
525 |
+
if args.backend == "chainer" and args.train_dtype != "float32":
|
526 |
+
raise NotImplementedError(
|
527 |
+
f"chainer backend does not support --train-dtype {args.train_dtype}."
|
528 |
+
"Use --dtype float32."
|
529 |
+
)
|
530 |
+
if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
|
531 |
+
raise ValueError(
|
532 |
+
f"--train-dtype {args.train_dtype} does not support the CPU backend."
|
533 |
+
)
|
534 |
+
|
535 |
+
from espnet.utils.dynamic_import import dynamic_import
|
536 |
+
|
537 |
+
if args.model_module is None:
|
538 |
+
if args.num_spkrs == 1:
|
539 |
+
model_module = "espnet.nets." + args.backend + "_backend.e2e_asr:E2E"
|
540 |
+
else:
|
541 |
+
model_module = "espnet.nets." + args.backend + "_backend.e2e_asr_mix:E2E"
|
542 |
+
else:
|
543 |
+
model_module = args.model_module
|
544 |
+
model_class = dynamic_import(model_module)
|
545 |
+
model_class.add_arguments(parser)
|
546 |
+
|
547 |
+
args = parser.parse_args(cmd_args)
|
548 |
+
args.model_module = model_module
|
549 |
+
if "chainer_backend" in args.model_module:
|
550 |
+
args.backend = "chainer"
|
551 |
+
if "pytorch_backend" in args.model_module:
|
552 |
+
args.backend = "pytorch"
|
553 |
+
|
554 |
+
# add version info in args
|
555 |
+
args.version = __version__
|
556 |
+
|
557 |
+
# logging info
|
558 |
+
if args.verbose > 0:
|
559 |
+
logging.basicConfig(
|
560 |
+
level=logging.INFO,
|
561 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
562 |
+
)
|
563 |
+
else:
|
564 |
+
logging.basicConfig(
|
565 |
+
level=logging.WARN,
|
566 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
567 |
+
)
|
568 |
+
logging.warning("Skip DEBUG/INFO messages")
|
569 |
+
|
570 |
+
# If --ngpu is not given,
|
571 |
+
# 1. if CUDA_VISIBLE_DEVICES is set, all visible devices
|
572 |
+
# 2. if nvidia-smi exists, use all devices
|
573 |
+
# 3. else ngpu=0
|
574 |
+
if args.ngpu is None:
|
575 |
+
cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
|
576 |
+
if cvd is not None:
|
577 |
+
ngpu = len(cvd.split(","))
|
578 |
+
else:
|
579 |
+
logging.warning("CUDA_VISIBLE_DEVICES is not set.")
|
580 |
+
try:
|
581 |
+
p = subprocess.run(
|
582 |
+
["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
583 |
+
)
|
584 |
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
585 |
+
ngpu = 0
|
586 |
+
else:
|
587 |
+
ngpu = len(p.stderr.decode().split("\n")) - 1
|
588 |
+
else:
|
589 |
+
if is_torch_1_2_plus and args.ngpu != 1:
|
590 |
+
logging.debug(
|
591 |
+
"There are some bugs with multi-GPU processing in PyTorch 1.2+"
|
592 |
+
+ " (see https://github.com/pytorch/pytorch/issues/21108)"
|
593 |
+
)
|
594 |
+
ngpu = args.ngpu
|
595 |
+
logging.info(f"ngpu: {ngpu}")
|
596 |
+
|
597 |
+
# display PYTHONPATH
|
598 |
+
logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
|
599 |
+
|
600 |
+
# set random seed
|
601 |
+
logging.info("random seed = %d" % args.seed)
|
602 |
+
random.seed(args.seed)
|
603 |
+
np.random.seed(args.seed)
|
604 |
+
|
605 |
+
# load dictionary for debug log
|
606 |
+
if args.dict is not None:
|
607 |
+
with open(args.dict, "rb") as f:
|
608 |
+
dictionary = f.readlines()
|
609 |
+
char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
|
610 |
+
char_list.insert(0, "<blank>")
|
611 |
+
char_list.append("<eos>")
|
612 |
+
# for non-autoregressive maskctc model
|
613 |
+
if "maskctc" in args.model_module:
|
614 |
+
char_list.append("<mask>")
|
615 |
+
args.char_list = char_list
|
616 |
+
else:
|
617 |
+
args.char_list = None
|
618 |
+
|
619 |
+
# train
|
620 |
+
logging.info("backend = " + args.backend)
|
621 |
+
|
622 |
+
if args.num_spkrs == 1:
|
623 |
+
if args.backend == "chainer":
|
624 |
+
from espnet.asr.chainer_backend.asr import train
|
625 |
+
|
626 |
+
train(args)
|
627 |
+
elif args.backend == "pytorch":
|
628 |
+
from espnet.asr.pytorch_backend.asr import train
|
629 |
+
|
630 |
+
train(args)
|
631 |
+
else:
|
632 |
+
raise ValueError("Only chainer and pytorch are supported.")
|
633 |
+
else:
|
634 |
+
# FIXME(kamo): Support --model-module
|
635 |
+
if args.backend == "pytorch":
|
636 |
+
from espnet.asr.pytorch_backend.asr_mix import train
|
637 |
+
|
638 |
+
train(args)
|
639 |
+
else:
|
640 |
+
raise ValueError("Only pytorch is supported.")
|
641 |
+
|
642 |
+
|
643 |
+
if __name__ == "__main__":
|
644 |
+
main(sys.argv[1:])
|
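The --ngpu handling near the end of main() above falls back from CUDA_VISIBLE_DEVICES to nvidia-smi before defaulting to CPU. The following standalone sketch mirrors that decision order; it is not part of the repo, and unlike the script above (which counts lines from the captured stderr) it counts the "GPU n: ..." lines that nvidia-smi -L prints on stdout.

# Standalone sketch of the GPU auto-detection fallback (illustration only).
import os
import subprocess


def guess_ngpu():
    """Return a GPU count: CUDA_VISIBLE_DEVICES first, then nvidia-smi, else 0."""
    cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
    if cvd is not None:
        return len(cvd.split(","))
    try:
        p = subprocess.run(
            ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        return 0
    # nvidia-smi -L prints one "GPU n: <name> (UUID: ...)" line per device
    return len([ln for ln in p.stdout.decode().splitlines() if ln.startswith("GPU")])


print(guess_ngpu())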
espnet/bin/lm_train.py
ADDED
@@ -0,0 +1,288 @@
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
|
4 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
5 |
+
|
6 |
+
# This code is ported from the following implementation written in Torch.
|
7 |
+
# https://github.com/chainer/chainer/blob/master/examples/ptb/train_ptb_custom_loop.py
|
8 |
+
|
9 |
+
"""Language model training script."""
|
10 |
+
|
11 |
+
import logging
|
12 |
+
import os
|
13 |
+
import random
|
14 |
+
import subprocess
|
15 |
+
import sys
|
16 |
+
|
17 |
+
import configargparse
|
18 |
+
import numpy as np
|
19 |
+
|
20 |
+
from espnet import __version__
|
21 |
+
from espnet.nets.lm_interface import dynamic_import_lm
|
22 |
+
from espnet.optimizer.factory import dynamic_import_optimizer
|
23 |
+
from espnet.scheduler.scheduler import dynamic_import_scheduler
|
24 |
+
|
25 |
+
|
26 |
+
# NOTE: you need this func to generate our sphinx doc
|
27 |
+
def get_parser(parser=None, required=True):
|
28 |
+
"""Get parser."""
|
29 |
+
if parser is None:
|
30 |
+
parser = configargparse.ArgumentParser(
|
31 |
+
description="Train a new language model on one CPU or one GPU",
|
32 |
+
config_file_parser_class=configargparse.YAMLConfigFileParser,
|
33 |
+
formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
|
34 |
+
)
|
35 |
+
# general configuration
|
36 |
+
parser.add("--config", is_config_file=True, help="config file path")
|
37 |
+
parser.add(
|
38 |
+
"--config2",
|
39 |
+
is_config_file=True,
|
40 |
+
help="second config file path that overwrites the settings in `--config`.",
|
41 |
+
)
|
42 |
+
parser.add(
|
43 |
+
"--config3",
|
44 |
+
is_config_file=True,
|
45 |
+
help="third config file path that overwrites the settings "
|
46 |
+
"in `--config` and `--config2`.",
|
47 |
+
)
|
48 |
+
|
49 |
+
parser.add_argument(
|
50 |
+
"--ngpu",
|
51 |
+
default=None,
|
52 |
+
type=int,
|
53 |
+
help="Number of GPUs. If not given, use all visible devices",
|
54 |
+
)
|
55 |
+
parser.add_argument(
|
56 |
+
"--train-dtype",
|
57 |
+
default="float32",
|
58 |
+
choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
|
59 |
+
help="Data type for training (only pytorch backend). "
|
60 |
+
"O0,O1,.. flags require apex. "
|
61 |
+
"See https://nvidia.github.io/apex/amp.html#opt-levels",
|
62 |
+
)
|
63 |
+
parser.add_argument(
|
64 |
+
"--backend",
|
65 |
+
default="chainer",
|
66 |
+
type=str,
|
67 |
+
choices=["chainer", "pytorch"],
|
68 |
+
help="Backend library",
|
69 |
+
)
|
70 |
+
parser.add_argument(
|
71 |
+
"--outdir", type=str, required=required, help="Output directory"
|
72 |
+
)
|
73 |
+
parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
|
74 |
+
parser.add_argument("--dict", type=str, required=required, help="Dictionary")
|
75 |
+
parser.add_argument("--seed", default=1, type=int, help="Random seed")
|
76 |
+
parser.add_argument(
|
77 |
+
"--resume",
|
78 |
+
"-r",
|
79 |
+
default="",
|
80 |
+
nargs="?",
|
81 |
+
help="Resume the training from snapshot",
|
82 |
+
)
|
83 |
+
parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
|
84 |
+
parser.add_argument(
|
85 |
+
"--tensorboard-dir",
|
86 |
+
default=None,
|
87 |
+
type=str,
|
88 |
+
nargs="?",
|
89 |
+
help="Tensorboard log dir path",
|
90 |
+
)
|
91 |
+
parser.add_argument(
|
92 |
+
"--report-interval-iters",
|
93 |
+
default=100,
|
94 |
+
type=int,
|
95 |
+
help="Report interval iterations",
|
96 |
+
)
|
97 |
+
# task related
|
98 |
+
parser.add_argument(
|
99 |
+
"--train-label",
|
100 |
+
type=str,
|
101 |
+
required=required,
|
102 |
+
help="Filename of train label data",
|
103 |
+
)
|
104 |
+
parser.add_argument(
|
105 |
+
"--valid-label",
|
106 |
+
type=str,
|
107 |
+
required=required,
|
108 |
+
help="Filename of validation label data",
|
109 |
+
)
|
110 |
+
parser.add_argument("--test-label", type=str, help="Filename of test label data")
|
111 |
+
parser.add_argument(
|
112 |
+
"--dump-hdf5-path",
|
113 |
+
type=str,
|
114 |
+
default=None,
|
115 |
+
help="Path to dump a preprocessed dataset as hdf5",
|
116 |
+
)
|
117 |
+
# training configuration
|
118 |
+
parser.add_argument("--opt", default="sgd", type=str, help="Optimizer")
|
119 |
+
parser.add_argument(
|
120 |
+
"--sortagrad",
|
121 |
+
default=0,
|
122 |
+
type=int,
|
123 |
+
nargs="?",
|
124 |
+
help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
|
125 |
+
)
|
126 |
+
parser.add_argument(
|
127 |
+
"--batchsize",
|
128 |
+
"-b",
|
129 |
+
type=int,
|
130 |
+
default=300,
|
131 |
+
help="Number of examples in each mini-batch",
|
132 |
+
)
|
133 |
+
parser.add_argument(
|
134 |
+
"--accum-grad", type=int, default=1, help="Number of gradient accumueration"
|
135 |
+
)
|
136 |
+
parser.add_argument(
|
137 |
+
"--epoch",
|
138 |
+
"-e",
|
139 |
+
type=int,
|
140 |
+
default=20,
|
141 |
+
help="Number of sweeps over the dataset to train",
|
142 |
+
)
|
143 |
+
parser.add_argument(
|
144 |
+
"--early-stop-criterion",
|
145 |
+
default="validation/main/loss",
|
146 |
+
type=str,
|
147 |
+
nargs="?",
|
148 |
+
help="Value to monitor to trigger an early stopping of the training",
|
149 |
+
)
|
150 |
+
parser.add_argument(
|
151 |
+
"--patience",
|
152 |
+
default=3,
|
153 |
+
type=int,
|
154 |
+
nargs="?",
|
155 |
+
help="Number of epochs "
|
156 |
+
"to wait without improvement before stopping the training",
|
157 |
+
)
|
158 |
+
parser.add_argument(
|
159 |
+
"--schedulers",
|
160 |
+
default=None,
|
161 |
+
action="append",
|
162 |
+
type=lambda kv: kv.split("="),
|
163 |
+
help="optimizer schedulers, you can configure params like:"
|
164 |
+
" <optimizer-param>-<scheduler-name>-<schduler-param>"
|
165 |
+
' e.g., "--schedulers lr=noam --lr-noam-warmup 1000".',
|
166 |
+
)
|
167 |
+
parser.add_argument(
|
168 |
+
"--gradclip",
|
169 |
+
"-c",
|
170 |
+
type=float,
|
171 |
+
default=5,
|
172 |
+
help="Gradient norm threshold to clip",
|
173 |
+
)
|
174 |
+
parser.add_argument(
|
175 |
+
"--maxlen",
|
176 |
+
type=int,
|
177 |
+
default=40,
|
178 |
+
help="Batch size is reduced if the input sequence > ML",
|
179 |
+
)
|
180 |
+
parser.add_argument(
|
181 |
+
"--model-module",
|
182 |
+
type=str,
|
183 |
+
default="default",
|
184 |
+
help="model defined module "
|
185 |
+
"(default: espnet.nets.xxx_backend.lm.default:DefaultRNNLM)",
|
186 |
+
)
|
187 |
+
return parser
|
188 |
+
|
189 |
+
|
190 |
+
def main(cmd_args):
|
191 |
+
"""Train LM."""
|
192 |
+
parser = get_parser()
|
193 |
+
args, _ = parser.parse_known_args(cmd_args)
|
194 |
+
if args.backend == "chainer" and args.train_dtype != "float32":
|
195 |
+
raise NotImplementedError(
|
196 |
+
f"chainer backend does not support --train-dtype {args.train_dtype}."
|
197 |
+
"Use --dtype float32."
|
198 |
+
)
|
199 |
+
if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
|
200 |
+
raise ValueError(
|
201 |
+
f"--train-dtype {args.train_dtype} does not support the CPU backend."
|
202 |
+
)
|
203 |
+
|
204 |
+
# parse arguments dynamically
|
205 |
+
model_class = dynamic_import_lm(args.model_module, args.backend)
|
206 |
+
model_class.add_arguments(parser)
|
207 |
+
if args.schedulers is not None:
|
208 |
+
for k, v in args.schedulers:
|
209 |
+
scheduler_class = dynamic_import_scheduler(v)
|
210 |
+
scheduler_class.add_arguments(k, parser)
|
211 |
+
|
212 |
+
opt_class = dynamic_import_optimizer(args.opt, args.backend)
|
213 |
+
opt_class.add_arguments(parser)
|
214 |
+
|
215 |
+
args = parser.parse_args(cmd_args)
|
216 |
+
|
217 |
+
# add version info in args
|
218 |
+
args.version = __version__
|
219 |
+
|
220 |
+
# logging info
|
221 |
+
if args.verbose > 0:
|
222 |
+
logging.basicConfig(
|
223 |
+
level=logging.INFO,
|
224 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
225 |
+
)
|
226 |
+
else:
|
227 |
+
logging.basicConfig(
|
228 |
+
level=logging.WARN,
|
229 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
230 |
+
)
|
231 |
+
logging.warning("Skip DEBUG/INFO messages")
|
232 |
+
|
233 |
+
# If --ngpu is not given,
|
234 |
+
# 1. if CUDA_VISIBLE_DEVICES is set, all visible devices
|
235 |
+
# 2. if nvidia-smi exists, use all devices
|
236 |
+
# 3. else ngpu=0
|
237 |
+
if args.ngpu is None:
|
238 |
+
cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
|
239 |
+
if cvd is not None:
|
240 |
+
ngpu = len(cvd.split(","))
|
241 |
+
else:
|
242 |
+
logging.warning("CUDA_VISIBLE_DEVICES is not set.")
|
243 |
+
try:
|
244 |
+
p = subprocess.run(
|
245 |
+
["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
246 |
+
)
|
247 |
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
248 |
+
ngpu = 0
|
249 |
+
else:
|
250 |
+
ngpu = len(p.stderr.decode().split("\n")) - 1
|
251 |
+
args.ngpu = ngpu
|
252 |
+
else:
|
253 |
+
ngpu = args.ngpu
|
254 |
+
logging.info(f"ngpu: {ngpu}")
|
255 |
+
|
256 |
+
# display PYTHONPATH
|
257 |
+
logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
|
258 |
+
|
259 |
+
# seed setting
|
260 |
+
nseed = args.seed
|
261 |
+
random.seed(nseed)
|
262 |
+
np.random.seed(nseed)
|
263 |
+
|
264 |
+
# load dictionary
|
265 |
+
with open(args.dict, "rb") as f:
|
266 |
+
dictionary = f.readlines()
|
267 |
+
char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
|
268 |
+
char_list.insert(0, "<blank>")
|
269 |
+
char_list.append("<eos>")
|
270 |
+
args.char_list_dict = {x: i for i, x in enumerate(char_list)}
|
271 |
+
args.n_vocab = len(char_list)
|
272 |
+
|
273 |
+
# train
|
274 |
+
logging.info("backend = " + args.backend)
|
275 |
+
if args.backend == "chainer":
|
276 |
+
from espnet.lm.chainer_backend.lm import train
|
277 |
+
|
278 |
+
train(args)
|
279 |
+
elif args.backend == "pytorch":
|
280 |
+
from espnet.lm.pytorch_backend.lm import train
|
281 |
+
|
282 |
+
train(args)
|
283 |
+
else:
|
284 |
+
raise ValueError("Only chainer and pytorch are supported.")
|
285 |
+
|
286 |
+
|
287 |
+
if __name__ == "__main__":
|
288 |
+
main(sys.argv[1:])
|
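The dictionary handling in main() above (char_list, char_list_dict, n_vocab) is easy to reproduce in isolation. The snippet below is a minimal sketch with made-up tokens fed in as byte strings, standing in for the lines read from the --dict file; it is not part of the repo.

# Minimal sketch of the --dict post-processing above (tokens are made up).
dictionary = [b"a 1\n", b"b 2\n", b"<space> 3\n"]  # stand-in for open(args.dict, "rb").readlines()
char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
char_list.insert(0, "<blank>")
char_list.append("<eos>")
char_list_dict = {x: i for i, x in enumerate(char_list)}
n_vocab = len(char_list)
print(char_list_dict)  # {'<blank>': 0, 'a': 1, 'b': 2, '<space>': 3, '<eos>': 4}
print(n_vocab)         # 5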
espnet/bin/mt_train.py
ADDED
@@ -0,0 +1,480 @@
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# encoding: utf-8
|
3 |
+
|
4 |
+
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
|
5 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
6 |
+
|
7 |
+
"""Neural machine translation model training script."""
|
8 |
+
|
9 |
+
import logging
|
10 |
+
import os
|
11 |
+
import random
|
12 |
+
import subprocess
|
13 |
+
import sys
|
14 |
+
|
15 |
+
from distutils.version import LooseVersion
|
16 |
+
|
17 |
+
import configargparse
|
18 |
+
import numpy as np
|
19 |
+
import torch
|
20 |
+
|
21 |
+
from espnet import __version__
|
22 |
+
from espnet.utils.cli_utils import strtobool
|
23 |
+
from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
|
24 |
+
|
25 |
+
is_torch_1_2_plus = LooseVersion(torch.__version__) >= LooseVersion("1.2")
|
26 |
+
|
27 |
+
|
28 |
+
# NOTE: you need this func to generate our sphinx doc
|
29 |
+
def get_parser(parser=None, required=True):
|
30 |
+
"""Get default arguments."""
|
31 |
+
if parser is None:
|
32 |
+
parser = configargparse.ArgumentParser(
|
33 |
+
description="Train a neural machine translation (NMT) model on one CPU, "
|
34 |
+
"one or multiple GPUs",
|
35 |
+
config_file_parser_class=configargparse.YAMLConfigFileParser,
|
36 |
+
formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
|
37 |
+
)
|
38 |
+
# general configuration
|
39 |
+
parser.add("--config", is_config_file=True, help="config file path")
|
40 |
+
parser.add(
|
41 |
+
"--config2",
|
42 |
+
is_config_file=True,
|
43 |
+
help="second config file path that overwrites the settings in `--config`.",
|
44 |
+
)
|
45 |
+
parser.add(
|
46 |
+
"--config3",
|
47 |
+
is_config_file=True,
|
48 |
+
help="third config file path that overwrites the settings "
|
49 |
+
"in `--config` and `--config2`.",
|
50 |
+
)
|
51 |
+
|
52 |
+
parser.add_argument(
|
53 |
+
"--ngpu",
|
54 |
+
default=None,
|
55 |
+
type=int,
|
56 |
+
help="Number of GPUs. If not given, use all visible devices",
|
57 |
+
)
|
58 |
+
parser.add_argument(
|
59 |
+
"--train-dtype",
|
60 |
+
default="float32",
|
61 |
+
choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
|
62 |
+
help="Data type for training (only pytorch backend). "
|
63 |
+
"O0,O1,.. flags require apex. "
|
64 |
+
"See https://nvidia.github.io/apex/amp.html#opt-levels",
|
65 |
+
)
|
66 |
+
parser.add_argument(
|
67 |
+
"--backend",
|
68 |
+
default="chainer",
|
69 |
+
type=str,
|
70 |
+
choices=["chainer", "pytorch"],
|
71 |
+
help="Backend library",
|
72 |
+
)
|
73 |
+
parser.add_argument(
|
74 |
+
"--outdir", type=str, required=required, help="Output directory"
|
75 |
+
)
|
76 |
+
parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
|
77 |
+
parser.add_argument(
|
78 |
+
"--dict", required=required, help="Dictionary for source/target languages"
|
79 |
+
)
|
80 |
+
parser.add_argument("--seed", default=1, type=int, help="Random seed")
|
81 |
+
parser.add_argument("--debugdir", type=str, help="Output directory for debugging")
|
82 |
+
parser.add_argument(
|
83 |
+
"--resume",
|
84 |
+
"-r",
|
85 |
+
default="",
|
86 |
+
nargs="?",
|
87 |
+
help="Resume the training from snapshot",
|
88 |
+
)
|
89 |
+
parser.add_argument(
|
90 |
+
"--minibatches",
|
91 |
+
"-N",
|
92 |
+
type=int,
|
93 |
+
default="-1",
|
94 |
+
help="Process only N minibatches (for debug)",
|
95 |
+
)
|
96 |
+
parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
|
97 |
+
parser.add_argument(
|
98 |
+
"--tensorboard-dir",
|
99 |
+
default=None,
|
100 |
+
type=str,
|
101 |
+
nargs="?",
|
102 |
+
help="Tensorboard log dir path",
|
103 |
+
)
|
104 |
+
parser.add_argument(
|
105 |
+
"--report-interval-iters",
|
106 |
+
default=100,
|
107 |
+
type=int,
|
108 |
+
help="Report interval iterations",
|
109 |
+
)
|
110 |
+
parser.add_argument(
|
111 |
+
"--save-interval-iters",
|
112 |
+
default=0,
|
113 |
+
type=int,
|
114 |
+
help="Save snapshot interval iterations",
|
115 |
+
)
|
116 |
+
# task related
|
117 |
+
parser.add_argument(
|
118 |
+
"--train-json",
|
119 |
+
type=str,
|
120 |
+
default=None,
|
121 |
+
help="Filename of train label data (json)",
|
122 |
+
)
|
123 |
+
parser.add_argument(
|
124 |
+
"--valid-json",
|
125 |
+
type=str,
|
126 |
+
default=None,
|
127 |
+
help="Filename of validation label data (json)",
|
128 |
+
)
|
129 |
+
# network architecture
|
130 |
+
parser.add_argument(
|
131 |
+
"--model-module",
|
132 |
+
type=str,
|
133 |
+
default=None,
|
134 |
+
help="model defined module (default: espnet.nets.xxx_backend.e2e_mt:E2E)",
|
135 |
+
)
|
136 |
+
# loss related
|
137 |
+
parser.add_argument(
|
138 |
+
"--lsm-weight", default=0.0, type=float, help="Label smoothing weight"
|
139 |
+
)
|
140 |
+
# translations options to compute BLEU
|
141 |
+
parser.add_argument(
|
142 |
+
"--report-bleu",
|
143 |
+
default=True,
|
144 |
+
action="store_true",
|
145 |
+
help="Compute BLEU on development set",
|
146 |
+
)
|
147 |
+
parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
|
148 |
+
parser.add_argument("--beam-size", type=int, default=4, help="Beam size")
|
149 |
+
parser.add_argument("--penalty", default=0.0, type=float, help="Incertion penalty")
|
150 |
+
parser.add_argument(
|
151 |
+
"--maxlenratio",
|
152 |
+
default=0.0,
|
153 |
+
type=float,
|
154 |
+
help="""Input length ratio to obtain max output length.
|
155 |
+
If maxlenratio=0.0 (default), it uses an end-detect function
|
156 |
+
to automatically find maximum hypothesis lengths""",
|
157 |
+
)
|
158 |
+
parser.add_argument(
|
159 |
+
"--minlenratio",
|
160 |
+
default=0.0,
|
161 |
+
type=float,
|
162 |
+
help="Input length ratio to obtain min output length",
|
163 |
+
)
|
164 |
+
parser.add_argument(
|
165 |
+
"--rnnlm", type=str, default=None, help="RNNLM model file to read"
|
166 |
+
)
|
167 |
+
parser.add_argument(
|
168 |
+
"--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
|
169 |
+
)
|
170 |
+
parser.add_argument("--lm-weight", default=0.0, type=float, help="RNNLM weight.")
|
171 |
+
parser.add_argument("--sym-space", default="<space>", type=str, help="Space symbol")
|
172 |
+
parser.add_argument("--sym-blank", default="<blank>", type=str, help="Blank symbol")
|
173 |
+
# minibatch related
|
174 |
+
parser.add_argument(
|
175 |
+
"--sortagrad",
|
176 |
+
default=0,
|
177 |
+
type=int,
|
178 |
+
nargs="?",
|
179 |
+
help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
|
180 |
+
)
|
181 |
+
parser.add_argument(
|
182 |
+
"--batch-count",
|
183 |
+
default="auto",
|
184 |
+
choices=BATCH_COUNT_CHOICES,
|
185 |
+
help="How to count batch_size. "
|
186 |
+
"The default (auto) will find how to count by args.",
|
187 |
+
)
|
188 |
+
parser.add_argument(
|
189 |
+
"--batch-size",
|
190 |
+
"--batch-seqs",
|
191 |
+
"-b",
|
192 |
+
default=0,
|
193 |
+
type=int,
|
194 |
+
help="Maximum seqs in a minibatch (0 to disable)",
|
195 |
+
)
|
196 |
+
parser.add_argument(
|
197 |
+
"--batch-bins",
|
198 |
+
default=0,
|
199 |
+
type=int,
|
200 |
+
help="Maximum bins in a minibatch (0 to disable)",
|
201 |
+
)
|
202 |
+
parser.add_argument(
|
203 |
+
"--batch-frames-in",
|
204 |
+
default=0,
|
205 |
+
type=int,
|
206 |
+
help="Maximum input frames in a minibatch (0 to disable)",
|
207 |
+
)
|
208 |
+
parser.add_argument(
|
209 |
+
"--batch-frames-out",
|
210 |
+
default=0,
|
211 |
+
type=int,
|
212 |
+
help="Maximum output frames in a minibatch (0 to disable)",
|
213 |
+
)
|
214 |
+
parser.add_argument(
|
215 |
+
"--batch-frames-inout",
|
216 |
+
default=0,
|
217 |
+
type=int,
|
218 |
+
help="Maximum input+output frames in a minibatch (0 to disable)",
|
219 |
+
)
|
220 |
+
parser.add_argument(
|
221 |
+
"--maxlen-in",
|
222 |
+
"--batch-seq-maxlen-in",
|
223 |
+
default=100,
|
224 |
+
type=int,
|
225 |
+
metavar="ML",
|
226 |
+
help="When --batch-count=seq, "
|
227 |
+
"batch size is reduced if the input sequence length > ML.",
|
228 |
+
)
|
229 |
+
parser.add_argument(
|
230 |
+
"--maxlen-out",
|
231 |
+
"--batch-seq-maxlen-out",
|
232 |
+
default=100,
|
233 |
+
type=int,
|
234 |
+
metavar="ML",
|
235 |
+
help="When --batch-count=seq, "
|
236 |
+
"batch size is reduced if the output sequence length > ML",
|
237 |
+
)
|
238 |
+
parser.add_argument(
|
239 |
+
"--n-iter-processes",
|
240 |
+
default=0,
|
241 |
+
type=int,
|
242 |
+
help="Number of processes of iterator",
|
243 |
+
)
|
244 |
+
# optimization related
|
245 |
+
parser.add_argument(
|
246 |
+
"--opt",
|
247 |
+
default="adadelta",
|
248 |
+
type=str,
|
249 |
+
choices=["adadelta", "adam", "noam"],
|
250 |
+
help="Optimizer",
|
251 |
+
)
|
252 |
+
parser.add_argument(
|
253 |
+
"--accum-grad", default=1, type=int, help="Number of gradient accumuration"
|
254 |
+
)
|
255 |
+
parser.add_argument(
|
256 |
+
"--eps", default=1e-8, type=float, help="Epsilon constant for optimizer"
|
257 |
+
)
|
258 |
+
parser.add_argument(
|
259 |
+
"--eps-decay", default=0.01, type=float, help="Decaying ratio of epsilon"
|
260 |
+
)
|
261 |
+
parser.add_argument(
|
262 |
+
"--lr", default=1e-3, type=float, help="Learning rate for optimizer"
|
263 |
+
)
|
264 |
+
parser.add_argument(
|
265 |
+
"--lr-decay", default=1.0, type=float, help="Decaying ratio of learning rate"
|
266 |
+
)
|
267 |
+
parser.add_argument(
|
268 |
+
"--weight-decay", default=0.0, type=float, help="Weight decay ratio"
|
269 |
+
)
|
270 |
+
parser.add_argument(
|
271 |
+
"--criterion",
|
272 |
+
default="acc",
|
273 |
+
type=str,
|
274 |
+
choices=["loss", "acc"],
|
275 |
+
help="Criterion to perform epsilon decay",
|
276 |
+
)
|
277 |
+
parser.add_argument(
|
278 |
+
"--threshold", default=1e-4, type=float, help="Threshold to stop iteration"
|
279 |
+
)
|
280 |
+
parser.add_argument(
|
281 |
+
"--epochs", "-e", default=30, type=int, help="Maximum number of epochs"
|
282 |
+
)
|
283 |
+
parser.add_argument(
|
284 |
+
"--early-stop-criterion",
|
285 |
+
default="validation/main/acc",
|
286 |
+
type=str,
|
287 |
+
nargs="?",
|
288 |
+
help="Value to monitor to trigger an early stopping of the training",
|
289 |
+
)
|
290 |
+
parser.add_argument(
|
291 |
+
"--patience",
|
292 |
+
default=3,
|
293 |
+
type=int,
|
294 |
+
nargs="?",
|
295 |
+
help="Number of epochs to wait "
|
296 |
+
"without improvement before stopping the training",
|
297 |
+
)
|
298 |
+
parser.add_argument(
|
299 |
+
"--grad-clip", default=5, type=float, help="Gradient norm threshold to clip"
|
300 |
+
)
|
301 |
+
parser.add_argument(
|
302 |
+
"--num-save-attention",
|
303 |
+
default=3,
|
304 |
+
type=int,
|
305 |
+
help="Number of samples of attention to be saved",
|
306 |
+
)
|
307 |
+
# decoder related
|
308 |
+
parser.add_argument(
|
309 |
+
"--context-residual",
|
310 |
+
default=False,
|
311 |
+
type=strtobool,
|
312 |
+
nargs="?",
|
313 |
+
help="The flag to switch to use context vector residual in the decoder network",
|
314 |
+
)
|
315 |
+
parser.add_argument(
|
316 |
+
"--tie-src-tgt-embedding",
|
317 |
+
default=False,
|
318 |
+
type=strtobool,
|
319 |
+
nargs="?",
|
320 |
+
help="Tie parameters of source embedding and target embedding.",
|
321 |
+
)
|
322 |
+
parser.add_argument(
|
323 |
+
"--tie-classifier",
|
324 |
+
default=False,
|
325 |
+
type=strtobool,
|
326 |
+
nargs="?",
|
327 |
+
help="Tie parameters of target embedding and output projection layer.",
|
328 |
+
)
|
329 |
+
# finetuning related
|
330 |
+
parser.add_argument(
|
331 |
+
"--enc-init",
|
332 |
+
default=None,
|
333 |
+
type=str,
|
334 |
+
nargs="?",
|
335 |
+
help="Pre-trained ASR model to initialize encoder.",
|
336 |
+
)
|
337 |
+
parser.add_argument(
|
338 |
+
"--enc-init-mods",
|
339 |
+
default="enc.enc.",
|
340 |
+
type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
|
341 |
+
help="List of encoder modules to initialize, separated by a comma.",
|
342 |
+
)
|
343 |
+
parser.add_argument(
|
344 |
+
"--dec-init",
|
345 |
+
default=None,
|
346 |
+
type=str,
|
347 |
+
nargs="?",
|
348 |
+
help="Pre-trained ASR, MT or LM model to initialize decoder.",
|
349 |
+
)
|
350 |
+
parser.add_argument(
|
351 |
+
"--dec-init-mods",
|
352 |
+
default="att., dec.",
|
353 |
+
type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
|
354 |
+
help="List of decoder modules to initialize, separated by a comma.",
|
355 |
+
)
|
356 |
+
# multilingual related
|
357 |
+
parser.add_argument(
|
358 |
+
"--multilingual",
|
359 |
+
default=False,
|
360 |
+
type=strtobool,
|
361 |
+
help="Prepend target language ID to the source sentence. "
|
362 |
+
"Both source/target language IDs must be prepend in the pre-processing stage.",
|
363 |
+
)
|
364 |
+
parser.add_argument(
|
365 |
+
"--replace-sos",
|
366 |
+
default=False,
|
367 |
+
type=strtobool,
|
368 |
+
help="Replace <sos> in the decoder with a target language ID "
|
369 |
+
"(the first token in the target sequence)",
|
370 |
+
)
|
371 |
+
|
372 |
+
return parser
|
373 |
+
|
374 |
+
|
375 |
+
def main(cmd_args):
|
376 |
+
"""Run the main training function."""
|
377 |
+
parser = get_parser()
|
378 |
+
args, _ = parser.parse_known_args(cmd_args)
|
379 |
+
if args.backend == "chainer" and args.train_dtype != "float32":
|
380 |
+
raise NotImplementedError(
|
381 |
+
f"chainer backend does not support --train-dtype {args.train_dtype}."
|
382 |
+
"Use --dtype float32."
|
383 |
+
)
|
384 |
+
if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
|
385 |
+
raise ValueError(
|
386 |
+
f"--train-dtype {args.train_dtype} does not support the CPU backend."
|
387 |
+
)
|
388 |
+
|
389 |
+
from espnet.utils.dynamic_import import dynamic_import
|
390 |
+
|
391 |
+
if args.model_module is None:
|
392 |
+
model_module = "espnet.nets." + args.backend + "_backend.e2e_mt:E2E"
|
393 |
+
else:
|
394 |
+
model_module = args.model_module
|
395 |
+
model_class = dynamic_import(model_module)
|
396 |
+
model_class.add_arguments(parser)
|
397 |
+
|
398 |
+
args = parser.parse_args(cmd_args)
|
399 |
+
args.model_module = model_module
|
400 |
+
if "chainer_backend" in args.model_module:
|
401 |
+
args.backend = "chainer"
|
402 |
+
if "pytorch_backend" in args.model_module:
|
403 |
+
args.backend = "pytorch"
|
404 |
+
|
405 |
+
# add version info in args
|
406 |
+
args.version = __version__
|
407 |
+
|
408 |
+
# logging info
|
409 |
+
if args.verbose > 0:
|
410 |
+
logging.basicConfig(
|
411 |
+
level=logging.INFO,
|
412 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
413 |
+
)
|
414 |
+
else:
|
415 |
+
logging.basicConfig(
|
416 |
+
level=logging.WARN,
|
417 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
418 |
+
)
|
419 |
+
logging.warning("Skip DEBUG/INFO messages")
|
420 |
+
|
421 |
+
# If --ngpu is not given,
|
422 |
+
# 1. if CUDA_VISIBLE_DEVICES is set, all visible devices
|
423 |
+
# 2. if nvidia-smi exists, use all devices
|
424 |
+
# 3. else ngpu=0
|
425 |
+
if args.ngpu is None:
|
426 |
+
cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
|
427 |
+
if cvd is not None:
|
428 |
+
ngpu = len(cvd.split(","))
|
429 |
+
else:
|
430 |
+
logging.warning("CUDA_VISIBLE_DEVICES is not set.")
|
431 |
+
try:
|
432 |
+
p = subprocess.run(
|
433 |
+
["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
434 |
+
)
|
435 |
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
436 |
+
ngpu = 0
|
437 |
+
else:
|
438 |
+
ngpu = len(p.stderr.decode().split("\n")) - 1
|
439 |
+
args.ngpu = ngpu
|
440 |
+
else:
|
441 |
+
if is_torch_1_2_plus and args.ngpu != 1:
|
442 |
+
logging.debug(
|
443 |
+
"There are some bugs with multi-GPU processing in PyTorch 1.2+"
|
444 |
+
+ " (see https://github.com/pytorch/pytorch/issues/21108)"
|
445 |
+
)
|
446 |
+
ngpu = args.ngpu
|
447 |
+
logging.info(f"ngpu: {ngpu}")
|
448 |
+
|
449 |
+
# display PYTHONPATH
|
450 |
+
logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
|
451 |
+
|
452 |
+
# set random seed
|
453 |
+
logging.info("random seed = %d" % args.seed)
|
454 |
+
random.seed(args.seed)
|
455 |
+
np.random.seed(args.seed)
|
456 |
+
|
457 |
+
# load dictionary for debug log
|
458 |
+
if args.dict is not None:
|
459 |
+
with open(args.dict, "rb") as f:
|
460 |
+
dictionary = f.readlines()
|
461 |
+
char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
|
462 |
+
char_list.insert(0, "<blank>")
|
463 |
+
char_list.append("<eos>")
|
464 |
+
args.char_list = char_list
|
465 |
+
else:
|
466 |
+
args.char_list = None
|
467 |
+
|
468 |
+
# train
|
469 |
+
logging.info("backend = " + args.backend)
|
470 |
+
|
471 |
+
if args.backend == "pytorch":
|
472 |
+
from espnet.mt.pytorch_backend.mt import train
|
473 |
+
|
474 |
+
train(args)
|
475 |
+
else:
|
476 |
+
raise ValueError("Only pytorch are supported.")
|
477 |
+
|
478 |
+
|
479 |
+
if __name__ == "__main__":
|
480 |
+
main(sys.argv[1:])
|
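The main() above resolves the model class from --model-module via dynamic_import, re-parses the full argument set, and, when --ngpu is not given, auto-detects the GPU count (CUDA_VISIBLE_DEVICES first, then `nvidia-smi -L`, otherwise CPU only). A minimal standalone sketch of that detection policy, not lifted verbatim from the script:

import os
import subprocess

def detect_ngpu() -> int:
    # Mirror of the auto-detection order used in main() above (sketch only).
    cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
    if cvd is not None:
        return len(cvd.split(","))
    try:
        p = subprocess.run(
            ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        return 0
    # one "GPU N: ..." line per visible device in the listing
    return len([line for line in p.stdout.decode().split("\n") if line.strip()])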
espnet/bin/mt_trans.py
ADDED
@@ -0,0 +1,186 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
# encoding: utf-8
|
3 |
+
|
4 |
+
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
|
5 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
6 |
+
|
7 |
+
"""Neural machine translation model decoding script."""
|
8 |
+
|
9 |
+
import configargparse
|
10 |
+
import logging
|
11 |
+
import os
|
12 |
+
import random
|
13 |
+
import sys
|
14 |
+
|
15 |
+
import numpy as np
|
16 |
+
|
17 |
+
|
18 |
+
# NOTE: you need this func to generate our sphinx doc
|
19 |
+
def get_parser():
|
20 |
+
"""Get default arguments."""
|
21 |
+
parser = configargparse.ArgumentParser(
|
22 |
+
description="Translate text from speech "
|
23 |
+
"using a speech translation model on one CPU or GPU",
|
24 |
+
config_file_parser_class=configargparse.YAMLConfigFileParser,
|
25 |
+
formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
|
26 |
+
)
|
27 |
+
# general configuration
|
28 |
+
parser.add("--config", is_config_file=True, help="Config file path")
|
29 |
+
parser.add(
|
30 |
+
"--config2",
|
31 |
+
is_config_file=True,
|
32 |
+
help="Second config file path that overwrites the settings in `--config`",
|
33 |
+
)
|
34 |
+
parser.add(
|
35 |
+
"--config3",
|
36 |
+
is_config_file=True,
|
37 |
+
help="Third config file path "
|
38 |
+
"that overwrites the settings in `--config` and `--config2`",
|
39 |
+
)
|
40 |
+
|
41 |
+
parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs")
|
42 |
+
parser.add_argument(
|
43 |
+
"--dtype",
|
44 |
+
choices=("float16", "float32", "float64"),
|
45 |
+
default="float32",
|
46 |
+
help="Float precision (only available in --api v2)",
|
47 |
+
)
|
48 |
+
parser.add_argument(
|
49 |
+
"--backend",
|
50 |
+
type=str,
|
51 |
+
default="chainer",
|
52 |
+
choices=["chainer", "pytorch"],
|
53 |
+
help="Backend library",
|
54 |
+
)
|
55 |
+
parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
|
56 |
+
parser.add_argument("--seed", type=int, default=1, help="Random seed")
|
57 |
+
parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option")
|
58 |
+
parser.add_argument(
|
59 |
+
"--batchsize",
|
60 |
+
type=int,
|
61 |
+
default=1,
|
62 |
+
help="Batch size for beam search (0: means no batch processing)",
|
63 |
+
)
|
64 |
+
parser.add_argument(
|
65 |
+
"--preprocess-conf",
|
66 |
+
type=str,
|
67 |
+
default=None,
|
68 |
+
help="The configuration file for the pre-processing",
|
69 |
+
)
|
70 |
+
parser.add_argument(
|
71 |
+
"--api",
|
72 |
+
default="v1",
|
73 |
+
choices=["v1", "v2"],
|
74 |
+
help="Beam search APIs "
|
75 |
+
"v1: Default API. It only supports "
|
76 |
+
"the ASRInterface.recognize method and DefaultRNNLM. "
|
77 |
+
"v2: Experimental API. "
|
78 |
+
"It supports any models that implements ScorerInterface.",
|
79 |
+
)
|
80 |
+
# task related
|
81 |
+
parser.add_argument(
|
82 |
+
"--trans-json", type=str, help="Filename of translation data (json)"
|
83 |
+
)
|
84 |
+
parser.add_argument(
|
85 |
+
"--result-label",
|
86 |
+
type=str,
|
87 |
+
required=True,
|
88 |
+
help="Filename of result label data (json)",
|
89 |
+
)
|
90 |
+
# model (parameter) related
|
91 |
+
parser.add_argument(
|
92 |
+
"--model", type=str, required=True, help="Model file parameters to read"
|
93 |
+
)
|
94 |
+
parser.add_argument(
|
95 |
+
"--model-conf", type=str, default=None, help="Model config file"
|
96 |
+
)
|
97 |
+
# search related
|
98 |
+
parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
|
99 |
+
parser.add_argument("--beam-size", type=int, default=1, help="Beam size")
|
100 |
+
parser.add_argument("--penalty", type=float, default=0.1, help="Incertion penalty")
|
101 |
+
parser.add_argument(
|
102 |
+
"--maxlenratio",
|
103 |
+
type=float,
|
104 |
+
default=3.0,
|
105 |
+
help="""Input length ratio to obtain max output length.
|
106 |
+
If maxlenratio=0.0 (default), it uses an end-detect function
|
107 |
+
to automatically find maximum hypothesis lengths""",
|
108 |
+
)
|
109 |
+
parser.add_argument(
|
110 |
+
"--minlenratio",
|
111 |
+
type=float,
|
112 |
+
default=0.0,
|
113 |
+
help="Input length ratio to obtain min output length",
|
114 |
+
)
|
115 |
+
# multilingual related
|
116 |
+
parser.add_argument(
|
117 |
+
"--tgt-lang",
|
118 |
+
default=False,
|
119 |
+
type=str,
|
120 |
+
help="target language ID (e.g., <en>, <de>, and <fr> etc.)",
|
121 |
+
)
|
122 |
+
return parser
|
123 |
+
|
124 |
+
|
125 |
+
def main(args):
|
126 |
+
"""Run the main decoding function."""
|
127 |
+
parser = get_parser()
|
128 |
+
args = parser.parse_args(args)
|
129 |
+
|
130 |
+
# logging info
|
131 |
+
if args.verbose == 1:
|
132 |
+
logging.basicConfig(
|
133 |
+
level=logging.INFO,
|
134 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
135 |
+
)
|
136 |
+
elif args.verbose == 2:
|
137 |
+
logging.basicConfig(
|
138 |
+
level=logging.DEBUG,
|
139 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
140 |
+
)
|
141 |
+
else:
|
142 |
+
logging.basicConfig(
|
143 |
+
level=logging.WARN,
|
144 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
145 |
+
)
|
146 |
+
logging.warning("Skip DEBUG/INFO messages")
|
147 |
+
|
148 |
+
# check CUDA_VISIBLE_DEVICES
|
149 |
+
if args.ngpu > 0:
|
150 |
+
cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
|
151 |
+
if cvd is None:
|
152 |
+
logging.warning("CUDA_VISIBLE_DEVICES is not set.")
|
153 |
+
elif args.ngpu != len(cvd.split(",")):
|
154 |
+
logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
|
155 |
+
sys.exit(1)
|
156 |
+
|
157 |
+
# TODO(mn5k): support of multiple GPUs
|
158 |
+
if args.ngpu > 1:
|
159 |
+
logging.error("The program only supports ngpu=1.")
|
160 |
+
sys.exit(1)
|
161 |
+
|
162 |
+
# display PYTHONPATH
|
163 |
+
logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
|
164 |
+
|
165 |
+
# seed setting
|
166 |
+
random.seed(args.seed)
|
167 |
+
np.random.seed(args.seed)
|
168 |
+
logging.info("set random seed = %d" % args.seed)
|
169 |
+
|
170 |
+
# trans
|
171 |
+
logging.info("backend = " + args.backend)
|
172 |
+
if args.backend == "pytorch":
|
173 |
+
# Experimental API that supports custom LMs
|
174 |
+
from espnet.mt.pytorch_backend.mt import trans
|
175 |
+
|
176 |
+
if args.dtype != "float32":
|
177 |
+
raise NotImplementedError(
|
178 |
+
f"`--dtype {args.dtype}` is only available with `--api v2`"
|
179 |
+
)
|
180 |
+
trans(args)
|
181 |
+
else:
|
182 |
+
raise ValueError("Only pytorch are supported.")
|
183 |
+
|
184 |
+
|
185 |
+
if __name__ == "__main__":
|
186 |
+
main(sys.argv[1:])
|
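For orientation, a hypothetical way to drive the MT decoding entry point above from Python; every path below is a placeholder and a trained model plus data json are assumed to already exist:

from espnet.bin.mt_trans import main

main([
    "--backend", "pytorch",
    "--trans-json", "dump/test/data.json",        # placeholder path
    "--result-label", "exp/decode/result.json",   # placeholder path
    "--model", "exp/mt_model/model.acc.best",     # placeholder path
    "--beam-size", "5",
    "--verbose", "1",
])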
espnet/bin/st_train.py
ADDED
@@ -0,0 +1,550 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
# encoding: utf-8
|
3 |
+
|
4 |
+
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
|
5 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
6 |
+
|
7 |
+
"""End-to-end speech translation model training script."""
|
8 |
+
|
9 |
+
from distutils.version import LooseVersion
|
10 |
+
import logging
|
11 |
+
import os
|
12 |
+
import random
|
13 |
+
import subprocess
|
14 |
+
import sys
|
15 |
+
|
16 |
+
import configargparse
|
17 |
+
import numpy as np
|
18 |
+
import torch
|
19 |
+
|
20 |
+
from espnet import __version__
|
21 |
+
from espnet.utils.cli_utils import strtobool
|
22 |
+
from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
|
23 |
+
|
24 |
+
is_torch_1_2_plus = LooseVersion(torch.__version__) >= LooseVersion("1.2")
|
25 |
+
|
26 |
+
|
27 |
+
# NOTE: you need this func to generate our sphinx doc
|
28 |
+
def get_parser(parser=None, required=True):
|
29 |
+
"""Get default arguments."""
|
30 |
+
if parser is None:
|
31 |
+
parser = configargparse.ArgumentParser(
|
32 |
+
description="Train a speech translation (ST) model on one CPU, "
|
33 |
+
"one or multiple GPUs",
|
34 |
+
config_file_parser_class=configargparse.YAMLConfigFileParser,
|
35 |
+
formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
|
36 |
+
)
|
37 |
+
# general configuration
|
38 |
+
parser.add("--config", is_config_file=True, help="config file path")
|
39 |
+
parser.add(
|
40 |
+
"--config2",
|
41 |
+
is_config_file=True,
|
42 |
+
help="second config file path that overwrites the settings in `--config`.",
|
43 |
+
)
|
44 |
+
parser.add(
|
45 |
+
"--config3",
|
46 |
+
is_config_file=True,
|
47 |
+
help="third config file path that overwrites the settings "
|
48 |
+
"in `--config` and `--config2`.",
|
49 |
+
)
|
50 |
+
|
51 |
+
parser.add_argument(
|
52 |
+
"--ngpu",
|
53 |
+
default=None,
|
54 |
+
type=int,
|
55 |
+
help="Number of GPUs. If not given, use all visible devices",
|
56 |
+
)
|
57 |
+
parser.add_argument(
|
58 |
+
"--train-dtype",
|
59 |
+
default="float32",
|
60 |
+
choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
|
61 |
+
help="Data type for training (only pytorch backend). "
|
62 |
+
"O0,O1,.. flags require apex. "
|
63 |
+
"See https://nvidia.github.io/apex/amp.html#opt-levels",
|
64 |
+
)
|
65 |
+
parser.add_argument(
|
66 |
+
"--backend",
|
67 |
+
default="chainer",
|
68 |
+
type=str,
|
69 |
+
choices=["chainer", "pytorch"],
|
70 |
+
help="Backend library",
|
71 |
+
)
|
72 |
+
parser.add_argument(
|
73 |
+
"--outdir", type=str, required=required, help="Output directory"
|
74 |
+
)
|
75 |
+
parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
|
76 |
+
parser.add_argument("--dict", required=required, help="Dictionary")
|
77 |
+
parser.add_argument("--seed", default=1, type=int, help="Random seed")
|
78 |
+
parser.add_argument("--debugdir", type=str, help="Output directory for debugging")
|
79 |
+
parser.add_argument(
|
80 |
+
"--resume",
|
81 |
+
"-r",
|
82 |
+
default="",
|
83 |
+
nargs="?",
|
84 |
+
help="Resume the training from snapshot",
|
85 |
+
)
|
86 |
+
parser.add_argument(
|
87 |
+
"--minibatches",
|
88 |
+
"-N",
|
89 |
+
type=int,
|
90 |
+
default="-1",
|
91 |
+
help="Process only N minibatches (for debug)",
|
92 |
+
)
|
93 |
+
parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
|
94 |
+
parser.add_argument(
|
95 |
+
"--tensorboard-dir",
|
96 |
+
default=None,
|
97 |
+
type=str,
|
98 |
+
nargs="?",
|
99 |
+
help="Tensorboard log dir path",
|
100 |
+
)
|
101 |
+
parser.add_argument(
|
102 |
+
"--report-interval-iters",
|
103 |
+
default=100,
|
104 |
+
type=int,
|
105 |
+
help="Report interval iterations",
|
106 |
+
)
|
107 |
+
parser.add_argument(
|
108 |
+
"--save-interval-iters",
|
109 |
+
default=0,
|
110 |
+
type=int,
|
111 |
+
help="Save snapshot interval iterations",
|
112 |
+
)
|
113 |
+
# task related
|
114 |
+
parser.add_argument(
|
115 |
+
"--train-json",
|
116 |
+
type=str,
|
117 |
+
default=None,
|
118 |
+
help="Filename of train label data (json)",
|
119 |
+
)
|
120 |
+
parser.add_argument(
|
121 |
+
"--valid-json",
|
122 |
+
type=str,
|
123 |
+
default=None,
|
124 |
+
help="Filename of validation label data (json)",
|
125 |
+
)
|
126 |
+
# network architecture
|
127 |
+
parser.add_argument(
|
128 |
+
"--model-module",
|
129 |
+
type=str,
|
130 |
+
default=None,
|
131 |
+
help="model defined module (default: espnet.nets.xxx_backend.e2e_st:E2E)",
|
132 |
+
)
|
133 |
+
# loss related
|
134 |
+
parser.add_argument(
|
135 |
+
"--ctc_type",
|
136 |
+
default="warpctc",
|
137 |
+
type=str,
|
138 |
+
choices=["builtin", "warpctc", "gtnctc", "cudnnctc"],
|
139 |
+
help="Type of CTC implementation to calculate loss.",
|
140 |
+
)
|
141 |
+
parser.add_argument(
|
142 |
+
"--mtlalpha",
|
143 |
+
default=0.0,
|
144 |
+
type=float,
|
145 |
+
help="Multitask learning coefficient, alpha: \
|
146 |
+
alpha*ctc_loss + (1-alpha)*att_loss",
|
147 |
+
)
|
148 |
+
parser.add_argument(
|
149 |
+
"--asr-weight",
|
150 |
+
default=0.0,
|
151 |
+
type=float,
|
152 |
+
help="Multitask learning coefficient for ASR task, weight: "
|
153 |
+
" asr_weight*(alpha*ctc_loss + (1-alpha)*att_loss)"
|
154 |
+
" + (1-asr_weight-mt_weight)*st_loss",
|
155 |
+
)
|
156 |
+
parser.add_argument(
|
157 |
+
"--mt-weight",
|
158 |
+
default=0.0,
|
159 |
+
type=float,
|
160 |
+
help="Multitask learning coefficient for MT task, weight: \
|
161 |
+
mt_weight*mt_loss + (1-mt_weight-asr_weight)*st_loss",
|
162 |
+
)
|
163 |
+
parser.add_argument(
|
164 |
+
"--lsm-weight", default=0.0, type=float, help="Label smoothing weight"
|
165 |
+
)
|
166 |
+
# recognition options to compute CER/WER
|
167 |
+
parser.add_argument(
|
168 |
+
"--report-cer",
|
169 |
+
default=False,
|
170 |
+
action="store_true",
|
171 |
+
help="Compute CER on development set",
|
172 |
+
)
|
173 |
+
parser.add_argument(
|
174 |
+
"--report-wer",
|
175 |
+
default=False,
|
176 |
+
action="store_true",
|
177 |
+
help="Compute WER on development set",
|
178 |
+
)
|
179 |
+
# translations options to compute BLEU
|
180 |
+
parser.add_argument(
|
181 |
+
"--report-bleu",
|
182 |
+
default=True,
|
183 |
+
action="store_true",
|
184 |
+
help="Compute BLEU on development set",
|
185 |
+
)
|
186 |
+
parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
|
187 |
+
parser.add_argument("--beam-size", type=int, default=4, help="Beam size")
|
188 |
+
parser.add_argument("--penalty", default=0.0, type=float, help="Incertion penalty")
|
189 |
+
parser.add_argument(
|
190 |
+
"--maxlenratio",
|
191 |
+
default=0.0,
|
192 |
+
type=float,
|
193 |
+
help="""Input length ratio to obtain max output length.
|
194 |
+
If maxlenratio=0.0 (default), it uses an end-detect function
|
195 |
+
to automatically find maximum hypothesis lengths""",
|
196 |
+
)
|
197 |
+
parser.add_argument(
|
198 |
+
"--minlenratio",
|
199 |
+
default=0.0,
|
200 |
+
type=float,
|
201 |
+
help="Input length ratio to obtain min output length",
|
202 |
+
)
|
203 |
+
parser.add_argument(
|
204 |
+
"--rnnlm", type=str, default=None, help="RNNLM model file to read"
|
205 |
+
)
|
206 |
+
parser.add_argument(
|
207 |
+
"--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
|
208 |
+
)
|
209 |
+
parser.add_argument("--lm-weight", default=0.0, type=float, help="RNNLM weight.")
|
210 |
+
parser.add_argument("--sym-space", default="<space>", type=str, help="Space symbol")
|
211 |
+
parser.add_argument("--sym-blank", default="<blank>", type=str, help="Blank symbol")
|
212 |
+
# minibatch related
|
213 |
+
parser.add_argument(
|
214 |
+
"--sortagrad",
|
215 |
+
default=0,
|
216 |
+
type=int,
|
217 |
+
nargs="?",
|
218 |
+
help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
|
219 |
+
)
|
220 |
+
parser.add_argument(
|
221 |
+
"--batch-count",
|
222 |
+
default="auto",
|
223 |
+
choices=BATCH_COUNT_CHOICES,
|
224 |
+
help="How to count batch_size. "
|
225 |
+
"The default (auto) will find how to count by args.",
|
226 |
+
)
|
227 |
+
parser.add_argument(
|
228 |
+
"--batch-size",
|
229 |
+
"--batch-seqs",
|
230 |
+
"-b",
|
231 |
+
default=0,
|
232 |
+
type=int,
|
233 |
+
help="Maximum seqs in a minibatch (0 to disable)",
|
234 |
+
)
|
235 |
+
parser.add_argument(
|
236 |
+
"--batch-bins",
|
237 |
+
default=0,
|
238 |
+
type=int,
|
239 |
+
help="Maximum bins in a minibatch (0 to disable)",
|
240 |
+
)
|
241 |
+
parser.add_argument(
|
242 |
+
"--batch-frames-in",
|
243 |
+
default=0,
|
244 |
+
type=int,
|
245 |
+
help="Maximum input frames in a minibatch (0 to disable)",
|
246 |
+
)
|
247 |
+
parser.add_argument(
|
248 |
+
"--batch-frames-out",
|
249 |
+
default=0,
|
250 |
+
type=int,
|
251 |
+
help="Maximum output frames in a minibatch (0 to disable)",
|
252 |
+
)
|
253 |
+
parser.add_argument(
|
254 |
+
"--batch-frames-inout",
|
255 |
+
default=0,
|
256 |
+
type=int,
|
257 |
+
help="Maximum input+output frames in a minibatch (0 to disable)",
|
258 |
+
)
|
259 |
+
parser.add_argument(
|
260 |
+
"--maxlen-in",
|
261 |
+
"--batch-seq-maxlen-in",
|
262 |
+
default=800,
|
263 |
+
type=int,
|
264 |
+
metavar="ML",
|
265 |
+
help="When --batch-count=seq, batch size is reduced "
|
266 |
+
"if the input sequence length > ML.",
|
267 |
+
)
|
268 |
+
parser.add_argument(
|
269 |
+
"--maxlen-out",
|
270 |
+
"--batch-seq-maxlen-out",
|
271 |
+
default=150,
|
272 |
+
type=int,
|
273 |
+
metavar="ML",
|
274 |
+
help="When --batch-count=seq, "
|
275 |
+
"batch size is reduced if the output sequence length > ML",
|
276 |
+
)
|
277 |
+
parser.add_argument(
|
278 |
+
"--n-iter-processes",
|
279 |
+
default=0,
|
280 |
+
type=int,
|
281 |
+
help="Number of processes of iterator",
|
282 |
+
)
|
283 |
+
parser.add_argument(
|
284 |
+
"--preprocess-conf",
|
285 |
+
type=str,
|
286 |
+
default=None,
|
287 |
+
nargs="?",
|
288 |
+
help="The configuration file for the pre-processing",
|
289 |
+
)
|
290 |
+
# optimization related
|
291 |
+
parser.add_argument(
|
292 |
+
"--opt",
|
293 |
+
default="adadelta",
|
294 |
+
type=str,
|
295 |
+
choices=["adadelta", "adam", "noam"],
|
296 |
+
help="Optimizer",
|
297 |
+
)
|
298 |
+
parser.add_argument(
|
299 |
+
"--accum-grad", default=1, type=int, help="Number of gradient accumuration"
|
300 |
+
)
|
301 |
+
parser.add_argument(
|
302 |
+
"--eps", default=1e-8, type=float, help="Epsilon constant for optimizer"
|
303 |
+
)
|
304 |
+
parser.add_argument(
|
305 |
+
"--eps-decay", default=0.01, type=float, help="Decaying ratio of epsilon"
|
306 |
+
)
|
307 |
+
parser.add_argument(
|
308 |
+
"--lr", default=1e-3, type=float, help="Learning rate for optimizer"
|
309 |
+
)
|
310 |
+
parser.add_argument(
|
311 |
+
"--lr-decay", default=1.0, type=float, help="Decaying ratio of learning rate"
|
312 |
+
)
|
313 |
+
parser.add_argument(
|
314 |
+
"--weight-decay", default=0.0, type=float, help="Weight decay ratio"
|
315 |
+
)
|
316 |
+
parser.add_argument(
|
317 |
+
"--criterion",
|
318 |
+
default="acc",
|
319 |
+
type=str,
|
320 |
+
choices=["loss", "acc"],
|
321 |
+
help="Criterion to perform epsilon decay",
|
322 |
+
)
|
323 |
+
parser.add_argument(
|
324 |
+
"--threshold", default=1e-4, type=float, help="Threshold to stop iteration"
|
325 |
+
)
|
326 |
+
parser.add_argument(
|
327 |
+
"--epochs", "-e", default=30, type=int, help="Maximum number of epochs"
|
328 |
+
)
|
329 |
+
parser.add_argument(
|
330 |
+
"--early-stop-criterion",
|
331 |
+
default="validation/main/acc",
|
332 |
+
type=str,
|
333 |
+
nargs="?",
|
334 |
+
help="Value to monitor to trigger an early stopping of the training",
|
335 |
+
)
|
336 |
+
parser.add_argument(
|
337 |
+
"--patience",
|
338 |
+
default=3,
|
339 |
+
type=int,
|
340 |
+
nargs="?",
|
341 |
+
help="Number of epochs to wait "
|
342 |
+
"without improvement before stopping the training",
|
343 |
+
)
|
344 |
+
parser.add_argument(
|
345 |
+
"--grad-clip", default=5, type=float, help="Gradient norm threshold to clip"
|
346 |
+
)
|
347 |
+
parser.add_argument(
|
348 |
+
"--num-save-attention",
|
349 |
+
default=3,
|
350 |
+
type=int,
|
351 |
+
help="Number of samples of attention to be saved",
|
352 |
+
)
|
353 |
+
parser.add_argument(
|
354 |
+
"--num-save-ctc",
|
355 |
+
default=3,
|
356 |
+
type=int,
|
357 |
+
help="Number of samples of CTC probability to be saved",
|
358 |
+
)
|
359 |
+
parser.add_argument(
|
360 |
+
"--grad-noise",
|
361 |
+
type=strtobool,
|
362 |
+
default=False,
|
363 |
+
help="The flag to switch to use noise injection to gradients during training",
|
364 |
+
)
|
365 |
+
# speech translation related
|
366 |
+
parser.add_argument(
|
367 |
+
"--context-residual",
|
368 |
+
default=False,
|
369 |
+
type=strtobool,
|
370 |
+
nargs="?",
|
371 |
+
help="The flag to switch to use context vector residual in the decoder network",
|
372 |
+
)
|
373 |
+
# finetuning related
|
374 |
+
parser.add_argument(
|
375 |
+
"--enc-init",
|
376 |
+
default=None,
|
377 |
+
type=str,
|
378 |
+
nargs="?",
|
379 |
+
help="Pre-trained ASR model to initialize encoder.",
|
380 |
+
)
|
381 |
+
parser.add_argument(
|
382 |
+
"--enc-init-mods",
|
383 |
+
default="enc.enc.",
|
384 |
+
type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
|
385 |
+
help="List of encoder modules to initialize, separated by a comma.",
|
386 |
+
)
|
387 |
+
parser.add_argument(
|
388 |
+
"--dec-init",
|
389 |
+
default=None,
|
390 |
+
type=str,
|
391 |
+
nargs="?",
|
392 |
+
help="Pre-trained ASR, MT or LM model to initialize decoder.",
|
393 |
+
)
|
394 |
+
parser.add_argument(
|
395 |
+
"--dec-init-mods",
|
396 |
+
default="att., dec.",
|
397 |
+
type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
|
398 |
+
help="List of decoder modules to initialize, separated by a comma.",
|
399 |
+
)
|
400 |
+
# multilingual related
|
401 |
+
parser.add_argument(
|
402 |
+
"--multilingual",
|
403 |
+
default=False,
|
404 |
+
type=strtobool,
|
405 |
+
help="Prepend target language ID to the source sentence. "
|
406 |
+
" Both source/target language IDs must be prepend in the pre-processing stage.",
|
407 |
+
)
|
408 |
+
parser.add_argument(
|
409 |
+
"--replace-sos",
|
410 |
+
default=False,
|
411 |
+
type=strtobool,
|
412 |
+
help="Replace <sos> in the decoder with a target language ID \
|
413 |
+
(the first token in the target sequence)",
|
414 |
+
)
|
415 |
+
# Feature transform: Normalization
|
416 |
+
parser.add_argument(
|
417 |
+
"--stats-file",
|
418 |
+
type=str,
|
419 |
+
default=None,
|
420 |
+
help="The stats file for the feature normalization",
|
421 |
+
)
|
422 |
+
parser.add_argument(
|
423 |
+
"--apply-uttmvn",
|
424 |
+
type=strtobool,
|
425 |
+
default=True,
|
426 |
+
help="Apply utterance level mean " "variance normalization.",
|
427 |
+
)
|
428 |
+
parser.add_argument("--uttmvn-norm-means", type=strtobool, default=True, help="")
|
429 |
+
parser.add_argument("--uttmvn-norm-vars", type=strtobool, default=False, help="")
|
430 |
+
# Feature transform: Fbank
|
431 |
+
parser.add_argument(
|
432 |
+
"--fbank-fs",
|
433 |
+
type=int,
|
434 |
+
default=16000,
|
435 |
+
help="The sample frequency used for " "the mel-fbank creation.",
|
436 |
+
)
|
437 |
+
parser.add_argument(
|
438 |
+
"--n-mels", type=int, default=80, help="The number of mel-frequency bins."
|
439 |
+
)
|
440 |
+
parser.add_argument("--fbank-fmin", type=float, default=0.0, help="")
|
441 |
+
parser.add_argument("--fbank-fmax", type=float, default=None, help="")
|
442 |
+
return parser
|
443 |
+
|
444 |
+
|
445 |
+
def main(cmd_args):
|
446 |
+
"""Run the main training function."""
|
447 |
+
parser = get_parser()
|
448 |
+
args, _ = parser.parse_known_args(cmd_args)
|
449 |
+
if args.backend == "chainer" and args.train_dtype != "float32":
|
450 |
+
raise NotImplementedError(
|
451 |
+
f"chainer backend does not support --train-dtype {args.train_dtype}."
|
452 |
+
"Use --dtype float32."
|
453 |
+
)
|
454 |
+
if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
|
455 |
+
raise ValueError(
|
456 |
+
f"--train-dtype {args.train_dtype} does not support the CPU backend."
|
457 |
+
)
|
458 |
+
|
459 |
+
from espnet.utils.dynamic_import import dynamic_import
|
460 |
+
|
461 |
+
if args.model_module is None:
|
462 |
+
model_module = "espnet.nets." + args.backend + "_backend.e2e_st:E2E"
|
463 |
+
else:
|
464 |
+
model_module = args.model_module
|
465 |
+
model_class = dynamic_import(model_module)
|
466 |
+
model_class.add_arguments(parser)
|
467 |
+
|
468 |
+
args = parser.parse_args(cmd_args)
|
469 |
+
args.model_module = model_module
|
470 |
+
if "chainer_backend" in args.model_module:
|
471 |
+
args.backend = "chainer"
|
472 |
+
if "pytorch_backend" in args.model_module:
|
473 |
+
args.backend = "pytorch"
|
474 |
+
|
475 |
+
# add version info in args
|
476 |
+
args.version = __version__
|
477 |
+
|
478 |
+
# logging info
|
479 |
+
if args.verbose > 0:
|
480 |
+
logging.basicConfig(
|
481 |
+
level=logging.INFO,
|
482 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
483 |
+
)
|
484 |
+
else:
|
485 |
+
logging.basicConfig(
|
486 |
+
level=logging.WARN,
|
487 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
488 |
+
)
|
489 |
+
logging.warning("Skip DEBUG/INFO messages")
|
490 |
+
|
491 |
+
# If --ngpu is not given,
|
492 |
+
# 1. if CUDA_VISIBLE_DEVICES is set, all visible devices
|
493 |
+
# 2. if nvidia-smi exists, use all devices
|
494 |
+
# 3. else ngpu=0
|
495 |
+
if args.ngpu is None:
|
496 |
+
cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
|
497 |
+
if cvd is not None:
|
498 |
+
ngpu = len(cvd.split(","))
|
499 |
+
else:
|
500 |
+
logging.warning("CUDA_VISIBLE_DEVICES is not set.")
|
501 |
+
try:
|
502 |
+
p = subprocess.run(
|
503 |
+
["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
504 |
+
)
|
505 |
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
506 |
+
ngpu = 0
|
507 |
+
else:
|
508 |
+
ngpu = len(p.stderr.decode().split("\n")) - 1
|
509 |
+
args.ngpu = ngpu
|
510 |
+
else:
|
511 |
+
if is_torch_1_2_plus and args.ngpu != 1:
|
512 |
+
logging.debug(
|
513 |
+
"There are some bugs with multi-GPU processing in PyTorch 1.2+"
|
514 |
+
+ " (see https://github.com/pytorch/pytorch/issues/21108)"
|
515 |
+
)
|
516 |
+
ngpu = args.ngpu
|
517 |
+
logging.info(f"ngpu: {ngpu}")
|
518 |
+
|
519 |
+
# display PYTHONPATH
|
520 |
+
logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
|
521 |
+
|
522 |
+
# set random seed
|
523 |
+
logging.info("random seed = %d" % args.seed)
|
524 |
+
random.seed(args.seed)
|
525 |
+
np.random.seed(args.seed)
|
526 |
+
|
527 |
+
# load dictionary for debug log
|
528 |
+
if args.dict is not None:
|
529 |
+
with open(args.dict, "rb") as f:
|
530 |
+
dictionary = f.readlines()
|
531 |
+
char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
|
532 |
+
char_list.insert(0, "<blank>")
|
533 |
+
char_list.append("<eos>")
|
534 |
+
args.char_list = char_list
|
535 |
+
else:
|
536 |
+
args.char_list = None
|
537 |
+
|
538 |
+
# train
|
539 |
+
logging.info("backend = " + args.backend)
|
540 |
+
|
541 |
+
if args.backend == "pytorch":
|
542 |
+
from espnet.st.pytorch_backend.st import train
|
543 |
+
|
544 |
+
train(args)
|
545 |
+
else:
|
546 |
+
raise ValueError("Only pytorch are supported.")
|
547 |
+
|
548 |
+
|
549 |
+
if __name__ == "__main__":
|
550 |
+
main(sys.argv[1:])
|
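The --mtlalpha, --asr-weight, and --mt-weight options above control how the auxiliary ASR and MT objectives are mixed with the main ST objective. A small sketch of that combination, written directly from the formulas given in the help strings (not code taken from the trainer itself):

def combined_st_loss(ctc_loss, att_loss, mt_loss, st_loss,
                     mtlalpha=0.0, asr_weight=0.0, mt_weight=0.0):
    # asr_weight * (alpha*ctc + (1-alpha)*att) + mt_weight*mt + remainder*st
    asr_loss = mtlalpha * ctc_loss + (1.0 - mtlalpha) * att_loss
    return (
        asr_weight * asr_loss
        + mt_weight * mt_loss
        + (1.0 - asr_weight - mt_weight) * st_loss
    )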
espnet/bin/st_trans.py
ADDED
@@ -0,0 +1,183 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
# encoding: utf-8
|
3 |
+
|
4 |
+
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
|
5 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
6 |
+
|
7 |
+
"""End-to-end speech translation model decoding script."""
|
8 |
+
|
9 |
+
import logging
|
10 |
+
import os
|
11 |
+
import random
|
12 |
+
import sys
|
13 |
+
|
14 |
+
import configargparse
|
15 |
+
import numpy as np
|
16 |
+
|
17 |
+
|
18 |
+
# NOTE: you need this func to generate our sphinx doc
|
19 |
+
def get_parser():
|
20 |
+
"""Get default arguments."""
|
21 |
+
parser = configargparse.ArgumentParser(
|
22 |
+
description="Translate text from speech using a speech translation "
|
23 |
+
"model on one CPU or GPU",
|
24 |
+
config_file_parser_class=configargparse.YAMLConfigFileParser,
|
25 |
+
formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
|
26 |
+
)
|
27 |
+
# general configuration
|
28 |
+
parser.add("--config", is_config_file=True, help="Config file path")
|
29 |
+
parser.add(
|
30 |
+
"--config2",
|
31 |
+
is_config_file=True,
|
32 |
+
help="Second config file path that overwrites the settings in `--config`",
|
33 |
+
)
|
34 |
+
parser.add(
|
35 |
+
"--config3",
|
36 |
+
is_config_file=True,
|
37 |
+
help="Third config file path that overwrites "
|
38 |
+
"the settings in `--config` and `--config2`",
|
39 |
+
)
|
40 |
+
|
41 |
+
parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs")
|
42 |
+
parser.add_argument(
|
43 |
+
"--dtype",
|
44 |
+
choices=("float16", "float32", "float64"),
|
45 |
+
default="float32",
|
46 |
+
help="Float precision (only available in --api v2)",
|
47 |
+
)
|
48 |
+
parser.add_argument(
|
49 |
+
"--backend",
|
50 |
+
type=str,
|
51 |
+
default="chainer",
|
52 |
+
choices=["chainer", "pytorch"],
|
53 |
+
help="Backend library",
|
54 |
+
)
|
55 |
+
parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
|
56 |
+
parser.add_argument("--seed", type=int, default=1, help="Random seed")
|
57 |
+
parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option")
|
58 |
+
parser.add_argument(
|
59 |
+
"--batchsize",
|
60 |
+
type=int,
|
61 |
+
default=1,
|
62 |
+
help="Batch size for beam search (0: means no batch processing)",
|
63 |
+
)
|
64 |
+
parser.add_argument(
|
65 |
+
"--preprocess-conf",
|
66 |
+
type=str,
|
67 |
+
default=None,
|
68 |
+
help="The configuration file for the pre-processing",
|
69 |
+
)
|
70 |
+
parser.add_argument(
|
71 |
+
"--api",
|
72 |
+
default="v1",
|
73 |
+
choices=["v1", "v2"],
|
74 |
+
help="Beam search APIs "
|
75 |
+
"v1: Default API. "
|
76 |
+
"It only supports the ASRInterface.recognize method and DefaultRNNLM. "
|
77 |
+
"v2: Experimental API. "
|
78 |
+
"It supports any models that implements ScorerInterface.",
|
79 |
+
)
|
80 |
+
# task related
|
81 |
+
parser.add_argument(
|
82 |
+
"--trans-json", type=str, help="Filename of translation data (json)"
|
83 |
+
)
|
84 |
+
parser.add_argument(
|
85 |
+
"--result-label",
|
86 |
+
type=str,
|
87 |
+
required=True,
|
88 |
+
help="Filename of result label data (json)",
|
89 |
+
)
|
90 |
+
# model (parameter) related
|
91 |
+
parser.add_argument(
|
92 |
+
"--model", type=str, required=True, help="Model file parameters to read"
|
93 |
+
)
|
94 |
+
# search related
|
95 |
+
parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
|
96 |
+
parser.add_argument("--beam-size", type=int, default=1, help="Beam size")
|
97 |
+
parser.add_argument("--penalty", type=float, default=0.0, help="Incertion penalty")
|
98 |
+
parser.add_argument(
|
99 |
+
"--maxlenratio",
|
100 |
+
type=float,
|
101 |
+
default=0.0,
|
102 |
+
help="""Input length ratio to obtain max output length.
|
103 |
+
If maxlenratio=0.0 (default), it uses a end-detect function
|
104 |
+
to automatically find maximum hypothesis lengths""",
|
105 |
+
)
|
106 |
+
parser.add_argument(
|
107 |
+
"--minlenratio",
|
108 |
+
type=float,
|
109 |
+
default=0.0,
|
110 |
+
help="Input length ratio to obtain min output length",
|
111 |
+
)
|
112 |
+
# multilingual related
|
113 |
+
parser.add_argument(
|
114 |
+
"--tgt-lang",
|
115 |
+
default=False,
|
116 |
+
type=str,
|
117 |
+
help="target language ID (e.g., <en>, <de>, and <fr> etc.)",
|
118 |
+
)
|
119 |
+
return parser
|
120 |
+
|
121 |
+
|
122 |
+
def main(args):
|
123 |
+
"""Run the main decoding function."""
|
124 |
+
parser = get_parser()
|
125 |
+
args = parser.parse_args(args)
|
126 |
+
|
127 |
+
# logging info
|
128 |
+
if args.verbose == 1:
|
129 |
+
logging.basicConfig(
|
130 |
+
level=logging.INFO,
|
131 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
132 |
+
)
|
133 |
+
elif args.verbose == 2:
|
134 |
+
logging.basicConfig(
|
135 |
+
level=logging.DEBUG,
|
136 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
137 |
+
)
|
138 |
+
else:
|
139 |
+
logging.basicConfig(
|
140 |
+
level=logging.WARN,
|
141 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
142 |
+
)
|
143 |
+
logging.warning("Skip DEBUG/INFO messages")
|
144 |
+
|
145 |
+
# check CUDA_VISIBLE_DEVICES
|
146 |
+
if args.ngpu > 0:
|
147 |
+
cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
|
148 |
+
if cvd is None:
|
149 |
+
logging.warning("CUDA_VISIBLE_DEVICES is not set.")
|
150 |
+
elif args.ngpu != len(cvd.split(",")):
|
151 |
+
logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
|
152 |
+
sys.exit(1)
|
153 |
+
|
154 |
+
# TODO(mn5k): support of multiple GPUs
|
155 |
+
if args.ngpu > 1:
|
156 |
+
logging.error("The program only supports ngpu=1.")
|
157 |
+
sys.exit(1)
|
158 |
+
|
159 |
+
# display PYTHONPATH
|
160 |
+
logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
|
161 |
+
|
162 |
+
# seed setting
|
163 |
+
random.seed(args.seed)
|
164 |
+
np.random.seed(args.seed)
|
165 |
+
logging.info("set random seed = %d" % args.seed)
|
166 |
+
|
167 |
+
# trans
|
168 |
+
logging.info("backend = " + args.backend)
|
169 |
+
if args.backend == "pytorch":
|
170 |
+
# Experimental API that supports custom LMs
|
171 |
+
from espnet.st.pytorch_backend.st import trans
|
172 |
+
|
173 |
+
if args.dtype != "float32":
|
174 |
+
raise NotImplementedError(
|
175 |
+
f"`--dtype {args.dtype}` is only available with `--api v2`"
|
176 |
+
)
|
177 |
+
trans(args)
|
178 |
+
else:
|
179 |
+
raise ValueError("Only pytorch are supported.")
|
180 |
+
|
181 |
+
|
182 |
+
if __name__ == "__main__":
|
183 |
+
main(sys.argv[1:])
|
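Because the parser above is built with configargparse and a YAML config-file parser, the decoding flags can also be supplied through --config; keys are assumed to be the long option names without the leading dashes. A sketch with illustrative values only:

# decode.yaml (illustrative contents)
#   backend: pytorch
#   beam-size: 10
#   maxlenratio: 0.0
#   minlenratio: 0.0
from espnet.bin.st_trans import main

main([
    "--config", "decode.yaml",                    # the YAML sketched above
    "--trans-json", "dump/test/data.json",        # placeholder path
    "--result-label", "exp/decode/result.json",   # placeholder path
    "--model", "exp/st_model/model.acc.best",     # placeholder path
])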
espnet/bin/tts_decode.py
ADDED
@@ -0,0 +1,180 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
# Copyright 2018 Nagoya University (Tomoki Hayashi)
|
4 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
5 |
+
|
6 |
+
"""TTS decoding script."""
|
7 |
+
|
8 |
+
import configargparse
|
9 |
+
import logging
|
10 |
+
import os
|
11 |
+
import platform
|
12 |
+
import subprocess
|
13 |
+
import sys
|
14 |
+
|
15 |
+
from espnet.utils.cli_utils import strtobool
|
16 |
+
|
17 |
+
|
18 |
+
# NOTE: you need this func to generate our sphinx doc
|
19 |
+
def get_parser():
|
20 |
+
"""Get parser of decoding arguments."""
|
21 |
+
parser = configargparse.ArgumentParser(
|
22 |
+
description="Synthesize speech from text using a TTS model on one CPU",
|
23 |
+
config_file_parser_class=configargparse.YAMLConfigFileParser,
|
24 |
+
formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
|
25 |
+
)
|
26 |
+
# general configuration
|
27 |
+
parser.add("--config", is_config_file=True, help="config file path")
|
28 |
+
parser.add(
|
29 |
+
"--config2",
|
30 |
+
is_config_file=True,
|
31 |
+
help="second config file path that overwrites the settings in `--config`.",
|
32 |
+
)
|
33 |
+
parser.add(
|
34 |
+
"--config3",
|
35 |
+
is_config_file=True,
|
36 |
+
help="third config file path that overwrites "
|
37 |
+
"the settings in `--config` and `--config2`.",
|
38 |
+
)
|
39 |
+
|
40 |
+
parser.add_argument("--ngpu", default=0, type=int, help="Number of GPUs")
|
41 |
+
parser.add_argument(
|
42 |
+
"--backend",
|
43 |
+
default="pytorch",
|
44 |
+
type=str,
|
45 |
+
choices=["chainer", "pytorch"],
|
46 |
+
help="Backend library",
|
47 |
+
)
|
48 |
+
parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
|
49 |
+
parser.add_argument("--seed", default=1, type=int, help="Random seed")
|
50 |
+
parser.add_argument("--out", type=str, required=True, help="Output filename")
|
51 |
+
parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
|
52 |
+
parser.add_argument(
|
53 |
+
"--preprocess-conf",
|
54 |
+
type=str,
|
55 |
+
default=None,
|
56 |
+
help="The configuration file for the pre-processing",
|
57 |
+
)
|
58 |
+
# task related
|
59 |
+
parser.add_argument(
|
60 |
+
"--json", type=str, required=True, help="Filename of train label data (json)"
|
61 |
+
)
|
62 |
+
parser.add_argument(
|
63 |
+
"--model", type=str, required=True, help="Model file parameters to read"
|
64 |
+
)
|
65 |
+
parser.add_argument(
|
66 |
+
"--model-conf", type=str, default=None, help="Model config file"
|
67 |
+
)
|
68 |
+
# decoding related
|
69 |
+
parser.add_argument(
|
70 |
+
"--maxlenratio", type=float, default=5, help="Maximum length ratio in decoding"
|
71 |
+
)
|
72 |
+
parser.add_argument(
|
73 |
+
"--minlenratio", type=float, default=0, help="Minimum length ratio in decoding"
|
74 |
+
)
|
75 |
+
parser.add_argument(
|
76 |
+
"--threshold", type=float, default=0.5, help="Threshold value in decoding"
|
77 |
+
)
|
78 |
+
parser.add_argument(
|
79 |
+
"--use-att-constraint",
|
80 |
+
type=strtobool,
|
81 |
+
default=False,
|
82 |
+
help="Whether to use the attention constraint",
|
83 |
+
)
|
84 |
+
parser.add_argument(
|
85 |
+
"--backward-window",
|
86 |
+
type=int,
|
87 |
+
default=1,
|
88 |
+
help="Backward window size in the attention constraint",
|
89 |
+
)
|
90 |
+
parser.add_argument(
|
91 |
+
"--forward-window",
|
92 |
+
type=int,
|
93 |
+
default=3,
|
94 |
+
help="Forward window size in the attention constraint",
|
95 |
+
)
|
96 |
+
parser.add_argument(
|
97 |
+
"--fastspeech-alpha",
|
98 |
+
type=float,
|
99 |
+
default=1.0,
|
100 |
+
help="Alpha to change the speed for FastSpeech",
|
101 |
+
)
|
102 |
+
# save related
|
103 |
+
parser.add_argument(
|
104 |
+
"--save-durations",
|
105 |
+
default=False,
|
106 |
+
type=strtobool,
|
107 |
+
help="Whether to save durations converted from attentions",
|
108 |
+
)
|
109 |
+
parser.add_argument(
|
110 |
+
"--save-focus-rates",
|
111 |
+
default=False,
|
112 |
+
type=strtobool,
|
113 |
+
help="Whether to save focus rates of attentions",
|
114 |
+
)
|
115 |
+
return parser
|
116 |
+
|
117 |
+
|
118 |
+
def main(args):
|
119 |
+
"""Run deocding."""
|
120 |
+
parser = get_parser()
|
121 |
+
args = parser.parse_args(args)
|
122 |
+
|
123 |
+
# logging info
|
124 |
+
if args.verbose > 0:
|
125 |
+
logging.basicConfig(
|
126 |
+
level=logging.INFO,
|
127 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
128 |
+
)
|
129 |
+
else:
|
130 |
+
logging.basicConfig(
|
131 |
+
level=logging.WARN,
|
132 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
133 |
+
)
|
134 |
+
logging.warning("Skip DEBUG/INFO messages")
|
135 |
+
|
136 |
+
# check CUDA_VISIBLE_DEVICES
|
137 |
+
if args.ngpu > 0:
|
138 |
+
# python 2 case
|
139 |
+
if platform.python_version_tuple()[0] == "2":
|
140 |
+
if "clsp.jhu.edu" in subprocess.check_output(["hostname", "-f"]):
|
141 |
+
cvd = subprocess.check_output(
|
142 |
+
["/usr/local/bin/free-gpu", "-n", str(args.ngpu)]
|
143 |
+
).strip()
|
144 |
+
logging.info("CLSP: use gpu" + cvd)
|
145 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = cvd
|
146 |
+
# python 3 case
|
147 |
+
else:
|
148 |
+
if "clsp.jhu.edu" in subprocess.check_output(["hostname", "-f"]).decode():
|
149 |
+
cvd = (
|
150 |
+
subprocess.check_output(
|
151 |
+
["/usr/local/bin/free-gpu", "-n", str(args.ngpu)]
|
152 |
+
)
|
153 |
+
.decode()
|
154 |
+
.strip()
|
155 |
+
)
|
156 |
+
logging.info("CLSP: use gpu" + cvd)
|
157 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = cvd
|
158 |
+
|
159 |
+
cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
|
160 |
+
if cvd is None:
|
161 |
+
logging.warning("CUDA_VISIBLE_DEVICES is not set.")
|
162 |
+
elif args.ngpu != len(cvd.split(",")):
|
163 |
+
logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
|
164 |
+
sys.exit(1)
|
165 |
+
|
166 |
+
# display PYTHONPATH
|
167 |
+
logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
|
168 |
+
|
169 |
+
# extract
|
170 |
+
logging.info("backend = " + args.backend)
|
171 |
+
if args.backend == "pytorch":
|
172 |
+
from espnet.tts.pytorch_backend.tts import decode
|
173 |
+
|
174 |
+
decode(args)
|
175 |
+
else:
|
176 |
+
raise NotImplementedError("Only pytorch is supported.")
|
177 |
+
|
178 |
+
|
179 |
+
if __name__ == "__main__":
|
180 |
+
main(sys.argv[1:])
|
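Similarly, a hypothetical call into the TTS decoding entry point above; paths are placeholders and a trained model compatible with the pytorch backend is assumed:

from espnet.bin.tts_decode import main

main([
    "--backend", "pytorch",
    "--json", "dump/test/data.json",              # placeholder path
    "--model", "exp/tts_model/model.loss.best",   # placeholder path
    "--out", "exp/decode/feats",                  # placeholder output name
    "--threshold", "0.5",
    "--maxlenratio", "5",
])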
espnet/bin/tts_train.py
ADDED
@@ -0,0 +1,359 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
# Copyright 2018 Nagoya University (Tomoki Hayashi)
|
4 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
5 |
+
|
6 |
+
"""Text-to-speech model training script."""
|
7 |
+
|
8 |
+
import logging
|
9 |
+
import os
|
10 |
+
import random
|
11 |
+
import subprocess
|
12 |
+
import sys
|
13 |
+
|
14 |
+
import configargparse
|
15 |
+
import numpy as np
|
16 |
+
|
17 |
+
from espnet import __version__
|
18 |
+
from espnet.nets.tts_interface import TTSInterface
|
19 |
+
from espnet.utils.cli_utils import strtobool
|
20 |
+
from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
|
21 |
+
|
22 |
+
|
23 |
+
# NOTE: you need this func to generate our sphinx doc
|
24 |
+
def get_parser():
|
25 |
+
"""Get parser of training arguments."""
|
26 |
+
parser = configargparse.ArgumentParser(
|
27 |
+
description="Train a new text-to-speech (TTS) model on one CPU, "
|
28 |
+
"one or multiple GPUs",
|
29 |
+
config_file_parser_class=configargparse.YAMLConfigFileParser,
|
30 |
+
formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
|
31 |
+
)
|
32 |
+
|
33 |
+
# general configuration
|
34 |
+
parser.add("--config", is_config_file=True, help="config file path")
|
35 |
+
parser.add(
|
36 |
+
"--config2",
|
37 |
+
is_config_file=True,
|
38 |
+
help="second config file path that overwrites the settings in `--config`.",
|
39 |
+
)
|
40 |
+
parser.add(
|
41 |
+
"--config3",
|
42 |
+
is_config_file=True,
|
43 |
+
help="third config file path that overwrites "
|
44 |
+
"the settings in `--config` and `--config2`.",
|
45 |
+
)
|
46 |
+
|
47 |
+
parser.add_argument(
|
48 |
+
"--ngpu",
|
49 |
+
default=None,
|
50 |
+
type=int,
|
51 |
+
help="Number of GPUs. If not given, use all visible devices",
|
52 |
+
)
|
53 |
+
parser.add_argument(
|
54 |
+
"--backend",
|
55 |
+
default="pytorch",
|
56 |
+
type=str,
|
57 |
+
choices=["chainer", "pytorch"],
|
58 |
+
help="Backend library",
|
59 |
+
)
|
60 |
+
parser.add_argument("--outdir", type=str, required=True, help="Output directory")
|
61 |
+
parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
|
62 |
+
parser.add_argument("--seed", default=1, type=int, help="Random seed")
|
63 |
+
parser.add_argument(
|
64 |
+
"--resume",
|
65 |
+
"-r",
|
66 |
+
default="",
|
67 |
+
type=str,
|
68 |
+
nargs="?",
|
69 |
+
help="Resume the training from snapshot",
|
70 |
+
)
|
71 |
+
parser.add_argument(
|
72 |
+
"--minibatches",
|
73 |
+
"-N",
|
74 |
+
type=int,
|
75 |
+
default="-1",
|
76 |
+
help="Process only N minibatches (for debug)",
|
77 |
+
)
|
78 |
+
parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
|
79 |
+
parser.add_argument(
|
80 |
+
"--tensorboard-dir",
|
81 |
+
default=None,
|
82 |
+
type=str,
|
83 |
+
nargs="?",
|
84 |
+
help="Tensorboard log directory path",
|
85 |
+
)
|
86 |
+
parser.add_argument(
|
87 |
+
"--eval-interval-epochs", default=1, type=int, help="Evaluation interval epochs"
|
88 |
+
)
|
89 |
+
parser.add_argument(
|
90 |
+
"--save-interval-epochs", default=1, type=int, help="Save interval epochs"
|
91 |
+
)
|
92 |
+
parser.add_argument(
|
93 |
+
"--report-interval-iters",
|
94 |
+
default=100,
|
95 |
+
type=int,
|
96 |
+
help="Report interval iterations",
|
97 |
+
)
|
98 |
+
# task related
|
99 |
+
parser.add_argument(
|
100 |
+
"--train-json", type=str, required=True, help="Filename of training json"
|
101 |
+
)
|
102 |
+
parser.add_argument(
|
103 |
+
"--valid-json", type=str, required=True, help="Filename of validation json"
|
104 |
+
)
|
105 |
+
# network architecture
|
106 |
+
parser.add_argument(
|
107 |
+
"--model-module",
|
108 |
+
type=str,
|
109 |
+
default="espnet.nets.pytorch_backend.e2e_tts_tacotron2:Tacotron2",
|
110 |
+
help="model defined module",
|
111 |
+
)
|
112 |
+
# minibatch related
|
113 |
+
parser.add_argument(
|
114 |
+
"--sortagrad",
|
115 |
+
default=0,
|
116 |
+
type=int,
|
117 |
+
nargs="?",
|
118 |
+
help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
|
119 |
+
)
|
120 |
+
parser.add_argument(
|
121 |
+
"--batch-sort-key",
|
122 |
+
default="shuffle",
|
123 |
+
type=str,
|
124 |
+
choices=["shuffle", "output", "input"],
|
125 |
+
nargs="?",
|
126 |
+
help='Batch sorting key. "shuffle" only work with --batch-count "seq".',
|
127 |
+
)
|
128 |
+
parser.add_argument(
|
129 |
+
"--batch-count",
|
130 |
+
default="auto",
|
131 |
+
choices=BATCH_COUNT_CHOICES,
|
132 |
+
help="How to count batch_size. "
|
133 |
+
"The default (auto) will find how to count by args.",
|
134 |
+
)
|
135 |
+
parser.add_argument(
|
136 |
+
"--batch-size",
|
137 |
+
"--batch-seqs",
|
138 |
+
"-b",
|
139 |
+
default=0,
|
140 |
+
type=int,
|
141 |
+
help="Maximum seqs in a minibatch (0 to disable)",
|
142 |
+
)
|
143 |
+
parser.add_argument(
|
144 |
+
"--batch-bins",
|
145 |
+
default=0,
|
146 |
+
type=int,
|
147 |
+
help="Maximum bins in a minibatch (0 to disable)",
|
148 |
+
)
|
149 |
+
parser.add_argument(
|
150 |
+
"--batch-frames-in",
|
151 |
+
default=0,
|
152 |
+
type=int,
|
153 |
+
help="Maximum input frames in a minibatch (0 to disable)",
|
154 |
+
)
|
155 |
+
parser.add_argument(
|
156 |
+
"--batch-frames-out",
|
157 |
+
default=0,
|
158 |
+
type=int,
|
159 |
+
help="Maximum output frames in a minibatch (0 to disable)",
|
160 |
+
)
|
161 |
+
parser.add_argument(
|
162 |
+
"--batch-frames-inout",
|
163 |
+
default=0,
|
164 |
+
type=int,
|
165 |
+
help="Maximum input+output frames in a minibatch (0 to disable)",
|
166 |
+
)
|
167 |
+
parser.add_argument(
|
168 |
+
"--maxlen-in",
|
169 |
+
"--batch-seq-maxlen-in",
|
170 |
+
default=100,
|
171 |
+
type=int,
|
172 |
+
metavar="ML",
|
173 |
+
help="When --batch-count=seq, "
|
174 |
+
"batch size is reduced if the input sequence length > ML.",
|
175 |
+
)
|
176 |
+
parser.add_argument(
|
177 |
+
"--maxlen-out",
|
178 |
+
"--batch-seq-maxlen-out",
|
179 |
+
default=200,
|
180 |
+
type=int,
|
181 |
+
metavar="ML",
|
182 |
+
help="When --batch-count=seq, "
|
183 |
+
"batch size is reduced if the output sequence length > ML",
|
184 |
+
)
|
185 |
+
parser.add_argument(
|
186 |
+
"--num-iter-processes",
|
187 |
+
default=0,
|
188 |
+
type=int,
|
189 |
+
help="Number of processes of iterator",
|
190 |
+
)
|
191 |
+
parser.add_argument(
|
192 |
+
"--preprocess-conf",
|
193 |
+
type=str,
|
194 |
+
default=None,
|
195 |
+
help="The configuration file for the pre-processing",
|
196 |
+
)
|
197 |
+
parser.add_argument(
|
198 |
+
"--use-speaker-embedding",
|
199 |
+
default=False,
|
200 |
+
type=strtobool,
|
201 |
+
help="Whether to use speaker embedding",
|
202 |
+
)
|
203 |
+
parser.add_argument(
|
204 |
+
"--use-second-target",
|
205 |
+
default=False,
|
206 |
+
type=strtobool,
|
207 |
+
help="Whether to use second target",
|
208 |
+
)
|
209 |
+
# optimization related
|
210 |
+
parser.add_argument(
|
211 |
+
"--opt", default="adam", type=str, choices=["adam", "noam"], help="Optimizer"
|
212 |
+
)
|
213 |
+
parser.add_argument(
|
214 |
+
"--accum-grad", default=1, type=int, help="Number of gradient accumuration"
|
215 |
+
)
|
216 |
+
parser.add_argument(
|
217 |
+
"--lr", default=1e-3, type=float, help="Learning rate for optimizer"
|
218 |
+
)
|
219 |
+
parser.add_argument("--eps", default=1e-6, type=float, help="Epsilon for optimizer")
|
220 |
+
parser.add_argument(
|
221 |
+
"--weight-decay",
|
222 |
+
default=1e-6,
|
223 |
+
type=float,
|
224 |
+
help="Weight decay coefficient for optimizer",
|
225 |
+
)
|
226 |
+
parser.add_argument(
|
227 |
+
"--epochs", "-e", default=30, type=int, help="Number of maximum epochs"
|
228 |
+
)
|
229 |
+
parser.add_argument(
|
230 |
+
"--early-stop-criterion",
|
231 |
+
default="validation/main/loss",
|
232 |
+
type=str,
|
233 |
+
nargs="?",
|
234 |
+
help="Value to monitor to trigger an early stopping of the training",
|
235 |
+
)
|
236 |
+
parser.add_argument(
|
237 |
+
"--patience",
|
238 |
+
default=3,
|
239 |
+
type=int,
|
240 |
+
nargs="?",
|
241 |
+
help="Number of epochs to wait "
|
242 |
+
"without improvement before stopping the training",
|
243 |
+
)
|
244 |
+
parser.add_argument(
|
245 |
+
"--grad-clip", default=1, type=float, help="Gradient norm threshold to clip"
|
246 |
+
)
|
247 |
+
parser.add_argument(
|
248 |
+
"--num-save-attention",
|
249 |
+
default=5,
|
250 |
+
type=int,
|
251 |
+
help="Number of samples of attention to be saved",
|
252 |
+
)
|
253 |
+
parser.add_argument(
|
254 |
+
"--keep-all-data-on-mem",
|
255 |
+
default=False,
|
256 |
+
type=strtobool,
|
257 |
+
help="Whether to keep all data on memory",
|
258 |
+
)
|
259 |
+
# finetuning related
|
260 |
+
parser.add_argument(
|
261 |
+
"--enc-init",
|
262 |
+
default=None,
|
263 |
+
type=str,
|
264 |
+
help="Pre-trained TTS model path to initialize encoder.",
|
265 |
+
)
|
266 |
+
parser.add_argument(
|
267 |
+
"--enc-init-mods",
|
268 |
+
default="enc.",
|
269 |
+
type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
|
270 |
+
help="List of encoder modules to initialize, separated by a comma.",
|
271 |
+
)
|
272 |
+
parser.add_argument(
|
273 |
+
"--dec-init",
|
274 |
+
default=None,
|
275 |
+
type=str,
|
276 |
+
help="Pre-trained TTS model path to initialize decoder.",
|
277 |
+
)
|
278 |
+
parser.add_argument(
|
279 |
+
"--dec-init-mods",
|
280 |
+
default="dec.",
|
281 |
+
type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
|
282 |
+
help="List of decoder modules to initialize, separated by a comma.",
|
283 |
+
)
|
284 |
+
parser.add_argument(
|
285 |
+
"--freeze-mods",
|
286 |
+
default=None,
|
287 |
+
type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
|
288 |
+
help="List of modules to freeze (not to train), separated by a comma.",
|
289 |
+
)
|
290 |
+
|
291 |
+
return parser
|
292 |
+
|
293 |
+
|
294 |
+
def main(cmd_args):
|
295 |
+
"""Run training."""
|
296 |
+
parser = get_parser()
|
297 |
+
args, _ = parser.parse_known_args(cmd_args)
|
298 |
+
|
299 |
+
from espnet.utils.dynamic_import import dynamic_import
|
300 |
+
|
301 |
+
model_class = dynamic_import(args.model_module)
|
302 |
+
assert issubclass(model_class, TTSInterface)
|
303 |
+
model_class.add_arguments(parser)
|
304 |
+
args = parser.parse_args(cmd_args)
|
305 |
+
|
306 |
+
# add version info in args
|
307 |
+
args.version = __version__
|
308 |
+
|
309 |
+
# logging info
|
310 |
+
if args.verbose > 0:
|
311 |
+
logging.basicConfig(
|
312 |
+
level=logging.INFO,
|
313 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
314 |
+
)
|
315 |
+
else:
|
316 |
+
logging.basicConfig(
|
317 |
+
level=logging.WARN,
|
318 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
319 |
+
)
|
320 |
+
logging.warning("Skip DEBUG/INFO messages")
|
321 |
+
|
322 |
+
# If --ngpu is not given,
|
323 |
+
# 1. if CUDA_VISIBLE_DEVICES is set, all visible devices
|
324 |
+
# 2. if nvidia-smi exists, use all devices
|
325 |
+
# 3. else ngpu=0
|
326 |
+
if args.ngpu is None:
|
327 |
+
cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
|
328 |
+
if cvd is not None:
|
329 |
+
ngpu = len(cvd.split(","))
|
330 |
+
else:
|
331 |
+
logging.warning("CUDA_VISIBLE_DEVICES is not set.")
|
332 |
+
try:
|
333 |
+
p = subprocess.run(
|
334 |
+
["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
335 |
+
)
|
336 |
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
337 |
+
ngpu = 0
|
338 |
+
else:
|
339 |
+
ngpu = len(p.stderr.decode().split("\n")) - 1
|
340 |
+
args.ngpu = ngpu
|
341 |
+
else:
|
342 |
+
ngpu = args.ngpu
|
343 |
+
logging.info(f"ngpu: {ngpu}")
|
344 |
+
|
345 |
+
# set random seed
|
346 |
+
logging.info("random seed = %d" % args.seed)
|
347 |
+
random.seed(args.seed)
|
348 |
+
np.random.seed(args.seed)
|
349 |
+
|
350 |
+
if args.backend == "pytorch":
|
351 |
+
from espnet.tts.pytorch_backend.tts import train
|
352 |
+
|
353 |
+
train(args)
|
354 |
+
else:
|
355 |
+
raise NotImplementedError("Only pytorch is supported.")
|
356 |
+
|
357 |
+
|
358 |
+
if __name__ == "__main__":
|
359 |
+
main(sys.argv[1:])
|
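The training entry point above is normally launched by the recipe shell scripts, but get_parser()/main() can also be driven directly from Python. A minimal sketch, assuming an installed espnet package; the output directory and json paths are placeholders, not files included in this commit:

    from espnet.bin.tts_train import main

    # Equivalent to invoking tts_train.py on the command line; training starts
    # immediately, so the placeholder jsons must point at real prepared data.
    main([
        "--outdir", "exp/tts_demo",          # placeholder output directory
        "--train-json", "dump/train.json",   # placeholder training json
        "--valid-json", "dump/valid.json",   # placeholder validation json
        "--ngpu", "0",
    ])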
espnet/bin/vc_decode.py
ADDED
@@ -0,0 +1,174 @@
#!/usr/bin/env python3

# Copyright 2020 Nagoya University (Wen-Chin Huang)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

"""VC decoding script."""

import configargparse
import logging
import os
import platform
import subprocess
import sys

from espnet.utils.cli_utils import strtobool


# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Get parser of decoding arguments."""
    parser = configargparse.ArgumentParser(
        description="Converting speech using a VC model on one CPU",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    # general configuration
    parser.add("--config", is_config_file=True, help="config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="second config file path that overwrites the settings in `--config`.",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="third config file path that overwrites the settings "
        "in `--config` and `--config2`.",
    )

    parser.add_argument("--ngpu", default=0, type=int, help="Number of GPUs")
    parser.add_argument(
        "--backend",
        default="pytorch",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
    parser.add_argument("--seed", default=1, type=int, help="Random seed")
    parser.add_argument("--out", type=str, required=True, help="Output filename")
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    # task related
    parser.add_argument(
        "--json", type=str, required=True, help="Filename of train label data (json)"
    )
    parser.add_argument(
        "--model", type=str, required=True, help="Model file parameters to read"
    )
    parser.add_argument(
        "--model-conf", type=str, default=None, help="Model config file"
    )
    # decoding related
    parser.add_argument(
        "--maxlenratio", type=float, default=5, help="Maximum length ratio in decoding"
    )
    parser.add_argument(
        "--minlenratio", type=float, default=0, help="Minimum length ratio in decoding"
    )
    parser.add_argument(
        "--threshold", type=float, default=0.5, help="Threshold value in decoding"
    )
    parser.add_argument(
        "--use-att-constraint",
        type=strtobool,
        default=False,
        help="Whether to use the attention constraint",
    )
    parser.add_argument(
        "--backward-window",
        type=int,
        default=1,
        help="Backward window size in the attention constraint",
    )
    parser.add_argument(
        "--forward-window",
        type=int,
        default=3,
        help="Forward window size in the attention constraint",
    )
    # save related
    parser.add_argument(
        "--save-durations",
        default=False,
        type=strtobool,
        help="Whether to save durations converted from attentions",
    )
    parser.add_argument(
        "--save-focus-rates",
        default=False,
        type=strtobool,
        help="Whether to save focus rates of attentions",
    )
    return parser


def main(args):
    """Run deocding."""
    parser = get_parser()
    args = parser.parse_args(args)

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        # python 2 case
        if platform.python_version_tuple()[0] == "2":
            if "clsp.jhu.edu" in subprocess.check_output(["hostname", "-f"]):
                cvd = subprocess.check_output(
                    ["/usr/local/bin/free-gpu", "-n", str(args.ngpu)]
                ).strip()
                logging.info("CLSP: use gpu" + cvd)
                os.environ["CUDA_VISIBLE_DEVICES"] = cvd
        # python 3 case
        else:
            if "clsp.jhu.edu" in subprocess.check_output(["hostname", "-f"]).decode():
                cvd = (
                    subprocess.check_output(
                        ["/usr/local/bin/free-gpu", "-n", str(args.ngpu)]
                    )
                    .decode()
                    .strip()
                )
                logging.info("CLSP: use gpu" + cvd)
                os.environ["CUDA_VISIBLE_DEVICES"] = cvd

        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif args.ngpu != len(cvd.split(",")):
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # extract
    logging.info("backend = " + args.backend)
    if args.backend == "pytorch":
        from espnet.vc.pytorch_backend.vc import decode

        decode(args)
    else:
        raise NotImplementedError("Only pytorch is supported.")


if __name__ == "__main__":
    main(sys.argv[1:])
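As with the training scripts, get_parser() here makes it possible to inspect the decoding defaults without launching a job. A small sketch, assuming espnet is importable; the required --json, --model, and --out values below are placeholders supplied only to build the namespace:

    from espnet.bin.vc_decode import get_parser

    # Parse a hypothetical command line and read back the decoding defaults.
    args = get_parser().parse_args(
        ["--json", "dump/test.json", "--model", "exp/vc/model.best", "--out", "exp/vc/feats"]
    )
    print(args.maxlenratio, args.threshold)  # 5.0 0.5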
espnet/bin/vc_train.py
ADDED
@@ -0,0 +1,368 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
# Copyright 2020 Nagoya University (Wen-Chin Huang)
|
4 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
5 |
+
|
6 |
+
"""Voice conversion model training script."""
|
7 |
+
|
8 |
+
import logging
|
9 |
+
import os
|
10 |
+
import random
|
11 |
+
import subprocess
|
12 |
+
import sys
|
13 |
+
|
14 |
+
import configargparse
|
15 |
+
import numpy as np
|
16 |
+
|
17 |
+
from espnet import __version__
|
18 |
+
from espnet.nets.tts_interface import TTSInterface
|
19 |
+
from espnet.utils.cli_utils import strtobool
|
20 |
+
from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
|
21 |
+
|
22 |
+
|
23 |
+
# NOTE: you need this func to generate our sphinx doc
|
24 |
+
def get_parser():
|
25 |
+
"""Get parser of training arguments."""
|
26 |
+
parser = configargparse.ArgumentParser(
|
27 |
+
description="Train a new voice conversion (VC) model on one CPU, "
|
28 |
+
"one or multiple GPUs",
|
29 |
+
config_file_parser_class=configargparse.YAMLConfigFileParser,
|
30 |
+
formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
|
31 |
+
)
|
32 |
+
|
33 |
+
# general configuration
|
34 |
+
parser.add("--config", is_config_file=True, help="config file path")
|
35 |
+
parser.add(
|
36 |
+
"--config2",
|
37 |
+
is_config_file=True,
|
38 |
+
help="second config file path that overwrites the settings in `--config`.",
|
39 |
+
)
|
40 |
+
parser.add(
|
41 |
+
"--config3",
|
42 |
+
is_config_file=True,
|
43 |
+
help="third config file path that overwrites the settings "
|
44 |
+
"in `--config` and `--config2`.",
|
45 |
+
)
|
46 |
+
|
47 |
+
parser.add_argument(
|
48 |
+
"--ngpu",
|
49 |
+
default=None,
|
50 |
+
type=int,
|
51 |
+
help="Number of GPUs. If not given, use all visible devices",
|
52 |
+
)
|
53 |
+
parser.add_argument(
|
54 |
+
"--backend",
|
55 |
+
default="pytorch",
|
56 |
+
type=str,
|
57 |
+
choices=["chainer", "pytorch"],
|
58 |
+
help="Backend library",
|
59 |
+
)
|
60 |
+
parser.add_argument("--outdir", type=str, required=True, help="Output directory")
|
61 |
+
parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
|
62 |
+
parser.add_argument("--seed", default=1, type=int, help="Random seed")
|
63 |
+
parser.add_argument(
|
64 |
+
"--resume",
|
65 |
+
"-r",
|
66 |
+
default="",
|
67 |
+
type=str,
|
68 |
+
nargs="?",
|
69 |
+
help="Resume the training from snapshot",
|
70 |
+
)
|
71 |
+
parser.add_argument(
|
72 |
+
"--minibatches",
|
73 |
+
"-N",
|
74 |
+
type=int,
|
75 |
+
default="-1",
|
76 |
+
help="Process only N minibatches (for debug)",
|
77 |
+
)
|
78 |
+
parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
|
79 |
+
parser.add_argument(
|
80 |
+
"--tensorboard-dir",
|
81 |
+
default=None,
|
82 |
+
type=str,
|
83 |
+
nargs="?",
|
84 |
+
help="Tensorboard log directory path",
|
85 |
+
)
|
86 |
+
parser.add_argument(
|
87 |
+
"--eval-interval-epochs",
|
88 |
+
default=100,
|
89 |
+
type=int,
|
90 |
+
help="Evaluation interval epochs",
|
91 |
+
)
|
92 |
+
parser.add_argument(
|
93 |
+
"--save-interval-epochs", default=1, type=int, help="Save interval epochs"
|
94 |
+
)
|
95 |
+
parser.add_argument(
|
96 |
+
"--report-interval-iters",
|
97 |
+
default=10,
|
98 |
+
type=int,
|
99 |
+
help="Report interval iterations",
|
100 |
+
)
|
101 |
+
# task related
|
102 |
+
parser.add_argument("--srcspk", type=str, help="Source speaker")
|
103 |
+
parser.add_argument("--trgspk", type=str, help="Target speaker")
|
104 |
+
parser.add_argument(
|
105 |
+
"--train-json", type=str, required=True, help="Filename of training json"
|
106 |
+
)
|
107 |
+
parser.add_argument(
|
108 |
+
"--valid-json", type=str, required=True, help="Filename of validation json"
|
109 |
+
)
|
110 |
+
|
111 |
+
# network architecture
|
112 |
+
parser.add_argument(
|
113 |
+
"--model-module",
|
114 |
+
type=str,
|
115 |
+
default="espnet.nets.pytorch_backend.e2e_tts_tacotron2:Tacotron2",
|
116 |
+
help="model defined module",
|
117 |
+
)
|
118 |
+
# minibatch related
|
119 |
+
parser.add_argument(
|
120 |
+
"--sortagrad",
|
121 |
+
default=0,
|
122 |
+
type=int,
|
123 |
+
nargs="?",
|
124 |
+
help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
|
125 |
+
)
|
126 |
+
parser.add_argument(
|
127 |
+
"--batch-sort-key",
|
128 |
+
default="shuffle",
|
129 |
+
type=str,
|
130 |
+
choices=["shuffle", "output", "input"],
|
131 |
+
nargs="?",
|
132 |
+
help='Batch sorting key. "shuffle" only work with --batch-count "seq".',
|
133 |
+
)
|
134 |
+
parser.add_argument(
|
135 |
+
"--batch-count",
|
136 |
+
default="auto",
|
137 |
+
choices=BATCH_COUNT_CHOICES,
|
138 |
+
help="How to count batch_size. "
|
139 |
+
"The default (auto) will find how to count by args.",
|
140 |
+
)
|
141 |
+
parser.add_argument(
|
142 |
+
"--batch-size",
|
143 |
+
"--batch-seqs",
|
144 |
+
"-b",
|
145 |
+
default=0,
|
146 |
+
type=int,
|
147 |
+
help="Maximum seqs in a minibatch (0 to disable)",
|
148 |
+
)
|
149 |
+
parser.add_argument(
|
150 |
+
"--batch-bins",
|
151 |
+
default=0,
|
152 |
+
type=int,
|
153 |
+
help="Maximum bins in a minibatch (0 to disable)",
|
154 |
+
)
|
155 |
+
parser.add_argument(
|
156 |
+
"--batch-frames-in",
|
157 |
+
default=0,
|
158 |
+
type=int,
|
159 |
+
help="Maximum input frames in a minibatch (0 to disable)",
|
160 |
+
)
|
161 |
+
parser.add_argument(
|
162 |
+
"--batch-frames-out",
|
163 |
+
default=0,
|
164 |
+
type=int,
|
165 |
+
help="Maximum output frames in a minibatch (0 to disable)",
|
166 |
+
)
|
167 |
+
parser.add_argument(
|
168 |
+
"--batch-frames-inout",
|
169 |
+
default=0,
|
170 |
+
type=int,
|
171 |
+
help="Maximum input+output frames in a minibatch (0 to disable)",
|
172 |
+
)
|
173 |
+
parser.add_argument(
|
174 |
+
"--maxlen-in",
|
175 |
+
"--batch-seq-maxlen-in",
|
176 |
+
default=100,
|
177 |
+
type=int,
|
178 |
+
metavar="ML",
|
179 |
+
help="When --batch-count=seq, "
|
180 |
+
"batch size is reduced if the input sequence length > ML.",
|
181 |
+
)
|
182 |
+
parser.add_argument(
|
183 |
+
"--maxlen-out",
|
184 |
+
"--batch-seq-maxlen-out",
|
185 |
+
default=200,
|
186 |
+
type=int,
|
187 |
+
metavar="ML",
|
188 |
+
help="When --batch-count=seq, "
|
189 |
+
"batch size is reduced if the output sequence length > ML",
|
190 |
+
)
|
191 |
+
parser.add_argument(
|
192 |
+
"--num-iter-processes",
|
193 |
+
default=0,
|
194 |
+
type=int,
|
195 |
+
help="Number of processes of iterator",
|
196 |
+
)
|
197 |
+
parser.add_argument(
|
198 |
+
"--preprocess-conf",
|
199 |
+
type=str,
|
200 |
+
default=None,
|
201 |
+
help="The configuration file for the pre-processing",
|
202 |
+
)
|
203 |
+
parser.add_argument(
|
204 |
+
"--use-speaker-embedding",
|
205 |
+
default=False,
|
206 |
+
type=strtobool,
|
207 |
+
help="Whether to use speaker embedding",
|
208 |
+
)
|
209 |
+
parser.add_argument(
|
210 |
+
"--use-second-target",
|
211 |
+
default=False,
|
212 |
+
type=strtobool,
|
213 |
+
help="Whether to use second target",
|
214 |
+
)
|
215 |
+
# optimization related
|
216 |
+
parser.add_argument(
|
217 |
+
"--opt",
|
218 |
+
default="adam",
|
219 |
+
type=str,
|
220 |
+
choices=["adam", "noam", "lamb"],
|
221 |
+
help="Optimizer",
|
222 |
+
)
|
223 |
+
parser.add_argument(
|
224 |
+
"--accum-grad", default=1, type=int, help="Number of gradient accumuration"
|
225 |
+
)
|
226 |
+
parser.add_argument(
|
227 |
+
"--lr", default=1e-3, type=float, help="Learning rate for optimizer"
|
228 |
+
)
|
229 |
+
parser.add_argument("--eps", default=1e-6, type=float, help="Epsilon for optimizer")
|
230 |
+
parser.add_argument(
|
231 |
+
"--weight-decay",
|
232 |
+
default=1e-6,
|
233 |
+
type=float,
|
234 |
+
help="Weight decay coefficient for optimizer",
|
235 |
+
)
|
236 |
+
parser.add_argument(
|
237 |
+
"--epochs", "-e", default=30, type=int, help="Number of maximum epochs"
|
238 |
+
)
|
239 |
+
parser.add_argument(
|
240 |
+
"--early-stop-criterion",
|
241 |
+
default="validation/main/loss",
|
242 |
+
type=str,
|
243 |
+
nargs="?",
|
244 |
+
help="Value to monitor to trigger an early stopping of the training",
|
245 |
+
)
|
246 |
+
parser.add_argument(
|
247 |
+
"--patience",
|
248 |
+
default=3,
|
249 |
+
type=int,
|
250 |
+
nargs="?",
|
251 |
+
help="Number of epochs to wait without improvement "
|
252 |
+
"before stopping the training",
|
253 |
+
)
|
254 |
+
parser.add_argument(
|
255 |
+
"--grad-clip", default=1, type=float, help="Gradient norm threshold to clip"
|
256 |
+
)
|
257 |
+
parser.add_argument(
|
258 |
+
"--num-save-attention",
|
259 |
+
default=5,
|
260 |
+
type=int,
|
261 |
+
help="Number of samples of attention to be saved",
|
262 |
+
)
|
263 |
+
parser.add_argument(
|
264 |
+
"--keep-all-data-on-mem",
|
265 |
+
default=False,
|
266 |
+
type=strtobool,
|
267 |
+
help="Whether to keep all data on memory",
|
268 |
+
)
|
269 |
+
|
270 |
+
parser.add_argument(
|
271 |
+
"--enc-init",
|
272 |
+
default=None,
|
273 |
+
type=str,
|
274 |
+
help="Pre-trained model path to initialize encoder.",
|
275 |
+
)
|
276 |
+
parser.add_argument(
|
277 |
+
"--enc-init-mods",
|
278 |
+
default="enc.",
|
279 |
+
type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
|
280 |
+
help="List of encoder modules to initialize, separated by a comma.",
|
281 |
+
)
|
282 |
+
parser.add_argument(
|
283 |
+
"--dec-init",
|
284 |
+
default=None,
|
285 |
+
type=str,
|
286 |
+
help="Pre-trained model path to initialize decoder.",
|
287 |
+
)
|
288 |
+
parser.add_argument(
|
289 |
+
"--dec-init-mods",
|
290 |
+
default="dec.",
|
291 |
+
type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
|
292 |
+
help="List of decoder modules to initialize, separated by a comma.",
|
293 |
+
)
|
294 |
+
parser.add_argument(
|
295 |
+
"--freeze-mods",
|
296 |
+
default=None,
|
297 |
+
type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
|
298 |
+
help="List of modules to freeze (not to train), separated by a comma.",
|
299 |
+
)
|
300 |
+
|
301 |
+
return parser
|
302 |
+
|
303 |
+
|
304 |
+
def main(cmd_args):
|
305 |
+
"""Run training."""
|
306 |
+
parser = get_parser()
|
307 |
+
args, _ = parser.parse_known_args(cmd_args)
|
308 |
+
|
309 |
+
from espnet.utils.dynamic_import import dynamic_import
|
310 |
+
|
311 |
+
model_class = dynamic_import(args.model_module)
|
312 |
+
assert issubclass(model_class, TTSInterface)
|
313 |
+
model_class.add_arguments(parser)
|
314 |
+
args = parser.parse_args(cmd_args)
|
315 |
+
|
316 |
+
# add version info in args
|
317 |
+
args.version = __version__
|
318 |
+
|
319 |
+
# logging info
|
320 |
+
if args.verbose > 0:
|
321 |
+
logging.basicConfig(
|
322 |
+
level=logging.INFO,
|
323 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
324 |
+
)
|
325 |
+
else:
|
326 |
+
logging.basicConfig(
|
327 |
+
level=logging.WARN,
|
328 |
+
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
|
329 |
+
)
|
330 |
+
logging.warning("Skip DEBUG/INFO messages")
|
331 |
+
|
332 |
+
# If --ngpu is not given,
|
333 |
+
# 1. if CUDA_VISIBLE_DEVICES is set, all visible devices
|
334 |
+
# 2. if nvidia-smi exists, use all devices
|
335 |
+
# 3. else ngpu=0
|
336 |
+
if args.ngpu is None:
|
337 |
+
cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
|
338 |
+
if cvd is not None:
|
339 |
+
ngpu = len(cvd.split(","))
|
340 |
+
else:
|
341 |
+
logging.warning("CUDA_VISIBLE_DEVICES is not set.")
|
342 |
+
try:
|
343 |
+
p = subprocess.run(
|
344 |
+
["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
345 |
+
)
|
346 |
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
347 |
+
ngpu = 0
|
348 |
+
else:
|
349 |
+
ngpu = len(p.stderr.decode().split("\n")) - 1
|
350 |
+
else:
|
351 |
+
ngpu = args.ngpu
|
352 |
+
logging.info(f"ngpu: {ngpu}")
|
353 |
+
|
354 |
+
# set random seed
|
355 |
+
logging.info("random seed = %d" % args.seed)
|
356 |
+
random.seed(args.seed)
|
357 |
+
np.random.seed(args.seed)
|
358 |
+
|
359 |
+
if args.backend == "pytorch":
|
360 |
+
from espnet.vc.pytorch_backend.vc import train
|
361 |
+
|
362 |
+
train(args)
|
363 |
+
else:
|
364 |
+
raise NotImplementedError("Only pytorch is supported.")
|
365 |
+
|
366 |
+
|
367 |
+
if __name__ == "__main__":
|
368 |
+
main(sys.argv[1:])
|
espnet/lm/__init__.py
ADDED
@@ -0,0 +1 @@
"""Initialize sub package."""
espnet/lm/chainer_backend/__init__.py
ADDED
@@ -0,0 +1 @@
"""Initialize sub package."""
espnet/lm/chainer_backend/extlm.py
ADDED
@@ -0,0 +1,199 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
# Copyright 2018 Mitsubishi Electric Research Laboratories (Takaaki Hori)
|
4 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
5 |
+
|
6 |
+
|
7 |
+
import math
|
8 |
+
|
9 |
+
import chainer
|
10 |
+
import chainer.functions as F
|
11 |
+
from espnet.lm.lm_utils import make_lexical_tree
|
12 |
+
|
13 |
+
|
14 |
+
# Definition of a multi-level (subword/word) language model
|
15 |
+
class MultiLevelLM(chainer.Chain):
|
16 |
+
logzero = -10000000000.0
|
17 |
+
zero = 1.0e-10
|
18 |
+
|
19 |
+
def __init__(
|
20 |
+
self,
|
21 |
+
wordlm,
|
22 |
+
subwordlm,
|
23 |
+
word_dict,
|
24 |
+
subword_dict,
|
25 |
+
subwordlm_weight=0.8,
|
26 |
+
oov_penalty=1.0,
|
27 |
+
open_vocab=True,
|
28 |
+
):
|
29 |
+
super(MultiLevelLM, self).__init__()
|
30 |
+
self.wordlm = wordlm
|
31 |
+
self.subwordlm = subwordlm
|
32 |
+
self.word_eos = word_dict["<eos>"]
|
33 |
+
self.word_unk = word_dict["<unk>"]
|
34 |
+
self.xp_word_eos = self.xp.full(1, self.word_eos, "i")
|
35 |
+
self.xp_word_unk = self.xp.full(1, self.word_unk, "i")
|
36 |
+
self.space = subword_dict["<space>"]
|
37 |
+
self.eos = subword_dict["<eos>"]
|
38 |
+
self.lexroot = make_lexical_tree(word_dict, subword_dict, self.word_unk)
|
39 |
+
self.log_oov_penalty = math.log(oov_penalty)
|
40 |
+
self.open_vocab = open_vocab
|
41 |
+
self.subword_dict_size = len(subword_dict)
|
42 |
+
self.subwordlm_weight = subwordlm_weight
|
43 |
+
self.normalized = True
|
44 |
+
|
45 |
+
def __call__(self, state, x):
|
46 |
+
# update state with input label x
|
47 |
+
if state is None: # make initial states and log-prob vectors
|
48 |
+
wlm_state, z_wlm = self.wordlm(None, self.xp_word_eos)
|
49 |
+
wlm_logprobs = F.log_softmax(z_wlm).data
|
50 |
+
clm_state, z_clm = self.subwordlm(None, x)
|
51 |
+
log_y = F.log_softmax(z_clm).data * self.subwordlm_weight
|
52 |
+
new_node = self.lexroot
|
53 |
+
clm_logprob = 0.0
|
54 |
+
xi = self.space
|
55 |
+
else:
|
56 |
+
clm_state, wlm_state, wlm_logprobs, node, log_y, clm_logprob = state
|
57 |
+
xi = int(x)
|
58 |
+
if xi == self.space: # inter-word transition
|
59 |
+
if node is not None and node[1] >= 0: # check if the node is word end
|
60 |
+
w = self.xp.full(1, node[1], "i")
|
61 |
+
else: # this node is not a word end, which means <unk>
|
62 |
+
w = self.xp_word_unk
|
63 |
+
# update wordlm state and log-prob vector
|
64 |
+
wlm_state, z_wlm = self.wordlm(wlm_state, w)
|
65 |
+
wlm_logprobs = F.log_softmax(z_wlm).data
|
66 |
+
new_node = self.lexroot # move to the tree root
|
67 |
+
clm_logprob = 0.0
|
68 |
+
elif node is not None and xi in node[0]: # intra-word transition
|
69 |
+
new_node = node[0][xi]
|
70 |
+
clm_logprob += log_y[0, xi]
|
71 |
+
elif self.open_vocab: # if no path in the tree, enter open-vocabulary mode
|
72 |
+
new_node = None
|
73 |
+
clm_logprob += log_y[0, xi]
|
74 |
+
else: # if open_vocab flag is disabled, return 0 probabilities
|
75 |
+
log_y = self.xp.full((1, self.subword_dict_size), self.logzero, "f")
|
76 |
+
return (clm_state, wlm_state, None, log_y, 0.0), log_y
|
77 |
+
|
78 |
+
clm_state, z_clm = self.subwordlm(clm_state, x)
|
79 |
+
log_y = F.log_softmax(z_clm).data * self.subwordlm_weight
|
80 |
+
|
81 |
+
# apply word-level probabilies for <space> and <eos> labels
|
82 |
+
if xi != self.space:
|
83 |
+
if new_node is not None and new_node[1] >= 0: # if new node is word end
|
84 |
+
wlm_logprob = wlm_logprobs[:, new_node[1]] - clm_logprob
|
85 |
+
else:
|
86 |
+
wlm_logprob = wlm_logprobs[:, self.word_unk] + self.log_oov_penalty
|
87 |
+
log_y[:, self.space] = wlm_logprob
|
88 |
+
log_y[:, self.eos] = wlm_logprob
|
89 |
+
else:
|
90 |
+
log_y[:, self.space] = self.logzero
|
91 |
+
log_y[:, self.eos] = self.logzero
|
92 |
+
|
93 |
+
return (clm_state, wlm_state, wlm_logprobs, new_node, log_y, clm_logprob), log_y
|
94 |
+
|
95 |
+
def final(self, state):
|
96 |
+
clm_state, wlm_state, wlm_logprobs, node, log_y, clm_logprob = state
|
97 |
+
if node is not None and node[1] >= 0: # check if the node is word end
|
98 |
+
w = self.xp.full(1, node[1], "i")
|
99 |
+
else: # this node is not a word end, which means <unk>
|
100 |
+
w = self.xp_word_unk
|
101 |
+
wlm_state, z_wlm = self.wordlm(wlm_state, w)
|
102 |
+
return F.log_softmax(z_wlm).data[:, self.word_eos]
|
103 |
+
|
104 |
+
|
105 |
+
# Definition of a look-ahead word language model
|
106 |
+
class LookAheadWordLM(chainer.Chain):
|
107 |
+
logzero = -10000000000.0
|
108 |
+
zero = 1.0e-10
|
109 |
+
|
110 |
+
def __init__(
|
111 |
+
self, wordlm, word_dict, subword_dict, oov_penalty=0.0001, open_vocab=True
|
112 |
+
):
|
113 |
+
super(LookAheadWordLM, self).__init__()
|
114 |
+
self.wordlm = wordlm
|
115 |
+
self.word_eos = word_dict["<eos>"]
|
116 |
+
self.word_unk = word_dict["<unk>"]
|
117 |
+
self.xp_word_eos = self.xp.full(1, self.word_eos, "i")
|
118 |
+
self.xp_word_unk = self.xp.full(1, self.word_unk, "i")
|
119 |
+
self.space = subword_dict["<space>"]
|
120 |
+
self.eos = subword_dict["<eos>"]
|
121 |
+
self.lexroot = make_lexical_tree(word_dict, subword_dict, self.word_unk)
|
122 |
+
self.oov_penalty = oov_penalty
|
123 |
+
self.open_vocab = open_vocab
|
124 |
+
self.subword_dict_size = len(subword_dict)
|
125 |
+
self.normalized = True
|
126 |
+
|
127 |
+
def __call__(self, state, x):
|
128 |
+
# update state with input label x
|
129 |
+
if state is None: # make initial states and cumlative probability vector
|
130 |
+
wlm_state, z_wlm = self.wordlm(None, self.xp_word_eos)
|
131 |
+
cumsum_probs = self.xp.cumsum(F.softmax(z_wlm).data, axis=1)
|
132 |
+
new_node = self.lexroot
|
133 |
+
xi = self.space
|
134 |
+
else:
|
135 |
+
wlm_state, cumsum_probs, node = state
|
136 |
+
xi = int(x)
|
137 |
+
if xi == self.space: # inter-word transition
|
138 |
+
if node is not None and node[1] >= 0: # check if the node is word end
|
139 |
+
w = self.xp.full(1, node[1], "i")
|
140 |
+
else: # this node is not a word end, which means <unk>
|
141 |
+
w = self.xp_word_unk
|
142 |
+
# update wordlm state and cumlative probability vector
|
143 |
+
wlm_state, z_wlm = self.wordlm(wlm_state, w)
|
144 |
+
cumsum_probs = self.xp.cumsum(F.softmax(z_wlm).data, axis=1)
|
145 |
+
new_node = self.lexroot # move to the tree root
|
146 |
+
elif node is not None and xi in node[0]: # intra-word transition
|
147 |
+
new_node = node[0][xi]
|
148 |
+
elif self.open_vocab: # if no path in the tree, enter open-vocabulary mode
|
149 |
+
new_node = None
|
150 |
+
else: # if open_vocab flag is disabled, return 0 probabilities
|
151 |
+
log_y = self.xp.full((1, self.subword_dict_size), self.logzero, "f")
|
152 |
+
return (wlm_state, None, None), log_y
|
153 |
+
|
154 |
+
if new_node is not None:
|
155 |
+
succ, wid, wids = new_node
|
156 |
+
# compute parent node probability
|
157 |
+
sum_prob = (
|
158 |
+
(cumsum_probs[:, wids[1]] - cumsum_probs[:, wids[0]])
|
159 |
+
if wids is not None
|
160 |
+
else 1.0
|
161 |
+
)
|
162 |
+
if sum_prob < self.zero:
|
163 |
+
log_y = self.xp.full((1, self.subword_dict_size), self.logzero, "f")
|
164 |
+
return (wlm_state, cumsum_probs, new_node), log_y
|
165 |
+
# set <unk> probability as a default value
|
166 |
+
unk_prob = (
|
167 |
+
cumsum_probs[:, self.word_unk] - cumsum_probs[:, self.word_unk - 1]
|
168 |
+
)
|
169 |
+
y = self.xp.full(
|
170 |
+
(1, self.subword_dict_size), unk_prob * self.oov_penalty, "f"
|
171 |
+
)
|
172 |
+
# compute transition probabilities to child nodes
|
173 |
+
for cid, nd in succ.items():
|
174 |
+
y[:, cid] = (
|
175 |
+
cumsum_probs[:, nd[2][1]] - cumsum_probs[:, nd[2][0]]
|
176 |
+
) / sum_prob
|
177 |
+
# apply word-level probabilies for <space> and <eos> labels
|
178 |
+
if wid >= 0:
|
179 |
+
wlm_prob = (cumsum_probs[:, wid] - cumsum_probs[:, wid - 1]) / sum_prob
|
180 |
+
y[:, self.space] = wlm_prob
|
181 |
+
y[:, self.eos] = wlm_prob
|
182 |
+
elif xi == self.space:
|
183 |
+
y[:, self.space] = self.zero
|
184 |
+
y[:, self.eos] = self.zero
|
185 |
+
log_y = self.xp.log(
|
186 |
+
self.xp.clip(y, self.zero, None)
|
187 |
+
) # clip to avoid log(0)
|
188 |
+
else: # if no path in the tree, transition probability is one
|
189 |
+
log_y = self.xp.zeros((1, self.subword_dict_size), "f")
|
190 |
+
return (wlm_state, cumsum_probs, new_node), log_y
|
191 |
+
|
192 |
+
def final(self, state):
|
193 |
+
wlm_state, cumsum_probs, node = state
|
194 |
+
if node is not None and node[1] >= 0: # check if the node is word end
|
195 |
+
w = self.xp.full(1, node[1], "i")
|
196 |
+
else: # this node is not a word end, which means <unk>
|
197 |
+
w = self.xp_word_unk
|
198 |
+
wlm_state, z_wlm = self.wordlm(wlm_state, w)
|
199 |
+
return F.log_softmax(z_wlm).data[:, self.word_eos]
|
espnet/lm/chainer_backend/lm.py
ADDED
@@ -0,0 +1,484 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
|
4 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
5 |
+
|
6 |
+
# This code is ported from the following implementation written in Torch.
|
7 |
+
# https://github.com/chainer/chainer/blob/master/examples/ptb/train_ptb_custom_loop.py
|
8 |
+
|
9 |
+
|
10 |
+
import copy
|
11 |
+
import json
|
12 |
+
import logging
|
13 |
+
import numpy as np
|
14 |
+
import six
|
15 |
+
|
16 |
+
import chainer
|
17 |
+
from chainer.dataset import convert
|
18 |
+
import chainer.functions as F
|
19 |
+
import chainer.links as L
|
20 |
+
|
21 |
+
# for classifier link
|
22 |
+
from chainer.functions.loss import softmax_cross_entropy
|
23 |
+
from chainer import link
|
24 |
+
from chainer import reporter
|
25 |
+
from chainer import training
|
26 |
+
from chainer.training import extensions
|
27 |
+
|
28 |
+
from espnet.lm.lm_utils import compute_perplexity
|
29 |
+
from espnet.lm.lm_utils import count_tokens
|
30 |
+
from espnet.lm.lm_utils import MakeSymlinkToBestModel
|
31 |
+
from espnet.lm.lm_utils import ParallelSentenceIterator
|
32 |
+
from espnet.lm.lm_utils import read_tokens
|
33 |
+
|
34 |
+
import espnet.nets.chainer_backend.deterministic_embed_id as DL
|
35 |
+
from espnet.nets.lm_interface import LMInterface
|
36 |
+
from espnet.optimizer.factory import dynamic_import_optimizer
|
37 |
+
from espnet.scheduler.chainer import ChainerScheduler
|
38 |
+
from espnet.scheduler.scheduler import dynamic_import_scheduler
|
39 |
+
|
40 |
+
from espnet.utils.training.tensorboard_logger import TensorboardLogger
|
41 |
+
from tensorboardX import SummaryWriter
|
42 |
+
|
43 |
+
from espnet.utils.deterministic_utils import set_deterministic_chainer
|
44 |
+
from espnet.utils.training.evaluator import BaseEvaluator
|
45 |
+
from espnet.utils.training.iterators import ShufflingEnabler
|
46 |
+
from espnet.utils.training.train_utils import check_early_stop
|
47 |
+
from espnet.utils.training.train_utils import set_early_stop
|
48 |
+
|
49 |
+
|
50 |
+
# TODO(karita): reimplement RNNLM with new interface
|
51 |
+
class DefaultRNNLM(LMInterface, link.Chain):
|
52 |
+
"""Default RNNLM wrapper to compute reduce framewise loss values.
|
53 |
+
|
54 |
+
Args:
|
55 |
+
n_vocab (int): The size of the vocabulary
|
56 |
+
args (argparse.Namespace): configurations. see `add_arguments`
|
57 |
+
"""
|
58 |
+
|
59 |
+
@staticmethod
|
60 |
+
def add_arguments(parser):
|
61 |
+
parser.add_argument(
|
62 |
+
"--type",
|
63 |
+
type=str,
|
64 |
+
default="lstm",
|
65 |
+
nargs="?",
|
66 |
+
choices=["lstm", "gru"],
|
67 |
+
help="Which type of RNN to use",
|
68 |
+
)
|
69 |
+
parser.add_argument(
|
70 |
+
"--layer", "-l", type=int, default=2, help="Number of hidden layers"
|
71 |
+
)
|
72 |
+
parser.add_argument(
|
73 |
+
"--unit", "-u", type=int, default=650, help="Number of hidden units"
|
74 |
+
)
|
75 |
+
return parser
|
76 |
+
|
77 |
+
|
78 |
+
class ClassifierWithState(link.Chain):
|
79 |
+
"""A wrapper for a chainer RNNLM
|
80 |
+
|
81 |
+
:param link.Chain predictor : The RNNLM
|
82 |
+
:param function lossfun: The loss function to use
|
83 |
+
:param int/str label_key:
|
84 |
+
"""
|
85 |
+
|
86 |
+
def __init__(
|
87 |
+
self,
|
88 |
+
predictor,
|
89 |
+
lossfun=softmax_cross_entropy.softmax_cross_entropy,
|
90 |
+
label_key=-1,
|
91 |
+
):
|
92 |
+
if not (isinstance(label_key, (int, str))):
|
93 |
+
raise TypeError("label_key must be int or str, but is %s" % type(label_key))
|
94 |
+
|
95 |
+
super(ClassifierWithState, self).__init__()
|
96 |
+
self.lossfun = lossfun
|
97 |
+
self.y = None
|
98 |
+
self.loss = None
|
99 |
+
self.label_key = label_key
|
100 |
+
|
101 |
+
with self.init_scope():
|
102 |
+
self.predictor = predictor
|
103 |
+
|
104 |
+
def __call__(self, state, *args, **kwargs):
|
105 |
+
"""Computes the loss value for an input and label pair.
|
106 |
+
|
107 |
+
It also computes accuracy and stores it to the attribute.
|
108 |
+
When ``label_key`` is ``int``, the corresponding element in ``args``
|
109 |
+
is treated as ground truth labels. And when it is ``str``, the
|
110 |
+
element in ``kwargs`` is used.
|
111 |
+
The all elements of ``args`` and ``kwargs`` except the groundtruth
|
112 |
+
labels are features.
|
113 |
+
It feeds features to the predictor and compare the result
|
114 |
+
with ground truth labels.
|
115 |
+
|
116 |
+
:param state : The LM state
|
117 |
+
:param list[chainer.Variable] args : Input minibatch
|
118 |
+
:param dict[chainer.Variable] kwargs : Input minibatch
|
119 |
+
:return loss value
|
120 |
+
:rtype chainer.Variable
|
121 |
+
"""
|
122 |
+
|
123 |
+
if isinstance(self.label_key, int):
|
124 |
+
if not (-len(args) <= self.label_key < len(args)):
|
125 |
+
msg = "Label key %d is out of bounds" % self.label_key
|
126 |
+
raise ValueError(msg)
|
127 |
+
t = args[self.label_key]
|
128 |
+
if self.label_key == -1:
|
129 |
+
args = args[:-1]
|
130 |
+
else:
|
131 |
+
args = args[: self.label_key] + args[self.label_key + 1 :]
|
132 |
+
elif isinstance(self.label_key, str):
|
133 |
+
if self.label_key not in kwargs:
|
134 |
+
msg = 'Label key "%s" is not found' % self.label_key
|
135 |
+
raise ValueError(msg)
|
136 |
+
t = kwargs[self.label_key]
|
137 |
+
del kwargs[self.label_key]
|
138 |
+
|
139 |
+
self.y = None
|
140 |
+
self.loss = None
|
141 |
+
state, self.y = self.predictor(state, *args, **kwargs)
|
142 |
+
self.loss = self.lossfun(self.y, t)
|
143 |
+
return state, self.loss
|
144 |
+
|
145 |
+
def predict(self, state, x):
|
146 |
+
"""Predict log probabilities for given state and input x using the predictor
|
147 |
+
|
148 |
+
:param state : the state
|
149 |
+
:param x : the input
|
150 |
+
:return a tuple (state, log prob vector)
|
151 |
+
:rtype cupy/numpy array
|
152 |
+
"""
|
153 |
+
if hasattr(self.predictor, "normalized") and self.predictor.normalized:
|
154 |
+
return self.predictor(state, x)
|
155 |
+
else:
|
156 |
+
state, z = self.predictor(state, x)
|
157 |
+
return state, F.log_softmax(z).data
|
158 |
+
|
159 |
+
def final(self, state):
|
160 |
+
"""Predict final log probabilities for given state using the predictor
|
161 |
+
|
162 |
+
:param state : the state
|
163 |
+
:return log probability vector
|
164 |
+
:rtype cupy/numpy array
|
165 |
+
|
166 |
+
"""
|
167 |
+
if hasattr(self.predictor, "final"):
|
168 |
+
return self.predictor.final(state)
|
169 |
+
else:
|
170 |
+
return 0.0
|
171 |
+
|
172 |
+
|
173 |
+
# Definition of a recurrent net for language modeling
|
174 |
+
class RNNLM(chainer.Chain):
|
175 |
+
"""A chainer RNNLM
|
176 |
+
|
177 |
+
:param int n_vocab: The size of the vocabulary
|
178 |
+
:param int n_layers: The number of layers to create
|
179 |
+
:param int n_units: The number of units per layer
|
180 |
+
:param str type: The RNN type
|
181 |
+
"""
|
182 |
+
|
183 |
+
def __init__(self, n_vocab, n_layers, n_units, typ="lstm"):
|
184 |
+
super(RNNLM, self).__init__()
|
185 |
+
with self.init_scope():
|
186 |
+
self.embed = DL.EmbedID(n_vocab, n_units)
|
187 |
+
self.rnn = (
|
188 |
+
chainer.ChainList(
|
189 |
+
*[L.StatelessLSTM(n_units, n_units) for _ in range(n_layers)]
|
190 |
+
)
|
191 |
+
if typ == "lstm"
|
192 |
+
else chainer.ChainList(
|
193 |
+
*[L.StatelessGRU(n_units, n_units) for _ in range(n_layers)]
|
194 |
+
)
|
195 |
+
)
|
196 |
+
self.lo = L.Linear(n_units, n_vocab)
|
197 |
+
|
198 |
+
for param in self.params():
|
199 |
+
param.data[...] = np.random.uniform(-0.1, 0.1, param.data.shape)
|
200 |
+
self.n_layers = n_layers
|
201 |
+
self.n_units = n_units
|
202 |
+
self.typ = typ
|
203 |
+
|
204 |
+
def __call__(self, state, x):
|
205 |
+
if state is None:
|
206 |
+
if self.typ == "lstm":
|
207 |
+
state = {"c": [None] * self.n_layers, "h": [None] * self.n_layers}
|
208 |
+
else:
|
209 |
+
state = {"h": [None] * self.n_layers}
|
210 |
+
|
211 |
+
h = [None] * self.n_layers
|
212 |
+
emb = self.embed(x)
|
213 |
+
if self.typ == "lstm":
|
214 |
+
c = [None] * self.n_layers
|
215 |
+
c[0], h[0] = self.rnn[0](state["c"][0], state["h"][0], F.dropout(emb))
|
216 |
+
for n in six.moves.range(1, self.n_layers):
|
217 |
+
c[n], h[n] = self.rnn[n](
|
218 |
+
state["c"][n], state["h"][n], F.dropout(h[n - 1])
|
219 |
+
)
|
220 |
+
state = {"c": c, "h": h}
|
221 |
+
else:
|
222 |
+
if state["h"][0] is None:
|
223 |
+
xp = self.xp
|
224 |
+
with chainer.backends.cuda.get_device_from_id(self._device_id):
|
225 |
+
state["h"][0] = chainer.Variable(
|
226 |
+
xp.zeros((emb.shape[0], self.n_units), dtype=emb.dtype)
|
227 |
+
)
|
228 |
+
h[0] = self.rnn[0](state["h"][0], F.dropout(emb))
|
229 |
+
for n in six.moves.range(1, self.n_layers):
|
230 |
+
if state["h"][n] is None:
|
231 |
+
xp = self.xp
|
232 |
+
with chainer.backends.cuda.get_device_from_id(self._device_id):
|
233 |
+
state["h"][n] = chainer.Variable(
|
234 |
+
xp.zeros(
|
235 |
+
(h[n - 1].shape[0], self.n_units), dtype=h[n - 1].dtype
|
236 |
+
)
|
237 |
+
)
|
238 |
+
h[n] = self.rnn[n](state["h"][n], F.dropout(h[n - 1]))
|
239 |
+
state = {"h": h}
|
240 |
+
y = self.lo(F.dropout(h[-1]))
|
241 |
+
return state, y
|
242 |
+
|
243 |
+
|
244 |
+
class BPTTUpdater(training.updaters.StandardUpdater):
|
245 |
+
"""An updater for a chainer LM
|
246 |
+
|
247 |
+
:param chainer.dataset.Iterator train_iter : The train iterator
|
248 |
+
:param optimizer:
|
249 |
+
:param schedulers:
|
250 |
+
:param int device : The device id
|
251 |
+
:param int accum_grad :
|
252 |
+
"""
|
253 |
+
|
254 |
+
def __init__(self, train_iter, optimizer, schedulers, device, accum_grad):
|
255 |
+
super(BPTTUpdater, self).__init__(train_iter, optimizer, device=device)
|
256 |
+
self.scheduler = ChainerScheduler(schedulers, optimizer)
|
257 |
+
self.accum_grad = accum_grad
|
258 |
+
|
259 |
+
# The core part of the update routine can be customized by overriding.
|
260 |
+
def update_core(self):
|
261 |
+
# When we pass one iterator and optimizer to StandardUpdater.__init__,
|
262 |
+
# they are automatically named 'main'.
|
263 |
+
train_iter = self.get_iterator("main")
|
264 |
+
optimizer = self.get_optimizer("main")
|
265 |
+
|
266 |
+
count = 0
|
267 |
+
sum_loss = 0
|
268 |
+
optimizer.target.cleargrads() # Clear the parameter gradients
|
269 |
+
for _ in range(self.accum_grad):
|
270 |
+
# Progress the dataset iterator for sentences at each iteration.
|
271 |
+
batch = train_iter.__next__()
|
272 |
+
x, t = convert.concat_examples(batch, device=self.device, padding=(0, -1))
|
273 |
+
# Concatenate the token IDs to matrices and send them to the device
|
274 |
+
# self.converter does this job
|
275 |
+
# (it is chainer.dataset.concat_examples by default)
|
276 |
+
xp = chainer.backends.cuda.get_array_module(x)
|
277 |
+
loss = 0
|
278 |
+
state = None
|
279 |
+
batch_size, sequence_length = x.shape
|
280 |
+
for i in six.moves.range(sequence_length):
|
281 |
+
# Compute the loss at this time step and accumulate it
|
282 |
+
state, loss_batch = optimizer.target(
|
283 |
+
state, chainer.Variable(x[:, i]), chainer.Variable(t[:, i])
|
284 |
+
)
|
285 |
+
non_zeros = xp.count_nonzero(x[:, i])
|
286 |
+
loss += loss_batch * non_zeros
|
287 |
+
count += int(non_zeros)
|
288 |
+
# backward
|
289 |
+
loss /= batch_size * self.accum_grad # normalized by batch size
|
290 |
+
sum_loss += float(loss.data)
|
291 |
+
loss.backward() # Backprop
|
292 |
+
loss.unchain_backward() # Truncate the graph
|
293 |
+
|
294 |
+
reporter.report({"loss": sum_loss}, optimizer.target)
|
295 |
+
reporter.report({"count": count}, optimizer.target)
|
296 |
+
# update
|
297 |
+
optimizer.update() # Update the parameters
|
298 |
+
self.scheduler.step(self.iteration)
|
299 |
+
|
300 |
+
|
301 |
+
class LMEvaluator(BaseEvaluator):
|
302 |
+
"""A custom evaluator for a chainer LM
|
303 |
+
|
304 |
+
:param chainer.dataset.Iterator val_iter : The validation iterator
|
305 |
+
:param eval_model : The model to evaluate
|
306 |
+
:param int device : The device id to use
|
307 |
+
"""
|
308 |
+
|
309 |
+
def __init__(self, val_iter, eval_model, device):
|
310 |
+
super(LMEvaluator, self).__init__(val_iter, eval_model, device=device)
|
311 |
+
|
312 |
+
def evaluate(self):
|
313 |
+
val_iter = self.get_iterator("main")
|
314 |
+
target = self.get_target("main")
|
315 |
+
loss = 0
|
316 |
+
count = 0
|
317 |
+
for batch in copy.copy(val_iter):
|
318 |
+
x, t = convert.concat_examples(batch, device=self.device, padding=(0, -1))
|
319 |
+
xp = chainer.backends.cuda.get_array_module(x)
|
320 |
+
state = None
|
321 |
+
for i in six.moves.range(len(x[0])):
|
322 |
+
state, loss_batch = target(state, x[:, i], t[:, i])
|
323 |
+
non_zeros = xp.count_nonzero(x[:, i])
|
324 |
+
loss += loss_batch.data * non_zeros
|
325 |
+
count += int(non_zeros)
|
326 |
+
# report validation loss
|
327 |
+
observation = {}
|
328 |
+
with reporter.report_scope(observation):
|
329 |
+
reporter.report({"loss": float(loss / count)}, target)
|
330 |
+
return observation
|
331 |
+
|
332 |
+
|
333 |
+
def train(args):
|
334 |
+
"""Train with the given args
|
335 |
+
|
336 |
+
:param Namespace args: The program arguments
|
337 |
+
"""
|
338 |
+
# TODO(karita): support this
|
339 |
+
if args.model_module != "default":
|
340 |
+
raise NotImplementedError("chainer backend does not support --model-module")
|
341 |
+
|
342 |
+
# display chainer version
|
343 |
+
logging.info("chainer version = " + chainer.__version__)
|
344 |
+
|
345 |
+
set_deterministic_chainer(args)
|
346 |
+
|
347 |
+
# check cuda and cudnn availability
|
348 |
+
if not chainer.cuda.available:
|
349 |
+
logging.warning("cuda is not available")
|
350 |
+
if not chainer.cuda.cudnn_enabled:
|
351 |
+
logging.warning("cudnn is not available")
|
352 |
+
|
353 |
+
# get special label ids
|
354 |
+
unk = args.char_list_dict["<unk>"]
|
355 |
+
eos = args.char_list_dict["<eos>"]
|
356 |
+
# read tokens as a sequence of sentences
|
357 |
+
train = read_tokens(args.train_label, args.char_list_dict)
|
358 |
+
val = read_tokens(args.valid_label, args.char_list_dict)
|
359 |
+
# count tokens
|
360 |
+
n_train_tokens, n_train_oovs = count_tokens(train, unk)
|
361 |
+
n_val_tokens, n_val_oovs = count_tokens(val, unk)
|
362 |
+
logging.info("#vocab = " + str(args.n_vocab))
|
363 |
+
logging.info("#sentences in the training data = " + str(len(train)))
|
364 |
+
logging.info("#tokens in the training data = " + str(n_train_tokens))
|
365 |
+
logging.info(
|
366 |
+
"oov rate in the training data = %.2f %%"
|
367 |
+
% (n_train_oovs / n_train_tokens * 100)
|
368 |
+
)
|
369 |
+
logging.info("#sentences in the validation data = " + str(len(val)))
|
370 |
+
logging.info("#tokens in the validation data = " + str(n_val_tokens))
|
371 |
+
logging.info(
|
372 |
+
"oov rate in the validation data = %.2f %%" % (n_val_oovs / n_val_tokens * 100)
|
373 |
+
)
|
374 |
+
|
375 |
+
use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
|
376 |
+
|
377 |
+
# Create the dataset iterators
|
378 |
+
train_iter = ParallelSentenceIterator(
|
379 |
+
train,
|
380 |
+
args.batchsize,
|
381 |
+
max_length=args.maxlen,
|
382 |
+
sos=eos,
|
383 |
+
eos=eos,
|
384 |
+
shuffle=not use_sortagrad,
|
385 |
+
)
|
386 |
+
val_iter = ParallelSentenceIterator(
|
387 |
+
val, args.batchsize, max_length=args.maxlen, sos=eos, eos=eos, repeat=False
|
388 |
+
)
|
389 |
+
epoch_iters = int(len(train_iter.batch_indices) / args.accum_grad)
|
390 |
+
logging.info("#iterations per epoch = %d" % epoch_iters)
|
391 |
+
logging.info("#total iterations = " + str(args.epoch * epoch_iters))
|
392 |
+
# Prepare an RNNLM model
|
393 |
+
rnn = RNNLM(args.n_vocab, args.layer, args.unit, args.type)
|
394 |
+
model = ClassifierWithState(rnn)
|
395 |
+
if args.ngpu > 1:
|
396 |
+
logging.warning("currently, multi-gpu is not supported. use single gpu.")
|
397 |
+
if args.ngpu > 0:
|
398 |
+
# Make the specified GPU current
|
399 |
+
gpu_id = 0
|
400 |
+
chainer.cuda.get_device_from_id(gpu_id).use()
|
401 |
+
model.to_gpu()
|
402 |
+
else:
|
403 |
+
gpu_id = -1
|
404 |
+
|
405 |
+
# Save model conf to json
|
406 |
+
model_conf = args.outdir + "/model.json"
|
407 |
+
with open(model_conf, "wb") as f:
|
408 |
+
logging.info("writing a model config file to " + model_conf)
|
409 |
+
f.write(
|
410 |
+
json.dumps(vars(args), indent=4, ensure_ascii=False, sort_keys=True).encode(
|
411 |
+
"utf_8"
|
412 |
+
)
|
413 |
+
)
|
414 |
+
|
415 |
+
# Set up an optimizer
|
416 |
+
opt_class = dynamic_import_optimizer(args.opt, args.backend)
|
417 |
+
optimizer = opt_class.from_args(model, args)
|
418 |
+
if args.schedulers is None:
|
419 |
+
schedulers = []
|
420 |
+
else:
|
421 |
+
schedulers = [dynamic_import_scheduler(v)(k, args) for k, v in args.schedulers]
|
422 |
+
|
423 |
+
optimizer.setup(model)
|
424 |
+
optimizer.add_hook(chainer.optimizer.GradientClipping(args.gradclip))
|
425 |
+
|
426 |
+
updater = BPTTUpdater(train_iter, optimizer, schedulers, gpu_id, args.accum_grad)
|
427 |
+
trainer = training.Trainer(updater, (args.epoch, "epoch"), out=args.outdir)
|
428 |
+
trainer.extend(LMEvaluator(val_iter, model, device=gpu_id))
|
429 |
+
trainer.extend(
|
430 |
+
extensions.LogReport(
|
431 |
+
postprocess=compute_perplexity,
|
432 |
+
trigger=(args.report_interval_iters, "iteration"),
|
433 |
+
)
|
434 |
+
)
|
435 |
+
trainer.extend(
|
436 |
+
extensions.PrintReport(
|
437 |
+
["epoch", "iteration", "perplexity", "val_perplexity", "elapsed_time"]
|
438 |
+
),
|
439 |
+
trigger=(args.report_interval_iters, "iteration"),
|
440 |
+
)
|
441 |
+
trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
|
442 |
+
trainer.extend(extensions.snapshot(filename="snapshot.ep.{.updater.epoch}"))
|
443 |
+
trainer.extend(extensions.snapshot_object(model, "rnnlm.model.{.updater.epoch}"))
|
444 |
+
# MEMO(Hori): we want to use MinValueTrigger, but it seems to fail when resuming
|
445 |
+
trainer.extend(MakeSymlinkToBestModel("validation/main/loss", "rnnlm.model"))
|
446 |
+
|
447 |
+
if use_sortagrad:
|
448 |
+
trainer.extend(
|
449 |
+
ShufflingEnabler([train_iter]),
|
450 |
+
trigger=(args.sortagrad if args.sortagrad != -1 else args.epoch, "epoch"),
|
451 |
+
)
|
452 |
+
|
453 |
+
if args.resume:
|
454 |
+
logging.info("resumed from %s" % args.resume)
|
455 |
+
chainer.serializers.load_npz(args.resume, trainer)
|
456 |
+
|
457 |
+
set_early_stop(trainer, args, is_lm=True)
|
458 |
+
if args.tensorboard_dir is not None and args.tensorboard_dir != "":
|
459 |
+
writer = SummaryWriter(args.tensorboard_dir)
|
460 |
+
trainer.extend(
|
461 |
+
TensorboardLogger(writer), trigger=(args.report_interval_iters, "iteration")
|
462 |
+
)
|
463 |
+
|
464 |
+
trainer.run()
|
465 |
+
check_early_stop(trainer, args.epoch)
|
466 |
+
|
467 |
+
# compute perplexity for test set
|
468 |
+
if args.test_label:
|
469 |
+
logging.info("test the best model")
|
470 |
+
chainer.serializers.load_npz(args.outdir + "/rnnlm.model.best", model)
|
471 |
+
test = read_tokens(args.test_label, args.char_list_dict)
|
472 |
+
n_test_tokens, n_test_oovs = count_tokens(test, unk)
|
473 |
+
logging.info("#sentences in the test data = " + str(len(test)))
|
474 |
+
logging.info("#tokens in the test data = " + str(n_test_tokens))
|
475 |
+
logging.info(
|
476 |
+
"oov rate in the test data = %.2f %%" % (n_test_oovs / n_test_tokens * 100)
|
477 |
+
)
|
478 |
+
test_iter = ParallelSentenceIterator(
|
479 |
+
test, args.batchsize, max_length=args.maxlen, sos=eos, eos=eos, repeat=False
|
480 |
+
)
|
481 |
+
evaluator = LMEvaluator(test_iter, model, device=gpu_id)
|
482 |
+
with chainer.using_config("train", False):
|
483 |
+
result = evaluator()
|
484 |
+
logging.info("test perplexity: " + str(np.exp(float(result["main/loss"]))))
|
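The chainer training loop above reports perplexity as the exponential of the average per-token loss, both through compute_perplexity during training and in the final test-set evaluation (np.exp of main/loss). A minimal sketch of that relation, with made-up token counts and losses (not from any recipe):

import numpy as np

# hypothetical accumulated statistics from an LM evaluator (illustrative numbers only)
total_loss = 1543.2   # sum of per-token negative log-likelihoods
n_tokens = 800        # number of scored tokens

avg_nll = total_loss / n_tokens
perplexity = np.exp(avg_nll)  # same exp(loss / count) relation used by compute_perplexity
print(f"average nll = {avg_nll:.3f}, perplexity = {perplexity:.1f}")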
espnet/lm/lm_utils.py
ADDED
@@ -0,0 +1,293 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
|
4 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
5 |
+
|
6 |
+
# This code is ported from the following implementation written in Torch.
|
7 |
+
# https://github.com/chainer/chainer/blob/master/examples/ptb/train_ptb_custom_loop.py
|
8 |
+
|
9 |
+
import chainer
|
10 |
+
import h5py
|
11 |
+
import logging
|
12 |
+
import numpy as np
|
13 |
+
import os
|
14 |
+
import random
|
15 |
+
import six
|
16 |
+
from tqdm import tqdm
|
17 |
+
|
18 |
+
from chainer.training import extension
|
19 |
+
|
20 |
+
|
21 |
+
def load_dataset(path, label_dict, outdir=None):
|
22 |
+
"""Load and save HDF5 that contains a dataset and stats for LM
|
23 |
+
|
24 |
+
Args:
|
25 |
+
path (str): The path of an input text dataset file
|
26 |
+
label_dict (dict[str, int]):
|
27 |
+
dictionary that maps token label string to its ID number
|
28 |
+
outdir (str): The path of an output dir
|
29 |
+
|
30 |
+
Returns:
|
31 |
+
tuple[list[np.ndarray], int, int]: Tuple of
|
32 |
+
token IDs in np.int32 converted by `read_tokens`
|
33 |
+
the number of tokens by `count_tokens`,
|
34 |
+
and the number of OOVs by `count_tokens`
|
35 |
+
"""
|
36 |
+
if outdir is not None:
|
37 |
+
os.makedirs(outdir, exist_ok=True)
|
38 |
+
filename = outdir + "/" + os.path.basename(path) + ".h5"
|
39 |
+
if os.path.exists(filename):
|
40 |
+
logging.info(f"loading binary dataset: {filename}")
|
41 |
+
f = h5py.File(filename, "r")
|
42 |
+
return f["data"][:], f["n_tokens"][()], f["n_oovs"][()]
|
43 |
+
else:
|
44 |
+
logging.info("skip dump/load HDF5 because the output dir is not specified")
|
45 |
+
logging.info(f"reading text dataset: {path}")
|
46 |
+
ret = read_tokens(path, label_dict)
|
47 |
+
n_tokens, n_oovs = count_tokens(ret, label_dict["<unk>"])
|
48 |
+
if outdir is not None:
|
49 |
+
logging.info(f"saving binary dataset: {filename}")
|
50 |
+
with h5py.File(filename, "w") as f:
|
51 |
+
# http://docs.h5py.org/en/stable/special.html#arbitrary-vlen-data
|
52 |
+
data = f.create_dataset(
|
53 |
+
"data", (len(ret),), dtype=h5py.special_dtype(vlen=np.int32)
|
54 |
+
)
|
55 |
+
data[:] = ret
|
56 |
+
f["n_tokens"] = n_tokens
|
57 |
+
f["n_oovs"] = n_oovs
|
58 |
+
return ret, n_tokens, n_oovs
|
59 |
+
|
60 |
+
|
61 |
+
def read_tokens(filename, label_dict):
|
62 |
+
"""Read tokens as a sequence of sentences
|
63 |
+
|
64 |
+
:param str filename : The name of the input file
|
65 |
+
:param dict label_dict : dictionary that maps token label string to its ID number
|
66 |
+
:return list of ID sequences
|
67 |
+
:rtype list
|
68 |
+
"""
|
69 |
+
|
70 |
+
data = []
|
71 |
+
unk = label_dict["<unk>"]
|
72 |
+
for ln in tqdm(open(filename, "r", encoding="utf-8")):
|
73 |
+
data.append(
|
74 |
+
np.array(
|
75 |
+
[label_dict.get(label, unk) for label in ln.split()], dtype=np.int32
|
76 |
+
)
|
77 |
+
)
|
78 |
+
return data
|
79 |
+
|
80 |
+
|
81 |
+
def count_tokens(data, unk_id=None):
|
82 |
+
"""Count tokens and oovs in token ID sequences.
|
83 |
+
|
84 |
+
Args:
|
85 |
+
data (list[np.ndarray]): list of token ID sequences
|
86 |
+
unk_id (int): ID of unknown token
|
87 |
+
|
88 |
+
Returns:
|
89 |
+
tuple: tuple of number of token occurrences and number of oov tokens
|
90 |
+
|
91 |
+
"""
|
92 |
+
|
93 |
+
n_tokens = 0
|
94 |
+
n_oovs = 0
|
95 |
+
for sentence in data:
|
96 |
+
n_tokens += len(sentence)
|
97 |
+
if unk_id is not None:
|
98 |
+
n_oovs += np.count_nonzero(sentence == unk_id)
|
99 |
+
return n_tokens, n_oovs
|
100 |
+
|
101 |
+
|
102 |
+
def compute_perplexity(result):
|
103 |
+
"""Computes and add the perplexity to the LogReport
|
104 |
+
|
105 |
+
:param dict result: The current observations
|
106 |
+
"""
|
107 |
+
# Routine to rewrite the result dictionary of LogReport to add perplexity values
|
108 |
+
result["perplexity"] = np.exp(result["main/loss"] / result["main/count"])
|
109 |
+
if "validation/main/loss" in result:
|
110 |
+
result["val_perplexity"] = np.exp(result["validation/main/loss"])
|
111 |
+
|
112 |
+
|
113 |
+
class ParallelSentenceIterator(chainer.dataset.Iterator):
|
114 |
+
"""Dataset iterator to create a batch of sentences.
|
115 |
+
|
116 |
+
This iterator returns a pair of sentences, where one token is shifted
|
117 |
+
between the sentences like '<sos> w1 w2 w3' and 'w1 w2 w3 <eos>'
|
118 |
+
Sentence batches are made in order of longer sentences, and then
|
119 |
+
randomly shuffled.
|
120 |
+
"""
|
121 |
+
|
122 |
+
def __init__(
|
123 |
+
self, dataset, batch_size, max_length=0, sos=0, eos=0, repeat=True, shuffle=True
|
124 |
+
):
|
125 |
+
self.dataset = dataset
|
126 |
+
self.batch_size = batch_size # batch size
|
127 |
+
# Number of completed sweeps over the dataset. In this case, it is
|
128 |
+
# incremented if every word is visited at least once after the last
|
129 |
+
# increment.
|
130 |
+
self.epoch = 0
|
131 |
+
# True if the epoch is incremented at the last iteration.
|
132 |
+
self.is_new_epoch = False
|
133 |
+
self.repeat = repeat
|
134 |
+
length = len(dataset)
|
135 |
+
self.batch_indices = []
|
136 |
+
# make mini-batches
|
137 |
+
if batch_size > 1:
|
138 |
+
indices = sorted(range(len(dataset)), key=lambda i: -len(dataset[i]))
|
139 |
+
bs = 0
|
140 |
+
while bs < length:
|
141 |
+
be = min(bs + batch_size, length)
|
142 |
+
# batch size is automatically reduced if the sentence length
|
143 |
+
# is larger than max_length
|
144 |
+
if max_length > 0:
|
145 |
+
sent_length = len(dataset[indices[bs]])
|
146 |
+
be = min(
|
147 |
+
be, bs + max(batch_size // (sent_length // max_length + 1), 1)
|
148 |
+
)
|
149 |
+
self.batch_indices.append(np.array(indices[bs:be]))
|
150 |
+
bs = be
|
151 |
+
if shuffle:
|
152 |
+
# shuffle batches
|
153 |
+
random.shuffle(self.batch_indices)
|
154 |
+
else:
|
155 |
+
self.batch_indices = [np.array([i]) for i in six.moves.range(length)]
|
156 |
+
|
157 |
+
# NOTE: this is not a count of parameter updates. It is just a count of
|
158 |
+
# calls of ``__next__``.
|
159 |
+
self.iteration = 0
|
160 |
+
self.sos = sos
|
161 |
+
self.eos = eos
|
162 |
+
# use -1 instead of None internally
|
163 |
+
self._previous_epoch_detail = -1.0
|
164 |
+
|
165 |
+
def __next__(self):
|
166 |
+
# This iterator returns a list representing a mini-batch. Each item
|
167 |
+
# indicates a sentence pair like '<sos> w1 w2 w3' and 'w1 w2 w3 <eos>'
|
168 |
+
# represented by token IDs.
|
169 |
+
n_batches = len(self.batch_indices)
|
170 |
+
if not self.repeat and self.iteration >= n_batches:
|
171 |
+
# If not self.repeat, this iterator stops at the end of the first
|
172 |
+
# epoch (i.e., when all words are visited once).
|
173 |
+
raise StopIteration
|
174 |
+
|
175 |
+
batch = []
|
176 |
+
for idx in self.batch_indices[self.iteration % n_batches]:
|
177 |
+
batch.append(
|
178 |
+
(
|
179 |
+
np.append([self.sos], self.dataset[idx]),
|
180 |
+
np.append(self.dataset[idx], [self.eos]),
|
181 |
+
)
|
182 |
+
)
|
183 |
+
|
184 |
+
self._previous_epoch_detail = self.epoch_detail
|
185 |
+
self.iteration += 1
|
186 |
+
|
187 |
+
epoch = self.iteration // n_batches
|
188 |
+
self.is_new_epoch = self.epoch < epoch
|
189 |
+
if self.is_new_epoch:
|
190 |
+
self.epoch = epoch
|
191 |
+
|
192 |
+
return batch
|
193 |
+
|
194 |
+
def start_shuffle(self):
|
195 |
+
random.shuffle(self.batch_indices)
|
196 |
+
|
197 |
+
@property
|
198 |
+
def epoch_detail(self):
|
199 |
+
# Floating point version of epoch.
|
200 |
+
return self.iteration / len(self.batch_indices)
|
201 |
+
|
202 |
+
@property
|
203 |
+
def previous_epoch_detail(self):
|
204 |
+
if self._previous_epoch_detail < 0:
|
205 |
+
return None
|
206 |
+
return self._previous_epoch_detail
|
207 |
+
|
208 |
+
def serialize(self, serializer):
|
209 |
+
# It is important to serialize the state to be recovered on resume.
|
210 |
+
self.iteration = serializer("iteration", self.iteration)
|
211 |
+
self.epoch = serializer("epoch", self.epoch)
|
212 |
+
try:
|
213 |
+
self._previous_epoch_detail = serializer(
|
214 |
+
"previous_epoch_detail", self._previous_epoch_detail
|
215 |
+
)
|
216 |
+
except KeyError:
|
217 |
+
# guess previous_epoch_detail for older version
|
218 |
+
self._previous_epoch_detail = self.epoch + (
|
219 |
+
self.current_position - 1
|
220 |
+
) / len(self.batch_indices)
|
221 |
+
if self.epoch_detail > 0:
|
222 |
+
self._previous_epoch_detail = max(self._previous_epoch_detail, 0.0)
|
223 |
+
else:
|
224 |
+
self._previous_epoch_detail = -1.0
|
225 |
+
|
226 |
+
|
227 |
+
class MakeSymlinkToBestModel(extension.Extension):
|
228 |
+
"""Extension that makes a symbolic link to the best model
|
229 |
+
|
230 |
+
:param str key: Key of value
|
231 |
+
:param str prefix: Prefix of model files and link target
|
232 |
+
:param str suffix: Suffix of link target
|
233 |
+
"""
|
234 |
+
|
235 |
+
def __init__(self, key, prefix="model", suffix="best"):
|
236 |
+
super(MakeSymlinkToBestModel, self).__init__()
|
237 |
+
self.best_model = -1
|
238 |
+
self.min_loss = 0.0
|
239 |
+
self.key = key
|
240 |
+
self.prefix = prefix
|
241 |
+
self.suffix = suffix
|
242 |
+
|
243 |
+
def __call__(self, trainer):
|
244 |
+
observation = trainer.observation
|
245 |
+
if self.key in observation:
|
246 |
+
loss = observation[self.key]
|
247 |
+
if self.best_model == -1 or loss < self.min_loss:
|
248 |
+
self.min_loss = loss
|
249 |
+
self.best_model = trainer.updater.epoch
|
250 |
+
src = "%s.%d" % (self.prefix, self.best_model)
|
251 |
+
dest = os.path.join(trainer.out, "%s.%s" % (self.prefix, self.suffix))
|
252 |
+
if os.path.lexists(dest):
|
253 |
+
os.remove(dest)
|
254 |
+
os.symlink(src, dest)
|
255 |
+
logging.info("best model is " + src)
|
256 |
+
|
257 |
+
def serialize(self, serializer):
|
258 |
+
if isinstance(serializer, chainer.serializer.Serializer):
|
259 |
+
serializer("_best_model", self.best_model)
|
260 |
+
serializer("_min_loss", self.min_loss)
|
261 |
+
serializer("_key", self.key)
|
262 |
+
serializer("_prefix", self.prefix)
|
263 |
+
serializer("_suffix", self.suffix)
|
264 |
+
else:
|
265 |
+
self.best_model = serializer("_best_model", -1)
|
266 |
+
self.min_loss = serializer("_min_loss", 0.0)
|
267 |
+
self.key = serializer("_key", "")
|
268 |
+
self.prefix = serializer("_prefix", "model")
|
269 |
+
self.suffix = serializer("_suffix", "best")
|
270 |
+
|
271 |
+
|
272 |
+
# TODO(Hori): currently it only works with character-word level LM.
|
273 |
+
# need to consider any types of subwords-to-word mapping.
|
274 |
+
def make_lexical_tree(word_dict, subword_dict, word_unk):
|
275 |
+
"""Make a lexical tree to compute word-level probabilities"""
|
276 |
+
# node [dict(subword_id -> node), word_id, word_set[start-1, end]]
|
277 |
+
root = [{}, -1, None]
|
278 |
+
for w, wid in word_dict.items():
|
279 |
+
if wid > 0 and wid != word_unk: # skip <blank> and <unk>
|
280 |
+
if True in [c not in subword_dict for c in w]: # skip unknown subword
|
281 |
+
continue
|
282 |
+
succ = root[0] # get successors from root node
|
283 |
+
for i, c in enumerate(w):
|
284 |
+
cid = subword_dict[c]
|
285 |
+
if cid not in succ: # if next node does not exist, make a new node
|
286 |
+
succ[cid] = [{}, -1, (wid - 1, wid)]
|
287 |
+
else:
|
288 |
+
prev = succ[cid][2]
|
289 |
+
succ[cid][2] = (min(prev[0], wid - 1), max(prev[1], wid))
|
290 |
+
if i == len(w) - 1: # if word end, set word id
|
291 |
+
succ[cid][1] = wid
|
292 |
+
succ = succ[cid][0] # move to the child successors
|
293 |
+
return root
|
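As a usage sketch of the ParallelSentenceIterator defined above (toy token IDs, not from an actual recipe, assuming the espnet package in this commit is importable), each sentence is paired with a one-token-shifted copy of itself:

import numpy as np
from espnet.lm.lm_utils import ParallelSentenceIterator

# toy token-ID sentences; ID 0 plays the role of both <sos> and <eos> here
sentences = [np.array([5, 6, 7], dtype=np.int32), np.array([8, 9], dtype=np.int32)]
it = ParallelSentenceIterator(sentences, batch_size=2, sos=0, eos=0, repeat=False)

for batch in it:
    for x, t in batch:
        # x = <sos> w1 w2 ..., t = w1 w2 ... <eos>
        print(x, "->", t)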
espnet/lm/pytorch_backend/__init__.py
ADDED
@@ -0,0 +1 @@
1 |
+
"""Initialize sub package."""
|
espnet/lm/pytorch_backend/extlm.py
ADDED
@@ -0,0 +1,218 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
# Copyright 2018 Mitsubishi Electric Research Laboratories (Takaaki Hori)
|
4 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
5 |
+
|
6 |
+
|
7 |
+
import math
|
8 |
+
|
9 |
+
import torch
|
10 |
+
import torch.nn as nn
|
11 |
+
import torch.nn.functional as F
|
12 |
+
|
13 |
+
from espnet.lm.lm_utils import make_lexical_tree
|
14 |
+
from espnet.nets.pytorch_backend.nets_utils import to_device
|
15 |
+
|
16 |
+
|
17 |
+
# Definition of a multi-level (subword/word) language model
|
18 |
+
class MultiLevelLM(nn.Module):
|
19 |
+
logzero = -10000000000.0
|
20 |
+
zero = 1.0e-10
|
21 |
+
|
22 |
+
def __init__(
|
23 |
+
self,
|
24 |
+
wordlm,
|
25 |
+
subwordlm,
|
26 |
+
word_dict,
|
27 |
+
subword_dict,
|
28 |
+
subwordlm_weight=0.8,
|
29 |
+
oov_penalty=1.0,
|
30 |
+
open_vocab=True,
|
31 |
+
):
|
32 |
+
super(MultiLevelLM, self).__init__()
|
33 |
+
self.wordlm = wordlm
|
34 |
+
self.subwordlm = subwordlm
|
35 |
+
self.word_eos = word_dict["<eos>"]
|
36 |
+
self.word_unk = word_dict["<unk>"]
|
37 |
+
self.var_word_eos = torch.LongTensor([self.word_eos])
|
38 |
+
self.var_word_unk = torch.LongTensor([self.word_unk])
|
39 |
+
self.space = subword_dict["<space>"]
|
40 |
+
self.eos = subword_dict["<eos>"]
|
41 |
+
self.lexroot = make_lexical_tree(word_dict, subword_dict, self.word_unk)
|
42 |
+
self.log_oov_penalty = math.log(oov_penalty)
|
43 |
+
self.open_vocab = open_vocab
|
44 |
+
self.subword_dict_size = len(subword_dict)
|
45 |
+
self.subwordlm_weight = subwordlm_weight
|
46 |
+
self.normalized = True
|
47 |
+
|
48 |
+
def forward(self, state, x):
|
49 |
+
# update state with input label x
|
50 |
+
if state is None: # make initial states and log-prob vectors
|
51 |
+
self.var_word_eos = to_device(x, self.var_word_eos)
|
52 |
+
self.var_word_unk = to_device(x, self.var_word_unk)
|
53 |
+
wlm_state, z_wlm = self.wordlm(None, self.var_word_eos)
|
54 |
+
wlm_logprobs = F.log_softmax(z_wlm, dim=1)
|
55 |
+
clm_state, z_clm = self.subwordlm(None, x)
|
56 |
+
log_y = F.log_softmax(z_clm, dim=1) * self.subwordlm_weight
|
57 |
+
new_node = self.lexroot
|
58 |
+
clm_logprob = 0.0
|
59 |
+
xi = self.space
|
60 |
+
else:
|
61 |
+
clm_state, wlm_state, wlm_logprobs, node, log_y, clm_logprob = state
|
62 |
+
xi = int(x)
|
63 |
+
if xi == self.space: # inter-word transition
|
64 |
+
if node is not None and node[1] >= 0: # check if the node is word end
|
65 |
+
w = to_device(x, torch.LongTensor([node[1]]))
|
66 |
+
else: # this node is not a word end, which means <unk>
|
67 |
+
w = self.var_word_unk
|
68 |
+
# update wordlm state and log-prob vector
|
69 |
+
wlm_state, z_wlm = self.wordlm(wlm_state, w)
|
70 |
+
wlm_logprobs = F.log_softmax(z_wlm, dim=1)
|
71 |
+
new_node = self.lexroot # move to the tree root
|
72 |
+
clm_logprob = 0.0
|
73 |
+
elif node is not None and xi in node[0]: # intra-word transition
|
74 |
+
new_node = node[0][xi]
|
75 |
+
clm_logprob += log_y[0, xi]
|
76 |
+
elif self.open_vocab: # if no path in the tree, enter open-vocabulary mode
|
77 |
+
new_node = None
|
78 |
+
clm_logprob += log_y[0, xi]
|
79 |
+
else: # if open_vocab flag is disabled, return 0 probabilities
|
80 |
+
log_y = to_device(
|
81 |
+
x, torch.full((1, self.subword_dict_size), self.logzero)
|
82 |
+
)
|
83 |
+
return (clm_state, wlm_state, wlm_logprobs, None, log_y, 0.0), log_y
|
84 |
+
|
85 |
+
clm_state, z_clm = self.subwordlm(clm_state, x)
|
86 |
+
log_y = F.log_softmax(z_clm, dim=1) * self.subwordlm_weight
|
87 |
+
|
88 |
+
# apply word-level probabilities for <space> and <eos> labels
|
89 |
+
if xi != self.space:
|
90 |
+
if new_node is not None and new_node[1] >= 0: # if new node is word end
|
91 |
+
wlm_logprob = wlm_logprobs[:, new_node[1]] - clm_logprob
|
92 |
+
else:
|
93 |
+
wlm_logprob = wlm_logprobs[:, self.word_unk] + self.log_oov_penalty
|
94 |
+
log_y[:, self.space] = wlm_logprob
|
95 |
+
log_y[:, self.eos] = wlm_logprob
|
96 |
+
else:
|
97 |
+
log_y[:, self.space] = self.logzero
|
98 |
+
log_y[:, self.eos] = self.logzero
|
99 |
+
|
100 |
+
return (
|
101 |
+
(clm_state, wlm_state, wlm_logprobs, new_node, log_y, float(clm_logprob)),
|
102 |
+
log_y,
|
103 |
+
)
|
104 |
+
|
105 |
+
def final(self, state):
|
106 |
+
clm_state, wlm_state, wlm_logprobs, node, log_y, clm_logprob = state
|
107 |
+
if node is not None and node[1] >= 0: # check if the node is word end
|
108 |
+
w = to_device(wlm_logprobs, torch.LongTensor([node[1]]))
|
109 |
+
else: # this node is not a word end, which means <unk>
|
110 |
+
w = self.var_word_unk
|
111 |
+
wlm_state, z_wlm = self.wordlm(wlm_state, w)
|
112 |
+
return float(F.log_softmax(z_wlm, dim=1)[:, self.word_eos])
|
113 |
+
|
114 |
+
|
115 |
+
# Definition of a look-ahead word language model
|
116 |
+
class LookAheadWordLM(nn.Module):
|
117 |
+
logzero = -10000000000.0
|
118 |
+
zero = 1.0e-10
|
119 |
+
|
120 |
+
def __init__(
|
121 |
+
self, wordlm, word_dict, subword_dict, oov_penalty=0.0001, open_vocab=True
|
122 |
+
):
|
123 |
+
super(LookAheadWordLM, self).__init__()
|
124 |
+
self.wordlm = wordlm
|
125 |
+
self.word_eos = word_dict["<eos>"]
|
126 |
+
self.word_unk = word_dict["<unk>"]
|
127 |
+
self.var_word_eos = torch.LongTensor([self.word_eos])
|
128 |
+
self.var_word_unk = torch.LongTensor([self.word_unk])
|
129 |
+
self.space = subword_dict["<space>"]
|
130 |
+
self.eos = subword_dict["<eos>"]
|
131 |
+
self.lexroot = make_lexical_tree(word_dict, subword_dict, self.word_unk)
|
132 |
+
self.oov_penalty = oov_penalty
|
133 |
+
self.open_vocab = open_vocab
|
134 |
+
self.subword_dict_size = len(subword_dict)
|
135 |
+
self.zero_tensor = torch.FloatTensor([self.zero])
|
136 |
+
self.normalized = True
|
137 |
+
|
138 |
+
def forward(self, state, x):
|
139 |
+
# update state with input label x
|
140 |
+
if state is None: # make initial states and cumulative probability vector
|
141 |
+
self.var_word_eos = to_device(x, self.var_word_eos)
|
142 |
+
self.var_word_unk = to_device(x, self.var_word_unk)
|
143 |
+
self.zero_tensor = to_device(x, self.zero_tensor)
|
144 |
+
wlm_state, z_wlm = self.wordlm(None, self.var_word_eos)
|
145 |
+
cumsum_probs = torch.cumsum(F.softmax(z_wlm, dim=1), dim=1)
|
146 |
+
new_node = self.lexroot
|
147 |
+
xi = self.space
|
148 |
+
else:
|
149 |
+
wlm_state, cumsum_probs, node = state
|
150 |
+
xi = int(x)
|
151 |
+
if xi == self.space: # inter-word transition
|
152 |
+
if node is not None and node[1] >= 0: # check if the node is word end
|
153 |
+
w = to_device(x, torch.LongTensor([node[1]]))
|
154 |
+
else: # this node is not a word end, which means <unk>
|
155 |
+
w = self.var_word_unk
|
156 |
+
# update wordlm state and cumulative probability vector
|
157 |
+
wlm_state, z_wlm = self.wordlm(wlm_state, w)
|
158 |
+
cumsum_probs = torch.cumsum(F.softmax(z_wlm, dim=1), dim=1)
|
159 |
+
new_node = self.lexroot # move to the tree root
|
160 |
+
elif node is not None and xi in node[0]: # intra-word transition
|
161 |
+
new_node = node[0][xi]
|
162 |
+
elif self.open_vocab: # if no path in the tree, enter open-vocabulary mode
|
163 |
+
new_node = None
|
164 |
+
else: # if open_vocab flag is disabled, return 0 probabilities
|
165 |
+
log_y = to_device(
|
166 |
+
x, torch.full((1, self.subword_dict_size), self.logzero)
|
167 |
+
)
|
168 |
+
return (wlm_state, None, None), log_y
|
169 |
+
|
170 |
+
if new_node is not None:
|
171 |
+
succ, wid, wids = new_node
|
172 |
+
# compute parent node probability
|
173 |
+
sum_prob = (
|
174 |
+
(cumsum_probs[:, wids[1]] - cumsum_probs[:, wids[0]])
|
175 |
+
if wids is not None
|
176 |
+
else 1.0
|
177 |
+
)
|
178 |
+
if sum_prob < self.zero:
|
179 |
+
log_y = to_device(
|
180 |
+
x, torch.full((1, self.subword_dict_size), self.logzero)
|
181 |
+
)
|
182 |
+
return (wlm_state, cumsum_probs, new_node), log_y
|
183 |
+
# set <unk> probability as a default value
|
184 |
+
unk_prob = (
|
185 |
+
cumsum_probs[:, self.word_unk] - cumsum_probs[:, self.word_unk - 1]
|
186 |
+
)
|
187 |
+
y = to_device(
|
188 |
+
x,
|
189 |
+
torch.full(
|
190 |
+
(1, self.subword_dict_size), float(unk_prob) * self.oov_penalty
|
191 |
+
),
|
192 |
+
)
|
193 |
+
# compute transition probabilities to child nodes
|
194 |
+
for cid, nd in succ.items():
|
195 |
+
y[:, cid] = (
|
196 |
+
cumsum_probs[:, nd[2][1]] - cumsum_probs[:, nd[2][0]]
|
197 |
+
) / sum_prob
|
198 |
+
# apply word-level probabilities for <space> and <eos> labels
|
199 |
+
if wid >= 0:
|
200 |
+
wlm_prob = (cumsum_probs[:, wid] - cumsum_probs[:, wid - 1]) / sum_prob
|
201 |
+
y[:, self.space] = wlm_prob
|
202 |
+
y[:, self.eos] = wlm_prob
|
203 |
+
elif xi == self.space:
|
204 |
+
y[:, self.space] = self.zero
|
205 |
+
y[:, self.eos] = self.zero
|
206 |
+
log_y = torch.log(torch.max(y, self.zero_tensor)) # clip to avoid log(0)
|
207 |
+
else: # if no path in the tree, transition probability is one
|
208 |
+
log_y = to_device(x, torch.zeros(1, self.subword_dict_size))
|
209 |
+
return (wlm_state, cumsum_probs, new_node), log_y
|
210 |
+
|
211 |
+
def final(self, state):
|
212 |
+
wlm_state, cumsum_probs, node = state
|
213 |
+
if node is not None and node[1] >= 0: # check if the node is word end
|
214 |
+
w = to_device(cumsum_probs, torch.LongTensor([node[1]]))
|
215 |
+
else: # this node is not a word end, which means <unk>
|
216 |
+
w = self.var_word_unk
|
217 |
+
wlm_state, z_wlm = self.wordlm(wlm_state, w)
|
218 |
+
return float(F.log_softmax(z_wlm, dim=1)[:, self.word_eos])
|
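Both language models above walk the lexical tree built by make_lexical_tree, where each node is [children, word_id, (start-1, end)] over the word-ID interval sharing that subword prefix. A small sketch with a toy vocabulary (IDs chosen only for illustration):

from espnet.lm.lm_utils import make_lexical_tree

# toy dictionaries; ID 0 is reserved (<blank>) and ID 1 is <unk>
word_dict = {"<blank>": 0, "<unk>": 1, "ab": 2, "ac": 3}
subword_dict = {"a": 0, "b": 1, "c": 2}

root = make_lexical_tree(word_dict, subword_dict, word_unk=1)
node_a = root[0][subword_dict["a"]]  # child node reached by the subword "a"
print(node_a[1])  # -1: "a" by itself is not a word in word_dict
print(node_a[2])  # (1, 3): word-ID interval covered by words starting with "a"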
espnet/lm/pytorch_backend/lm.py
ADDED
@@ -0,0 +1,410 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
|
3 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
4 |
+
# This code is ported from the following implementation written in Torch.
|
5 |
+
# https://github.com/chainer/chainer/blob/master/examples/ptb/train_ptb_custom_loop.py
|
6 |
+
|
7 |
+
"""LM training in pytorch."""
|
8 |
+
|
9 |
+
import copy
|
10 |
+
import json
|
11 |
+
import logging
|
12 |
+
import numpy as np
|
13 |
+
|
14 |
+
import torch
|
15 |
+
import torch.nn as nn
|
16 |
+
from torch.nn.parallel import data_parallel
|
17 |
+
|
18 |
+
from chainer import Chain
|
19 |
+
from chainer.dataset import convert
|
20 |
+
from chainer import reporter
|
21 |
+
from chainer import training
|
22 |
+
from chainer.training import extensions
|
23 |
+
|
24 |
+
from espnet.lm.lm_utils import count_tokens
|
25 |
+
from espnet.lm.lm_utils import load_dataset
|
26 |
+
from espnet.lm.lm_utils import MakeSymlinkToBestModel
|
27 |
+
from espnet.lm.lm_utils import ParallelSentenceIterator
|
28 |
+
from espnet.lm.lm_utils import read_tokens
|
29 |
+
from espnet.nets.lm_interface import dynamic_import_lm
|
30 |
+
from espnet.nets.lm_interface import LMInterface
|
31 |
+
from espnet.optimizer.factory import dynamic_import_optimizer
|
32 |
+
from espnet.scheduler.pytorch import PyTorchScheduler
|
33 |
+
from espnet.scheduler.scheduler import dynamic_import_scheduler
|
34 |
+
|
35 |
+
from espnet.asr.asr_utils import snapshot_object
|
36 |
+
from espnet.asr.asr_utils import torch_load
|
37 |
+
from espnet.asr.asr_utils import torch_resume
|
38 |
+
from espnet.asr.asr_utils import torch_snapshot
|
39 |
+
|
40 |
+
from espnet.utils.training.tensorboard_logger import TensorboardLogger
|
41 |
+
from tensorboardX import SummaryWriter
|
42 |
+
|
43 |
+
from espnet.utils.deterministic_utils import set_deterministic_pytorch
|
44 |
+
from espnet.utils.training.evaluator import BaseEvaluator
|
45 |
+
from espnet.utils.training.iterators import ShufflingEnabler
|
46 |
+
from espnet.utils.training.train_utils import check_early_stop
|
47 |
+
from espnet.utils.training.train_utils import set_early_stop
|
48 |
+
|
49 |
+
|
50 |
+
def compute_perplexity(result):
|
51 |
+
"""Compute and add the perplexity to the LogReport.
|
52 |
+
|
53 |
+
:param dict result: The current observations
|
54 |
+
"""
|
55 |
+
# Routine to rewrite the result dictionary of LogReport to add perplexity values
|
56 |
+
result["perplexity"] = np.exp(result["main/nll"] / result["main/count"])
|
57 |
+
if "validation/main/nll" in result:
|
58 |
+
result["val_perplexity"] = np.exp(
|
59 |
+
result["validation/main/nll"] / result["validation/main/count"]
|
60 |
+
)
|
61 |
+
|
62 |
+
|
63 |
+
class Reporter(Chain):
|
64 |
+
"""Dummy module to use chainer's trainer."""
|
65 |
+
|
66 |
+
def report(self, loss):
|
67 |
+
"""Report nothing."""
|
68 |
+
pass
|
69 |
+
|
70 |
+
|
71 |
+
def concat_examples(batch, device=None, padding=None):
|
72 |
+
"""Concat examples in minibatch.
|
73 |
+
|
74 |
+
:param np.ndarray batch: The batch to concatenate
|
75 |
+
:param int device: The device to send to
|
76 |
+
:param Tuple[int,int] padding: The padding to use
|
77 |
+
:return: (inputs, targets)
|
78 |
+
:rtype (torch.Tensor, torch.Tensor)
|
79 |
+
"""
|
80 |
+
x, t = convert.concat_examples(batch, padding=padding)
|
81 |
+
x = torch.from_numpy(x)
|
82 |
+
t = torch.from_numpy(t)
|
83 |
+
if device is not None and device >= 0:
|
84 |
+
x = x.cuda(device)
|
85 |
+
t = t.cuda(device)
|
86 |
+
return x, t
|
87 |
+
|
88 |
+
|
89 |
+
class BPTTUpdater(training.StandardUpdater):
|
90 |
+
"""An updater for a pytorch LM."""
|
91 |
+
|
92 |
+
def __init__(
|
93 |
+
self,
|
94 |
+
train_iter,
|
95 |
+
model,
|
96 |
+
optimizer,
|
97 |
+
schedulers,
|
98 |
+
device,
|
99 |
+
gradclip=None,
|
100 |
+
use_apex=False,
|
101 |
+
accum_grad=1,
|
102 |
+
):
|
103 |
+
"""Initialize class.
|
104 |
+
|
105 |
+
Args:
|
106 |
+
train_iter (chainer.dataset.Iterator): The train iterator
|
107 |
+
model (LMInterface) : The model to update
|
108 |
+
optimizer (torch.optim.Optimizer): The optimizer for training
|
109 |
+
schedulers (espnet.scheduler.scheduler.SchedulerInterface):
|
110 |
+
The schedulers of `optimizer`
|
111 |
+
device (int): The device id
|
112 |
+
gradclip (float): The gradient clipping value to use
|
113 |
+
use_apex (bool): The flag to use Apex in backprop.
|
114 |
+
accum_grad (int): The number of gradient accumulation.
|
115 |
+
|
116 |
+
"""
|
117 |
+
super(BPTTUpdater, self).__init__(train_iter, optimizer)
|
118 |
+
self.model = model
|
119 |
+
self.device = device
|
120 |
+
self.gradclip = gradclip
|
121 |
+
self.use_apex = use_apex
|
122 |
+
self.scheduler = PyTorchScheduler(schedulers, optimizer)
|
123 |
+
self.accum_grad = accum_grad
|
124 |
+
|
125 |
+
# The core part of the update routine can be customized by overriding.
|
126 |
+
def update_core(self):
|
127 |
+
"""Update the model."""
|
128 |
+
# When we pass one iterator and optimizer to StandardUpdater.__init__,
|
129 |
+
# they are automatically named 'main'.
|
130 |
+
train_iter = self.get_iterator("main")
|
131 |
+
optimizer = self.get_optimizer("main")
|
132 |
+
# Progress the dataset iterator for sentences at each iteration.
|
133 |
+
self.model.zero_grad() # Clear the parameter gradients
|
134 |
+
accum = {"loss": 0.0, "nll": 0.0, "count": 0}
|
135 |
+
for _ in range(self.accum_grad):
|
136 |
+
batch = train_iter.__next__()
|
137 |
+
# Concatenate the token IDs to matrices and send them to the device
|
138 |
+
# self.converter does this job
|
139 |
+
# (it is chainer.dataset.concat_examples by default)
|
140 |
+
x, t = concat_examples(batch, device=self.device[0], padding=(0, -100))
|
141 |
+
if self.device[0] == -1:
|
142 |
+
loss, nll, count = self.model(x, t)
|
143 |
+
else:
|
144 |
+
# apex does not support torch.nn.DataParallel
|
145 |
+
loss, nll, count = data_parallel(self.model, (x, t), self.device)
|
146 |
+
|
147 |
+
# backward
|
148 |
+
loss = loss.mean() / self.accum_grad
|
149 |
+
if self.use_apex:
|
150 |
+
from apex import amp
|
151 |
+
|
152 |
+
with amp.scale_loss(loss, optimizer) as scaled_loss:
|
153 |
+
scaled_loss.backward()
|
154 |
+
else:
|
155 |
+
loss.backward() # Backprop
|
156 |
+
# accumulate stats
|
157 |
+
accum["loss"] += float(loss)
|
158 |
+
accum["nll"] += float(nll.sum())
|
159 |
+
accum["count"] += int(count.sum())
|
160 |
+
|
161 |
+
for k, v in accum.items():
|
162 |
+
reporter.report({k: v}, optimizer.target)
|
163 |
+
if self.gradclip is not None:
|
164 |
+
nn.utils.clip_grad_norm_(self.model.parameters(), self.gradclip)
|
165 |
+
optimizer.step() # Update the parameters
|
166 |
+
self.scheduler.step(n_iter=self.iteration)
|
167 |
+
|
168 |
+
|
169 |
+
class LMEvaluator(BaseEvaluator):
|
170 |
+
"""A custom evaluator for a pytorch LM."""
|
171 |
+
|
172 |
+
def __init__(self, val_iter, eval_model, reporter, device):
|
173 |
+
"""Initialize class.
|
174 |
+
|
175 |
+
:param chainer.dataset.Iterator val_iter : The validation iterator
|
176 |
+
:param LMInterface eval_model : The model to evaluate
|
177 |
+
:param chainer.Reporter reporter : The observations reporter
|
178 |
+
:param int device : The device id to use
|
179 |
+
|
180 |
+
"""
|
181 |
+
super(LMEvaluator, self).__init__(val_iter, reporter, device=-1)
|
182 |
+
self.model = eval_model
|
183 |
+
self.device = device
|
184 |
+
|
185 |
+
def evaluate(self):
|
186 |
+
"""Evaluate the model."""
|
187 |
+
val_iter = self.get_iterator("main")
|
188 |
+
loss = 0
|
189 |
+
nll = 0
|
190 |
+
count = 0
|
191 |
+
self.model.eval()
|
192 |
+
with torch.no_grad():
|
193 |
+
for batch in copy.copy(val_iter):
|
194 |
+
x, t = concat_examples(batch, device=self.device[0], padding=(0, -100))
|
195 |
+
if self.device[0] == -1:
|
196 |
+
l, n, c = self.model(x, t)
|
197 |
+
else:
|
198 |
+
# apex does not support torch.nn.DataParallel
|
199 |
+
l, n, c = data_parallel(self.model, (x, t), self.device)
|
200 |
+
loss += float(l.sum())
|
201 |
+
nll += float(n.sum())
|
202 |
+
count += int(c.sum())
|
203 |
+
self.model.train()
|
204 |
+
# report validation loss
|
205 |
+
observation = {}
|
206 |
+
with reporter.report_scope(observation):
|
207 |
+
reporter.report({"loss": loss}, self.model.reporter)
|
208 |
+
reporter.report({"nll": nll}, self.model.reporter)
|
209 |
+
reporter.report({"count": count}, self.model.reporter)
|
210 |
+
return observation
|
211 |
+
|
212 |
+
|
213 |
+
def train(args):
|
214 |
+
"""Train with the given args.
|
215 |
+
|
216 |
+
:param Namespace args: The program arguments
|
217 |
+
:param type model_class: LMInterface class for training
|
218 |
+
"""
|
219 |
+
model_class = dynamic_import_lm(args.model_module, args.backend)
|
220 |
+
assert issubclass(model_class, LMInterface), "model should implement LMInterface"
|
221 |
+
# display torch version
|
222 |
+
logging.info("torch version = " + torch.__version__)
|
223 |
+
|
224 |
+
set_deterministic_pytorch(args)
|
225 |
+
|
226 |
+
# check cuda and cudnn availability
|
227 |
+
if not torch.cuda.is_available():
|
228 |
+
logging.warning("cuda is not available")
|
229 |
+
|
230 |
+
# get special label ids
|
231 |
+
unk = args.char_list_dict["<unk>"]
|
232 |
+
eos = args.char_list_dict["<eos>"]
|
233 |
+
# read tokens as a sequence of sentences
|
234 |
+
val, n_val_tokens, n_val_oovs = load_dataset(
|
235 |
+
args.valid_label, args.char_list_dict, args.dump_hdf5_path
|
236 |
+
)
|
237 |
+
train, n_train_tokens, n_train_oovs = load_dataset(
|
238 |
+
args.train_label, args.char_list_dict, args.dump_hdf5_path
|
239 |
+
)
|
240 |
+
logging.info("#vocab = " + str(args.n_vocab))
|
241 |
+
logging.info("#sentences in the training data = " + str(len(train)))
|
242 |
+
logging.info("#tokens in the training data = " + str(n_train_tokens))
|
243 |
+
logging.info(
|
244 |
+
"oov rate in the training data = %.2f %%"
|
245 |
+
% (n_train_oovs / n_train_tokens * 100)
|
246 |
+
)
|
247 |
+
logging.info("#sentences in the validation data = " + str(len(val)))
|
248 |
+
logging.info("#tokens in the validation data = " + str(n_val_tokens))
|
249 |
+
logging.info(
|
250 |
+
"oov rate in the validation data = %.2f %%" % (n_val_oovs / n_val_tokens * 100)
|
251 |
+
)
|
252 |
+
|
253 |
+
use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
|
254 |
+
# Create the dataset iterators
|
255 |
+
batch_size = args.batchsize * max(args.ngpu, 1)
|
256 |
+
if batch_size * args.accum_grad > args.batchsize:
|
257 |
+
logging.info(
|
258 |
+
f"batch size is automatically increased "
|
259 |
+
f"({args.batchsize} -> {batch_size * args.accum_grad})"
|
260 |
+
)
|
261 |
+
train_iter = ParallelSentenceIterator(
|
262 |
+
train,
|
263 |
+
batch_size,
|
264 |
+
max_length=args.maxlen,
|
265 |
+
sos=eos,
|
266 |
+
eos=eos,
|
267 |
+
shuffle=not use_sortagrad,
|
268 |
+
)
|
269 |
+
val_iter = ParallelSentenceIterator(
|
270 |
+
val, batch_size, max_length=args.maxlen, sos=eos, eos=eos, repeat=False
|
271 |
+
)
|
272 |
+
epoch_iters = int(len(train_iter.batch_indices) / args.accum_grad)
|
273 |
+
logging.info("#iterations per epoch = %d" % epoch_iters)
|
274 |
+
logging.info("#total iterations = " + str(args.epoch * epoch_iters))
|
275 |
+
# Prepare an RNNLM model
|
276 |
+
if args.train_dtype in ("float16", "float32", "float64"):
|
277 |
+
dtype = getattr(torch, args.train_dtype)
|
278 |
+
else:
|
279 |
+
dtype = torch.float32
|
280 |
+
model = model_class(args.n_vocab, args).to(dtype=dtype)
|
281 |
+
if args.ngpu > 0:
|
282 |
+
model.to("cuda")
|
283 |
+
gpu_id = list(range(args.ngpu))
|
284 |
+
else:
|
285 |
+
gpu_id = [-1]
|
286 |
+
|
287 |
+
# Save model conf to json
|
288 |
+
model_conf = args.outdir + "/model.json"
|
289 |
+
with open(model_conf, "wb") as f:
|
290 |
+
logging.info("writing a model config file to " + model_conf)
|
291 |
+
f.write(
|
292 |
+
json.dumps(vars(args), indent=4, ensure_ascii=False, sort_keys=True).encode(
|
293 |
+
"utf_8"
|
294 |
+
)
|
295 |
+
)
|
296 |
+
|
297 |
+
logging.warning(
|
298 |
+
"num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
|
299 |
+
sum(p.numel() for p in model.parameters()),
|
300 |
+
sum(p.numel() for p in model.parameters() if p.requires_grad),
|
301 |
+
sum(p.numel() for p in model.parameters() if p.requires_grad)
|
302 |
+
* 100.0
|
303 |
+
/ sum(p.numel() for p in model.parameters()),
|
304 |
+
)
|
305 |
+
)
|
306 |
+
|
307 |
+
# Set up an optimizer
|
308 |
+
opt_class = dynamic_import_optimizer(args.opt, args.backend)
|
309 |
+
optimizer = opt_class.from_args(model.parameters(), args)
|
310 |
+
if args.schedulers is None:
|
311 |
+
schedulers = []
|
312 |
+
else:
|
313 |
+
schedulers = [dynamic_import_scheduler(v)(k, args) for k, v in args.schedulers]
|
314 |
+
|
315 |
+
# setup apex.amp
|
316 |
+
if args.train_dtype in ("O0", "O1", "O2", "O3"):
|
317 |
+
try:
|
318 |
+
from apex import amp
|
319 |
+
except ImportError as e:
|
320 |
+
logging.error(
|
321 |
+
f"You need to install apex for --train-dtype {args.train_dtype}. "
|
322 |
+
"See https://github.com/NVIDIA/apex#linux"
|
323 |
+
)
|
324 |
+
raise e
|
325 |
+
model, optimizer = amp.initialize(model, optimizer, opt_level=args.train_dtype)
|
326 |
+
use_apex = True
|
327 |
+
else:
|
328 |
+
use_apex = False
|
329 |
+
|
330 |
+
# FIXME: TOO DIRTY HACK
|
331 |
+
reporter = Reporter()
|
332 |
+
setattr(model, "reporter", reporter)
|
333 |
+
setattr(optimizer, "target", reporter)
|
334 |
+
setattr(optimizer, "serialize", lambda s: reporter.serialize(s))
|
335 |
+
|
336 |
+
updater = BPTTUpdater(
|
337 |
+
train_iter,
|
338 |
+
model,
|
339 |
+
optimizer,
|
340 |
+
schedulers,
|
341 |
+
gpu_id,
|
342 |
+
gradclip=args.gradclip,
|
343 |
+
use_apex=use_apex,
|
344 |
+
accum_grad=args.accum_grad,
|
345 |
+
)
|
346 |
+
trainer = training.Trainer(updater, (args.epoch, "epoch"), out=args.outdir)
|
347 |
+
trainer.extend(LMEvaluator(val_iter, model, reporter, device=gpu_id))
|
348 |
+
trainer.extend(
|
349 |
+
extensions.LogReport(
|
350 |
+
postprocess=compute_perplexity,
|
351 |
+
trigger=(args.report_interval_iters, "iteration"),
|
352 |
+
)
|
353 |
+
)
|
354 |
+
trainer.extend(
|
355 |
+
extensions.PrintReport(
|
356 |
+
[
|
357 |
+
"epoch",
|
358 |
+
"iteration",
|
359 |
+
"main/loss",
|
360 |
+
"perplexity",
|
361 |
+
"val_perplexity",
|
362 |
+
"elapsed_time",
|
363 |
+
]
|
364 |
+
),
|
365 |
+
trigger=(args.report_interval_iters, "iteration"),
|
366 |
+
)
|
367 |
+
trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
|
368 |
+
# Save best models
|
369 |
+
trainer.extend(torch_snapshot(filename="snapshot.ep.{.updater.epoch}"))
|
370 |
+
trainer.extend(snapshot_object(model, "rnnlm.model.{.updater.epoch}"))
|
371 |
+
# T.Hori: MinValueTrigger should be used, but it fails when resuming
|
372 |
+
trainer.extend(MakeSymlinkToBestModel("validation/main/loss", "rnnlm.model"))
|
373 |
+
|
374 |
+
if use_sortagrad:
|
375 |
+
trainer.extend(
|
376 |
+
ShufflingEnabler([train_iter]),
|
377 |
+
trigger=(args.sortagrad if args.sortagrad != -1 else args.epoch, "epoch"),
|
378 |
+
)
|
379 |
+
if args.resume:
|
380 |
+
logging.info("resumed from %s" % args.resume)
|
381 |
+
torch_resume(args.resume, trainer)
|
382 |
+
|
383 |
+
set_early_stop(trainer, args, is_lm=True)
|
384 |
+
if args.tensorboard_dir is not None and args.tensorboard_dir != "":
|
385 |
+
writer = SummaryWriter(args.tensorboard_dir)
|
386 |
+
trainer.extend(
|
387 |
+
TensorboardLogger(writer), trigger=(args.report_interval_iters, "iteration")
|
388 |
+
)
|
389 |
+
|
390 |
+
trainer.run()
|
391 |
+
check_early_stop(trainer, args.epoch)
|
392 |
+
|
393 |
+
# compute perplexity for test set
|
394 |
+
if args.test_label:
|
395 |
+
logging.info("test the best model")
|
396 |
+
torch_load(args.outdir + "/rnnlm.model.best", model)
|
397 |
+
test = read_tokens(args.test_label, args.char_list_dict)
|
398 |
+
n_test_tokens, n_test_oovs = count_tokens(test, unk)
|
399 |
+
logging.info("#sentences in the test data = " + str(len(test)))
|
400 |
+
logging.info("#tokens in the test data = " + str(n_test_tokens))
|
401 |
+
logging.info(
|
402 |
+
"oov rate in the test data = %.2f %%" % (n_test_oovs / n_test_tokens * 100)
|
403 |
+
)
|
404 |
+
test_iter = ParallelSentenceIterator(
|
405 |
+
test, batch_size, max_length=args.maxlen, sos=eos, eos=eos, repeat=False
|
406 |
+
)
|
407 |
+
evaluator = LMEvaluator(test_iter, model, reporter, device=gpu_id)
|
408 |
+
result = evaluator()
|
409 |
+
compute_perplexity(result)
|
410 |
+
logging.info(f"test perplexity: {result['perplexity']}")
|
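The pytorch updater and evaluator above both rely on concat_examples to pad a list of (input, target) pairs into two tensors, padding inputs with 0 and targets with -100 (which matches the default ignore_index of torch.nn.CrossEntropyLoss). A minimal sketch with made-up sentence pairs:

import numpy as np
from espnet.lm.pytorch_backend.lm import concat_examples

# two (input, target) pairs of unequal length, as ParallelSentenceIterator would yield
batch = [
    (np.array([0, 5, 6], dtype=np.int32), np.array([5, 6, 0], dtype=np.int32)),
    (np.array([0, 8], dtype=np.int32), np.array([8, 0], dtype=np.int32)),
]
x, t = concat_examples(batch, device=-1, padding=(0, -100))
print(x)  # inputs padded with 0
print(t)  # targets padded with -100, which the loss should ignore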
espnet/mt/__init__.py
ADDED
@@ -0,0 +1 @@
1 |
+
"""Initialize sub package."""
|
espnet/mt/mt_utils.py
ADDED
@@ -0,0 +1,83 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
# encoding: utf-8
|
3 |
+
|
4 |
+
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
|
5 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
6 |
+
|
7 |
+
"""Utility funcitons for the text translation task."""
|
8 |
+
|
9 |
+
import logging
|
10 |
+
|
11 |
+
|
12 |
+
# * ------------------ recognition related ------------------ *
|
13 |
+
def parse_hypothesis(hyp, char_list):
|
14 |
+
"""Parse hypothesis.
|
15 |
+
|
16 |
+
:param list hyp: recognition hypothesis
|
17 |
+
:param list char_list: list of characters
|
18 |
+
:return: recognition text string
|
19 |
+
:return: recognition token string
|
20 |
+
:return: recognition tokenid string
|
21 |
+
"""
|
22 |
+
# remove sos and get results
|
23 |
+
tokenid_as_list = list(map(int, hyp["yseq"][1:]))
|
24 |
+
token_as_list = [char_list[idx] for idx in tokenid_as_list]
|
25 |
+
score = float(hyp["score"])
|
26 |
+
|
27 |
+
# convert to string
|
28 |
+
tokenid = " ".join([str(idx) for idx in tokenid_as_list])
|
29 |
+
token = " ".join(token_as_list)
|
30 |
+
text = "".join(token_as_list).replace("<space>", " ")
|
31 |
+
|
32 |
+
return text, token, tokenid, score
|
33 |
+
|
34 |
+
|
35 |
+
def add_results_to_json(js, nbest_hyps, char_list):
|
36 |
+
"""Add N-best results to json.
|
37 |
+
|
38 |
+
:param dict js: groundtruth utterance dict
|
39 |
+
:param list nbest_hyps: list of hypothesis
|
40 |
+
:param list char_list: list of characters
|
41 |
+
:return: N-best results added utterance dict
|
42 |
+
"""
|
43 |
+
# copy old json info
|
44 |
+
new_js = dict()
|
45 |
+
if "utt2spk" in js.keys():
|
46 |
+
new_js["utt2spk"] = js["utt2spk"]
|
47 |
+
new_js["output"] = []
|
48 |
+
|
49 |
+
for n, hyp in enumerate(nbest_hyps, 1):
|
50 |
+
# parse hypothesis
|
51 |
+
rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp, char_list)
|
52 |
+
|
53 |
+
# copy ground-truth
|
54 |
+
if len(js["output"]) > 0:
|
55 |
+
out_dic = dict(js["output"][0].items())
|
56 |
+
else:
|
57 |
+
out_dic = {"name": ""}
|
58 |
+
|
59 |
+
# update name
|
60 |
+
out_dic["name"] += "[%d]" % n
|
61 |
+
|
62 |
+
# add recognition results
|
63 |
+
out_dic["rec_text"] = rec_text
|
64 |
+
out_dic["rec_token"] = rec_token
|
65 |
+
out_dic["rec_tokenid"] = rec_tokenid
|
66 |
+
out_dic["score"] = score
|
67 |
+
|
68 |
+
# add source reference
|
69 |
+
out_dic["text_src"] = js["output"][1]["text"]
|
70 |
+
out_dic["token_src"] = js["output"][1]["token"]
|
71 |
+
out_dic["tokenid_src"] = js["output"][1]["tokenid"]
|
72 |
+
|
73 |
+
# add to list of N-best result dicts
|
74 |
+
new_js["output"].append(out_dic)
|
75 |
+
|
76 |
+
# show 1-best result
|
77 |
+
if n == 1:
|
78 |
+
if "text" in out_dic.keys():
|
79 |
+
logging.info("groundtruth: %s" % out_dic["text"])
|
80 |
+
logging.info("prediction : %s" % out_dic["rec_text"])
|
81 |
+
logging.info("source : %s" % out_dic["token_src"])
|
82 |
+
|
83 |
+
return new_js
|
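A usage sketch of parse_hypothesis above, with a toy char_list and a single beam-search hypothesis (values are illustrative only):

from espnet.mt.mt_utils import parse_hypothesis

# toy character inventory and hypothesis; yseq[0] is the <sos>/<eos> symbol and is stripped
char_list = ["<blank>", "<unk>", "<space>", "h", "i", "<eos>"]
hyp = {"yseq": [5, 3, 4], "score": -1.23}

text, token, tokenid, score = parse_hypothesis(hyp, char_list)
print(text)     # "hi"
print(token)    # "h i"
print(tokenid)  # "3 4"
print(score)    # -1.23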
espnet/mt/pytorch_backend/__init__.py
ADDED
@@ -0,0 +1 @@
1 |
+
"""Initialize sub package."""
|
espnet/mt/pytorch_backend/mt.py
ADDED
@@ -0,0 +1,600 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
# encoding: utf-8
|
3 |
+
|
4 |
+
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
|
5 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
6 |
+
|
7 |
+
"""Training/decoding definition for the text translation task."""
|
8 |
+
|
9 |
+
import json
|
10 |
+
import logging
|
11 |
+
import os
|
12 |
+
import sys
|
13 |
+
|
14 |
+
from chainer import training
|
15 |
+
from chainer.training import extensions
|
16 |
+
import numpy as np
|
17 |
+
from tensorboardX import SummaryWriter
|
18 |
+
import torch
|
19 |
+
|
20 |
+
from espnet.asr.asr_utils import adadelta_eps_decay
|
21 |
+
from espnet.asr.asr_utils import adam_lr_decay
|
22 |
+
from espnet.asr.asr_utils import add_results_to_json
|
23 |
+
from espnet.asr.asr_utils import CompareValueTrigger
|
24 |
+
from espnet.asr.asr_utils import restore_snapshot
|
25 |
+
from espnet.asr.asr_utils import snapshot_object
|
26 |
+
from espnet.asr.asr_utils import torch_load
|
27 |
+
from espnet.asr.asr_utils import torch_resume
|
28 |
+
from espnet.asr.asr_utils import torch_snapshot
|
29 |
+
from espnet.nets.mt_interface import MTInterface
|
30 |
+
from espnet.nets.pytorch_backend.e2e_asr import pad_list
|
31 |
+
from espnet.utils.dataset import ChainerDataLoader
|
32 |
+
from espnet.utils.dataset import TransformDataset
|
33 |
+
from espnet.utils.deterministic_utils import set_deterministic_pytorch
|
34 |
+
from espnet.utils.dynamic_import import dynamic_import
|
35 |
+
from espnet.utils.io_utils import LoadInputsAndTargets
|
36 |
+
from espnet.utils.training.batchfy import make_batchset
|
37 |
+
from espnet.utils.training.iterators import ShufflingEnabler
|
38 |
+
from espnet.utils.training.tensorboard_logger import TensorboardLogger
|
39 |
+
from espnet.utils.training.train_utils import check_early_stop
|
40 |
+
from espnet.utils.training.train_utils import set_early_stop
|
41 |
+
|
42 |
+
from espnet.asr.pytorch_backend.asr import CustomEvaluator
|
43 |
+
from espnet.asr.pytorch_backend.asr import CustomUpdater
|
44 |
+
from espnet.asr.pytorch_backend.asr import load_trained_model
|
45 |
+
|
46 |
+
import matplotlib
|
47 |
+
|
48 |
+
matplotlib.use("Agg")
|
49 |
+
|
50 |
+
if sys.version_info[0] == 2:
|
51 |
+
from itertools import izip_longest as zip_longest
|
52 |
+
else:
|
53 |
+
from itertools import zip_longest as zip_longest
|
54 |
+
|
55 |
+
|
56 |
+
class CustomConverter(object):
|
57 |
+
"""Custom batch converter for Pytorch."""
|
58 |
+
|
59 |
+
def __init__(self):
|
60 |
+
"""Construct a CustomConverter object."""
|
61 |
+
self.ignore_id = -1
|
62 |
+
self.pad = 0
|
63 |
+
# NOTE: we reserve index:0 for <pad> although this is reserved for a blank class
|
64 |
+
# in ASR. However,
|
65 |
+
# blank labels are not used in NMT. To keep the vocabulary size,
|
66 |
+
# we use index:0 for padding instead of adding one more class.
|
67 |
+
|
68 |
+
def __call__(self, batch, device=torch.device("cpu")):
|
69 |
+
"""Transform a batch and send it to a device.
|
70 |
+
|
71 |
+
Args:
|
72 |
+
batch (list): The batch to transform.
|
73 |
+
device (torch.device): The device to send to.
|
74 |
+
|
75 |
+
Returns:
|
76 |
+
tuple(torch.Tensor, torch.Tensor, torch.Tensor)
|
77 |
+
|
78 |
+
"""
|
79 |
+
# batch should be located in list
|
80 |
+
assert len(batch) == 1
|
81 |
+
xs, ys = batch[0]
|
82 |
+
|
83 |
+
# get batch of lengths of input sequences
|
84 |
+
ilens = np.array([x.shape[0] for x in xs])
|
85 |
+
|
86 |
+
# perform padding and convert to tensor
|
87 |
+
xs_pad = pad_list([torch.from_numpy(x).long() for x in xs], self.pad).to(device)
|
88 |
+
ilens = torch.from_numpy(ilens).to(device)
|
89 |
+
ys_pad = pad_list([torch.from_numpy(y).long() for y in ys], self.ignore_id).to(
|
90 |
+
device
|
91 |
+
)
|
92 |
+
|
93 |
+
return xs_pad, ilens, ys_pad
|
94 |
+
|
95 |
+
|
96 |
+
def train(args):
|
97 |
+
"""Train with the given args.
|
98 |
+
|
99 |
+
Args:
|
100 |
+
args (namespace): The program arguments.
|
101 |
+
|
102 |
+
"""
|
103 |
+
set_deterministic_pytorch(args)
|
104 |
+
|
105 |
+
# check cuda availability
|
106 |
+
if not torch.cuda.is_available():
|
107 |
+
logging.warning("cuda is not available")
|
108 |
+
|
109 |
+
# get input and output dimension info
|
110 |
+
with open(args.valid_json, "rb") as f:
|
111 |
+
valid_json = json.load(f)["utts"]
|
112 |
+
utts = list(valid_json.keys())
|
113 |
+
idim = int(valid_json[utts[0]]["output"][1]["shape"][1])
|
114 |
+
odim = int(valid_json[utts[0]]["output"][0]["shape"][1])
|
115 |
+
logging.info("#input dims : " + str(idim))
|
116 |
+
logging.info("#output dims: " + str(odim))
|
117 |
+
|
118 |
+
# specify model architecture
|
119 |
+
model_class = dynamic_import(args.model_module)
|
120 |
+
model = model_class(idim, odim, args)
|
121 |
+
assert isinstance(model, MTInterface)
|
122 |
+
|
123 |
+
# write model config
|
124 |
+
if not os.path.exists(args.outdir):
|
125 |
+
os.makedirs(args.outdir)
|
126 |
+
model_conf = args.outdir + "/model.json"
|
127 |
+
with open(model_conf, "wb") as f:
|
128 |
+
logging.info("writing a model config file to " + model_conf)
|
129 |
+
f.write(
|
130 |
+
json.dumps(
|
131 |
+
(idim, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True
|
132 |
+
).encode("utf_8")
|
133 |
+
)
|
134 |
+
for key in sorted(vars(args).keys()):
|
135 |
+
logging.info("ARGS: " + key + ": " + str(vars(args)[key]))
|
136 |
+
|
137 |
+
reporter = model.reporter
|
138 |
+
|
139 |
+
# check the use of multi-gpu
|
140 |
+
if args.ngpu > 1:
|
141 |
+
if args.batch_size != 0:
|
142 |
+
logging.warning(
|
143 |
+
"batch size is automatically increased (%d -> %d)"
|
144 |
+
% (args.batch_size, args.batch_size * args.ngpu)
|
145 |
+
)
|
146 |
+
args.batch_size *= args.ngpu
|
147 |
+
|
148 |
+
# set torch device
|
149 |
+
device = torch.device("cuda" if args.ngpu > 0 else "cpu")
|
150 |
+
if args.train_dtype in ("float16", "float32", "float64"):
|
151 |
+
dtype = getattr(torch, args.train_dtype)
|
152 |
+
else:
|
153 |
+
dtype = torch.float32
|
154 |
+
model = model.to(device=device, dtype=dtype)
|
155 |
+
|
156 |
+
logging.warning(
|
157 |
+
"num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
|
158 |
+
sum(p.numel() for p in model.parameters()),
|
159 |
+
sum(p.numel() for p in model.parameters() if p.requires_grad),
|
160 |
+
sum(p.numel() for p in model.parameters() if p.requires_grad)
|
161 |
+
* 100.0
|
162 |
+
/ sum(p.numel() for p in model.parameters()),
|
163 |
+
)
|
164 |
+
)
|
165 |
+
|
166 |
+
# Setup an optimizer
|
167 |
+
if args.opt == "adadelta":
|
168 |
+
optimizer = torch.optim.Adadelta(
|
169 |
+
model.parameters(), rho=0.95, eps=args.eps, weight_decay=args.weight_decay
|
170 |
+
)
|
171 |
+
elif args.opt == "adam":
|
172 |
+
optimizer = torch.optim.Adam(
|
173 |
+
model.parameters(), lr=args.lr, weight_decay=args.weight_decay
|
174 |
+
)
|
175 |
+
elif args.opt == "noam":
|
176 |
+
from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt
|
177 |
+
|
178 |
+
optimizer = get_std_opt(
|
179 |
+
model.parameters(),
|
180 |
+
args.adim,
|
181 |
+
args.transformer_warmup_steps,
|
182 |
+
args.transformer_lr,
|
183 |
+
)
|
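# Note on the "noam" option: get_std_opt wraps Adam with the Transformer learning-rate
# schedule, roughly lr = transformer_lr * adim**-0.5 * min(step**-0.5,
# step * transformer_warmup_steps**-1.5), i.e. a linear warmup followed by
# inverse-square-root decay.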
184 |
+
else:
|
185 |
+
raise NotImplementedError("unknown optimizer: " + args.opt)
|
186 |
+
|
187 |
+
# setup apex.amp
|
188 |
+
if args.train_dtype in ("O0", "O1", "O2", "O3"):
|
189 |
+
try:
|
190 |
+
from apex import amp
|
191 |
+
except ImportError as e:
|
192 |
+
logging.error(
|
193 |
+
f"You need to install apex for --train-dtype {args.train_dtype}. "
|
194 |
+
"See https://github.com/NVIDIA/apex#linux"
|
195 |
+
)
|
196 |
+
raise e
|
197 |
+
if args.opt == "noam":
|
198 |
+
model, optimizer.optimizer = amp.initialize(
|
199 |
+
model, optimizer.optimizer, opt_level=args.train_dtype
|
200 |
+
)
|
201 |
+
else:
|
202 |
+
model, optimizer = amp.initialize(
|
203 |
+
model, optimizer, opt_level=args.train_dtype
|
204 |
+
)
|
205 |
+
use_apex = True
|
206 |
+
else:
|
207 |
+
use_apex = False
|
208 |
+
|
209 |
+
# FIXME: TOO DIRTY HACK
|
210 |
+
setattr(optimizer, "target", reporter)
|
211 |
+
setattr(optimizer, "serialize", lambda s: reporter.serialize(s))
|
212 |
+
|
213 |
+
# Setup a converter
|
214 |
+
converter = CustomConverter()
|
215 |
+
|
216 |
+
# read json data
|
217 |
+
with open(args.train_json, "rb") as f:
|
218 |
+
train_json = json.load(f)["utts"]
|
219 |
+
with open(args.valid_json, "rb") as f:
|
220 |
+
valid_json = json.load(f)["utts"]
|
221 |
+
|
222 |
+
use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
|
223 |
+
# make minibatch list (variable length)
|
224 |
+
train = make_batchset(
|
225 |
+
train_json,
|
226 |
+
args.batch_size,
|
227 |
+
args.maxlen_in,
|
228 |
+
args.maxlen_out,
|
229 |
+
args.minibatches,
|
230 |
+
min_batch_size=args.ngpu if args.ngpu > 1 else 1,
|
231 |
+
shortest_first=use_sortagrad,
|
232 |
+
count=args.batch_count,
|
233 |
+
batch_bins=args.batch_bins,
|
234 |
+
batch_frames_in=args.batch_frames_in,
|
235 |
+
batch_frames_out=args.batch_frames_out,
|
236 |
+
batch_frames_inout=args.batch_frames_inout,
|
237 |
+
mt=True,
|
238 |
+
iaxis=1,
|
239 |
+
oaxis=0,
|
240 |
+
)
|
241 |
+
valid = make_batchset(
|
242 |
+
valid_json,
|
243 |
+
args.batch_size,
|
244 |
+
args.maxlen_in,
|
245 |
+
args.maxlen_out,
|
246 |
+
args.minibatches,
|
247 |
+
min_batch_size=args.ngpu if args.ngpu > 1 else 1,
|
248 |
+
count=args.batch_count,
|
249 |
+
batch_bins=args.batch_bins,
|
250 |
+
batch_frames_in=args.batch_frames_in,
|
251 |
+
batch_frames_out=args.batch_frames_out,
|
252 |
+
batch_frames_inout=args.batch_frames_inout,
|
253 |
+
mt=True,
|
254 |
+
iaxis=1,
|
255 |
+
oaxis=0,
|
256 |
+
)
|
257 |
+
|
258 |
+
load_tr = LoadInputsAndTargets(mode="mt", load_output=True)
|
259 |
+
load_cv = LoadInputsAndTargets(mode="mt", load_output=True)
|
260 |
+
# hack to make batchsize argument as 1
|
261 |
+
# actual batchsize is included in a list
|
262 |
+
# default collate function converts numpy array to pytorch tensor
|
263 |
+
# we used an empty collate function instead which returns list
|
264 |
+
train_iter = ChainerDataLoader(
|
265 |
+
dataset=TransformDataset(train, lambda data: converter([load_tr(data)])),
|
266 |
+
batch_size=1,
|
267 |
+
num_workers=args.n_iter_processes,
|
268 |
+
shuffle=not use_sortagrad,
|
269 |
+
collate_fn=lambda x: x[0],
|
270 |
+
)
|
271 |
+
valid_iter = ChainerDataLoader(
|
272 |
+
dataset=TransformDataset(valid, lambda data: converter([load_cv(data)])),
|
273 |
+
batch_size=1,
|
274 |
+
shuffle=False,
|
275 |
+
collate_fn=lambda x: x[0],
|
276 |
+
num_workers=args.n_iter_processes,
|
277 |
+
)
|
278 |
+
|
279 |
+
# Set up a trainer
|
280 |
+
updater = CustomUpdater(
|
281 |
+
model,
|
282 |
+
args.grad_clip,
|
283 |
+
{"main": train_iter},
|
284 |
+
optimizer,
|
285 |
+
device,
|
286 |
+
args.ngpu,
|
287 |
+
False,
|
288 |
+
args.accum_grad,
|
289 |
+
use_apex=use_apex,
|
290 |
+
)
|
291 |
+
trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir)
|
292 |
+
|
293 |
+
if use_sortagrad:
|
294 |
+
trainer.extend(
|
295 |
+
ShufflingEnabler([train_iter]),
|
296 |
+
trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"),
|
297 |
+
)
|
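# SortaGrad: when enabled, minibatches are built shortest-first (see shortest_first
# above), which tends to stabilize early training; ShufflingEnabler re-enables random
# shuffling after `sortagrad` epochs (with sortagrad == -1 the trigger is the final
# epoch, so shortest-first ordering is kept throughout).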
298 |
+
|
299 |
+
# Resume from a snapshot
|
300 |
+
if args.resume:
|
301 |
+
logging.info("resumed from %s" % args.resume)
|
302 |
+
torch_resume(args.resume, trainer)
|
303 |
+
|
304 |
+
# Evaluate the model with the test dataset for each epoch
|
305 |
+
if args.save_interval_iters > 0:
|
306 |
+
trainer.extend(
|
307 |
+
CustomEvaluator(model, {"main": valid_iter}, reporter, device, args.ngpu),
|
308 |
+
trigger=(args.save_interval_iters, "iteration"),
|
309 |
+
)
|
310 |
+
else:
|
311 |
+
trainer.extend(
|
312 |
+
CustomEvaluator(model, {"main": valid_iter}, reporter, device, args.ngpu)
|
313 |
+
)
|
314 |
+
|
315 |
+
# Save attention weight each epoch
|
316 |
+
if args.num_save_attention > 0:
|
317 |
+
# NOTE: sort it by output lengths
|
318 |
+
data = sorted(
|
319 |
+
list(valid_json.items())[: args.num_save_attention],
|
320 |
+
key=lambda x: int(x[1]["output"][0]["shape"][0]),
|
321 |
+
reverse=True,
|
322 |
+
)
|
323 |
+
if hasattr(model, "module"):
|
324 |
+
att_vis_fn = model.module.calculate_all_attentions
|
325 |
+
plot_class = model.module.attention_plot_class
|
326 |
+
else:
|
327 |
+
att_vis_fn = model.calculate_all_attentions
|
328 |
+
plot_class = model.attention_plot_class
|
329 |
+
att_reporter = plot_class(
|
330 |
+
att_vis_fn,
|
331 |
+
data,
|
332 |
+
args.outdir + "/att_ws",
|
333 |
+
converter=converter,
|
334 |
+
transform=load_cv,
|
335 |
+
device=device,
|
336 |
+
ikey="output",
|
337 |
+
iaxis=1,
|
338 |
+
)
|
339 |
+
trainer.extend(att_reporter, trigger=(1, "epoch"))
|
340 |
+
else:
|
341 |
+
att_reporter = None
|
342 |
+
|
343 |
+
# Make a plot for training and validation values
|
344 |
+
trainer.extend(
|
345 |
+
extensions.PlotReport(
|
346 |
+
["main/loss", "validation/main/loss"], "epoch", file_name="loss.png"
|
347 |
+
)
|
348 |
+
)
|
349 |
+
trainer.extend(
|
350 |
+
extensions.PlotReport(
|
351 |
+
["main/acc", "validation/main/acc"], "epoch", file_name="acc.png"
|
352 |
+
)
|
353 |
+
)
|
354 |
+
trainer.extend(
|
355 |
+
extensions.PlotReport(
|
356 |
+
["main/ppl", "validation/main/ppl"], "epoch", file_name="ppl.png"
|
357 |
+
)
|
358 |
+
)
|
359 |
+
trainer.extend(
|
360 |
+
extensions.PlotReport(
|
361 |
+
["main/bleu", "validation/main/bleu"], "epoch", file_name="bleu.png"
|
362 |
+
)
|
363 |
+
)
|
364 |
+
|
365 |
+
# Save best models
|
366 |
+
trainer.extend(
|
367 |
+
snapshot_object(model, "model.loss.best"),
|
368 |
+
trigger=training.triggers.MinValueTrigger("validation/main/loss"),
|
369 |
+
)
|
370 |
+
trainer.extend(
|
371 |
+
snapshot_object(model, "model.acc.best"),
|
372 |
+
trigger=training.triggers.MaxValueTrigger("validation/main/acc"),
|
373 |
+
)
|
374 |
+
|
375 |
+
# save snapshot which contains model and optimizer states
|
376 |
+
if args.save_interval_iters > 0:
|
377 |
+
trainer.extend(
|
378 |
+
torch_snapshot(filename="snapshot.iter.{.updater.iteration}"),
|
379 |
+
trigger=(args.save_interval_iters, "iteration"),
|
380 |
+
)
|
381 |
+
else:
|
382 |
+
trainer.extend(torch_snapshot(), trigger=(1, "epoch"))
|
383 |
+
|
384 |
+
# epsilon decay in the optimizer
|
385 |
+
if args.opt == "adadelta":
|
386 |
+
if args.criterion == "acc":
|
387 |
+
trainer.extend(
|
388 |
+
restore_snapshot(
|
389 |
+
model, args.outdir + "/model.acc.best", load_fn=torch_load
|
390 |
+
),
|
391 |
+
trigger=CompareValueTrigger(
|
392 |
+
"validation/main/acc",
|
393 |
+
lambda best_value, current_value: best_value > current_value,
|
394 |
+
),
|
395 |
+
)
|
396 |
+
trainer.extend(
|
397 |
+
adadelta_eps_decay(args.eps_decay),
|
398 |
+
trigger=CompareValueTrigger(
|
399 |
+
"validation/main/acc",
|
400 |
+
lambda best_value, current_value: best_value > current_value,
|
401 |
+
),
|
402 |
+
)
|
403 |
+
elif args.criterion == "loss":
|
404 |
+
trainer.extend(
|
405 |
+
restore_snapshot(
|
406 |
+
model, args.outdir + "/model.loss.best", load_fn=torch_load
|
407 |
+
),
|
408 |
+
trigger=CompareValueTrigger(
|
409 |
+
"validation/main/loss",
|
410 |
+
lambda best_value, current_value: best_value < current_value,
|
411 |
+
),
|
412 |
+
)
|
413 |
+
trainer.extend(
|
414 |
+
adadelta_eps_decay(args.eps_decay),
|
415 |
+
trigger=CompareValueTrigger(
|
416 |
+
"validation/main/loss",
|
417 |
+
lambda best_value, current_value: best_value < current_value,
|
418 |
+
),
|
419 |
+
)
|
420 |
+
elif args.opt == "adam":
|
421 |
+
if args.criterion == "acc":
|
422 |
+
trainer.extend(
|
423 |
+
restore_snapshot(
|
424 |
+
model, args.outdir + "/model.acc.best", load_fn=torch_load
|
425 |
+
),
|
426 |
+
trigger=CompareValueTrigger(
|
427 |
+
"validation/main/acc",
|
428 |
+
lambda best_value, current_value: best_value > current_value,
|
429 |
+
),
|
430 |
+
)
|
431 |
+
trainer.extend(
|
432 |
+
adam_lr_decay(args.lr_decay),
|
433 |
+
trigger=CompareValueTrigger(
|
434 |
+
"validation/main/acc",
|
435 |
+
lambda best_value, current_value: best_value > current_value,
|
436 |
+
),
|
437 |
+
)
|
438 |
+
elif args.criterion == "loss":
|
439 |
+
trainer.extend(
|
440 |
+
restore_snapshot(
|
441 |
+
model, args.outdir + "/model.loss.best", load_fn=torch_load
|
442 |
+
),
|
443 |
+
trigger=CompareValueTrigger(
|
444 |
+
"validation/main/loss",
|
445 |
+
lambda best_value, current_value: best_value < current_value,
|
446 |
+
),
|
447 |
+
)
|
448 |
+
trainer.extend(
|
449 |
+
adam_lr_decay(args.lr_decay),
|
450 |
+
trigger=CompareValueTrigger(
|
451 |
+
"validation/main/loss",
|
452 |
+
lambda best_value, current_value: best_value < current_value,
|
453 |
+
),
|
454 |
+
)
|
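# Both optimizer branches follow the same scheme: when the monitored validation
# metric stops improving, the best snapshot so far is restored and the optimizer is
# made more conservative (eps decay for adadelta, lr decay for adam).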
455 |
+
|
456 |
+
# Write a log of evaluation statistics for each epoch
|
457 |
+
trainer.extend(
|
458 |
+
extensions.LogReport(trigger=(args.report_interval_iters, "iteration"))
|
459 |
+
)
|
460 |
+
report_keys = [
|
461 |
+
"epoch",
|
462 |
+
"iteration",
|
463 |
+
"main/loss",
|
464 |
+
"validation/main/loss",
|
465 |
+
"main/acc",
|
466 |
+
"validation/main/acc",
|
467 |
+
"main/ppl",
|
468 |
+
"validation/main/ppl",
|
469 |
+
"elapsed_time",
|
470 |
+
]
|
471 |
+
if args.opt == "adadelta":
|
472 |
+
trainer.extend(
|
473 |
+
extensions.observe_value(
|
474 |
+
"eps",
|
475 |
+
lambda trainer: trainer.updater.get_optimizer("main").param_groups[0][
|
476 |
+
"eps"
|
477 |
+
],
|
478 |
+
),
|
479 |
+
trigger=(args.report_interval_iters, "iteration"),
|
480 |
+
)
|
481 |
+
report_keys.append("eps")
|
482 |
+
elif args.opt in ["adam", "noam"]:
|
483 |
+
trainer.extend(
|
484 |
+
extensions.observe_value(
|
485 |
+
"lr",
|
486 |
+
lambda trainer: trainer.updater.get_optimizer("main").param_groups[0][
|
487 |
+
"lr"
|
488 |
+
],
|
489 |
+
),
|
490 |
+
trigger=(args.report_interval_iters, "iteration"),
|
491 |
+
)
|
492 |
+
report_keys.append("lr")
|
493 |
+
if args.report_bleu:
|
494 |
+
report_keys.append("main/bleu")
|
495 |
+
report_keys.append("validation/main/bleu")
|
496 |
+
trainer.extend(
|
497 |
+
extensions.PrintReport(report_keys),
|
498 |
+
trigger=(args.report_interval_iters, "iteration"),
|
499 |
+
)
|
500 |
+
|
501 |
+
trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
|
502 |
+
set_early_stop(trainer, args)
|
503 |
+
|
504 |
+
if args.tensorboard_dir is not None and args.tensorboard_dir != "":
|
505 |
+
trainer.extend(
|
506 |
+
TensorboardLogger(SummaryWriter(args.tensorboard_dir), att_reporter),
|
507 |
+
trigger=(args.report_interval_iters, "iteration"),
|
508 |
+
)
|
509 |
+
# Run the training
|
510 |
+
trainer.run()
|
511 |
+
check_early_stop(trainer, args.epochs)
|
512 |
+
|
513 |
+
|
514 |
+
def trans(args):
|
515 |
+
"""Decode with the given args.
|
516 |
+
|
517 |
+
Args:
|
518 |
+
args (namespace): The program arguments.
|
519 |
+
|
520 |
+
"""
|
521 |
+
set_deterministic_pytorch(args)
|
522 |
+
model, train_args = load_trained_model(args.model)
|
523 |
+
assert isinstance(model, MTInterface)
|
524 |
+
model.trans_args = args
|
525 |
+
|
526 |
+
# gpu
|
527 |
+
if args.ngpu == 1:
|
528 |
+
gpu_id = list(range(args.ngpu))
|
529 |
+
logging.info("gpu id: " + str(gpu_id))
|
530 |
+
model.cuda()
|
531 |
+
|
532 |
+
# read json data
|
533 |
+
with open(args.trans_json, "rb") as f:
|
534 |
+
js = json.load(f)["utts"]
|
535 |
+
new_js = {}
|
536 |
+
|
537 |
+
# remove empty utterances
|
538 |
+
if train_args.multilingual:
|
539 |
+
js = {
|
540 |
+
k: v
|
541 |
+
for k, v in js.items()
|
542 |
+
if v["output"][0]["shape"][0] > 1 and v["output"][1]["shape"][0] > 1
|
543 |
+
}
|
544 |
+
else:
|
545 |
+
js = {
|
546 |
+
k: v
|
547 |
+
for k, v in js.items()
|
548 |
+
if v["output"][0]["shape"][0] > 0 and v["output"][1]["shape"][0] > 0
|
549 |
+
}
|
550 |
+
|
551 |
+
if args.batchsize == 0:
|
552 |
+
with torch.no_grad():
|
553 |
+
for idx, name in enumerate(js.keys(), 1):
|
554 |
+
logging.info("(%d/%d) decoding " + name, idx, len(js.keys()))
|
555 |
+
feat = [js[name]["output"][1]["tokenid"].split()]
|
556 |
+
nbest_hyps = model.translate(feat, args, train_args.char_list)
|
557 |
+
new_js[name] = add_results_to_json(
|
558 |
+
js[name], nbest_hyps, train_args.char_list
|
559 |
+
)
|
560 |
+
|
561 |
+
else:
|
562 |
+
|
563 |
+
def grouper(n, iterable, fillvalue=None):
|
564 |
+
kargs = [iter(iterable)] * n
|
565 |
+
return zip_longest(*kargs, fillvalue=fillvalue)
|
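# e.g. list(grouper(2, ["u1", "u2", "u3"])) == [("u1", "u2"), ("u3", None)];
# the None fill values are filtered out just below.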
566 |
+
|
567 |
+
# sort data
|
568 |
+
keys = list(js.keys())
|
569 |
+
feat_lens = [js[key]["output"][1]["shape"][0] for key in keys]
|
570 |
+
sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i])
|
571 |
+
keys = [keys[i] for i in sorted_index]
|
572 |
+
|
573 |
+
with torch.no_grad():
|
574 |
+
for names in grouper(args.batchsize, keys, None):
|
575 |
+
names = [name for name in names if name]
|
576 |
+
feats = [
|
577 |
+
np.fromiter(
|
578 |
+
map(int, js[name]["output"][1]["tokenid"].split()),
|
579 |
+
dtype=np.int64,
|
580 |
+
)
|
581 |
+
for name in names
|
582 |
+
]
|
583 |
+
nbest_hyps = model.translate_batch(
|
584 |
+
feats,
|
585 |
+
args,
|
586 |
+
train_args.char_list,
|
587 |
+
)
|
588 |
+
|
589 |
+
for i, nbest_hyp in enumerate(nbest_hyps):
|
590 |
+
name = names[i]
|
591 |
+
new_js[name] = add_results_to_json(
|
592 |
+
js[name], nbest_hyp, train_args.char_list
|
593 |
+
)
|
594 |
+
|
595 |
+
with open(args.result_label, "wb") as f:
|
596 |
+
f.write(
|
597 |
+
json.dumps(
|
598 |
+
{"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
|
599 |
+
).encode("utf_8")
|
600 |
+
)
|
espnet/nets/__init__.py
ADDED
@@ -0,0 +1 @@
1 |
+
"""Initialize sub package."""
|
espnet/nets/asr_interface.py
ADDED
@@ -0,0 +1,172 @@
1 |
+
"""ASR Interface module."""
|
2 |
+
import argparse
|
3 |
+
|
4 |
+
from espnet.bin.asr_train import get_parser
|
5 |
+
from espnet.utils.dynamic_import import dynamic_import
|
6 |
+
from espnet.utils.fill_missing_args import fill_missing_args
|
7 |
+
|
8 |
+
|
9 |
+
class ASRInterface:
|
10 |
+
"""ASR Interface for ESPnet model implementation."""
|
11 |
+
|
12 |
+
@staticmethod
|
13 |
+
def add_arguments(parser):
|
14 |
+
"""Add arguments to parser."""
|
15 |
+
return parser
|
16 |
+
|
17 |
+
@classmethod
|
18 |
+
def build(cls, idim: int, odim: int, **kwargs):
|
19 |
+
"""Initialize this class with python-level args.
|
20 |
+
|
21 |
+
Args:
|
22 |
+
idim (int): The number of an input feature dim.
|
23 |
+
odim (int): The number of output vocab.
|
24 |
+
|
25 |
+
Returns:
|
26 |
+
ASRInterface: A new instance of ASRInterface.
|
27 |
+
|
28 |
+
"""
|
29 |
+
|
30 |
+
def wrap(parser):
|
31 |
+
return get_parser(parser, required=False)
|
32 |
+
|
33 |
+
args = argparse.Namespace(**kwargs)
|
34 |
+
args = fill_missing_args(args, wrap)
|
35 |
+
args = fill_missing_args(args, cls.add_arguments)
|
36 |
+
return cls(idim, odim, args)
|
37 |
+
|
38 |
+
def forward(self, xs, ilens, ys):
|
39 |
+
"""Compute loss for training.
|
40 |
+
|
41 |
+
:param xs:
|
42 |
+
For pytorch, batch of padded source sequences torch.Tensor (B, Tmax, idim)
|
43 |
+
For chainer, list of source sequences chainer.Variable
|
44 |
+
:param ilens: batch of lengths of source sequences (B)
|
45 |
+
For pytorch, torch.Tensor
|
46 |
+
For chainer, list of int
|
47 |
+
:param ys:
|
48 |
+
For pytorch, batch of padded source sequences torch.Tensor (B, Lmax)
|
49 |
+
For chainer, list of source sequences chainer.Variable
|
50 |
+
:return: loss value
|
51 |
+
:rtype: torch.Tensor for pytorch, chainer.Variable for chainer
|
52 |
+
"""
|
53 |
+
raise NotImplementedError("forward method is not implemented")
|
54 |
+
|
55 |
+
def recognize(self, x, recog_args, char_list=None, rnnlm=None):
|
56 |
+
"""Recognize x for evaluation.
|
57 |
+
|
58 |
+
:param ndarray x: input acoustic feature (B, T, D) or (T, D)
|
59 |
+
:param namespace recog_args: argument namespace containing options
|
60 |
+
:param list char_list: list of characters
|
61 |
+
:param torch.nn.Module rnnlm: language model module
|
62 |
+
:return: N-best decoding results
|
63 |
+
:rtype: list
|
64 |
+
"""
|
65 |
+
raise NotImplementedError("recognize method is not implemented")
|
66 |
+
|
67 |
+
def recognize_batch(self, x, recog_args, char_list=None, rnnlm=None):
|
68 |
+
"""Beam search implementation for batch.
|
69 |
+
|
70 |
+
:param torch.Tensor x: encoder hidden state sequences (B, Tmax, Henc)
|
71 |
+
:param namespace recog_args: argument namespace containing options
|
72 |
+
:param list char_list: list of characters
|
73 |
+
:param torch.nn.Module rnnlm: language model module
|
74 |
+
:return: N-best decoding results
|
75 |
+
:rtype: list
|
76 |
+
"""
|
77 |
+
raise NotImplementedError("Batch decoding is not supported yet.")
|
78 |
+
|
79 |
+
def calculate_all_attentions(self, xs, ilens, ys):
|
80 |
+
"""Caluculate attention.
|
81 |
+
|
82 |
+
:param list xs: list of padded input sequences [(T1, idim), (T2, idim), ...]
|
83 |
+
:param ndarray ilens: batch of lengths of input sequences (B)
|
84 |
+
:param list ys: list of character id sequence tensor [(L1), (L2), (L3), ...]
|
85 |
+
:return: attention weights (B, Lmax, Tmax)
|
86 |
+
:rtype: float ndarray
|
87 |
+
"""
|
88 |
+
raise NotImplementedError("calculate_all_attentions method is not implemented")
|
89 |
+
|
90 |
+
def calculate_all_ctc_probs(self, xs, ilens, ys):
|
91 |
+
"""Caluculate CTC probability.
|
92 |
+
|
93 |
+
:param list xs: list of padded input sequences [(T1, idim), (T2, idim), ...]
|
94 |
+
:param ndarray ilens: batch of lengths of input sequences (B)
|
95 |
+
:param list ys: list of character id sequence tensor [(L1), (L2), (L3), ...]
|
96 |
+
:return: CTC probabilities (B, Tmax, vocab)
|
97 |
+
:rtype: float ndarray
|
98 |
+
"""
|
99 |
+
raise NotImplementedError("calculate_all_ctc_probs method is not implemented")
|
100 |
+
|
101 |
+
@property
|
102 |
+
def attention_plot_class(self):
|
103 |
+
"""Get attention plot class."""
|
104 |
+
from espnet.asr.asr_utils import PlotAttentionReport
|
105 |
+
|
106 |
+
return PlotAttentionReport
|
107 |
+
|
108 |
+
@property
|
109 |
+
def ctc_plot_class(self):
|
110 |
+
"""Get CTC plot class."""
|
111 |
+
from espnet.asr.asr_utils import PlotCTCReport
|
112 |
+
|
113 |
+
return PlotCTCReport
|
114 |
+
|
115 |
+
def get_total_subsampling_factor(self):
|
116 |
+
"""Get total subsampling factor."""
|
117 |
+
raise NotImplementedError(
|
118 |
+
"get_total_subsampling_factor method is not implemented"
|
119 |
+
)
|
120 |
+
|
121 |
+
def encode(self, feat):
|
122 |
+
"""Encode feature in `beam_search` (optional).
|
123 |
+
|
124 |
+
Args:
|
125 |
+
feat (numpy.ndarray): input feature (T, D)
|
126 |
+
Returns:
|
127 |
+
torch.Tensor for pytorch, chainer.Variable for chainer:
|
128 |
+
encoded feature (T, D)
|
129 |
+
|
130 |
+
"""
|
131 |
+
raise NotImplementedError("encode method is not implemented")
|
132 |
+
|
133 |
+
def scorers(self):
|
134 |
+
"""Get scorers for `beam_search` (optional).
|
135 |
+
|
136 |
+
Returns:
|
137 |
+
dict[str, ScorerInterface]: dict of `ScorerInterface` objects
|
138 |
+
|
139 |
+
"""
|
140 |
+
raise NotImplementedError("decoders method is not implemented")
|
141 |
+
|
142 |
+
|
143 |
+
predefined_asr = {
|
144 |
+
"pytorch": {
|
145 |
+
"rnn": "espnet.nets.pytorch_backend.e2e_asr:E2E",
|
146 |
+
"transducer": "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E",
|
147 |
+
"transformer": "espnet.nets.pytorch_backend.e2e_asr_transformer:E2E",
|
148 |
+
"conformer": "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E",
|
149 |
+
},
|
150 |
+
"chainer": {
|
151 |
+
"rnn": "espnet.nets.chainer_backend.e2e_asr:E2E",
|
152 |
+
"transformer": "espnet.nets.chainer_backend.e2e_asr_transformer:E2E",
|
153 |
+
},
|
154 |
+
}
|
155 |
+
|
156 |
+
|
157 |
+
def dynamic_import_asr(module, backend):
|
158 |
+
"""Import ASR models dynamically.
|
159 |
+
|
160 |
+
Args:
|
161 |
+
module (str): module_name:class_name or alias in `predefined_asr`
|
162 |
+
backend (str): NN backend. e.g., pytorch, chainer
|
163 |
+
|
164 |
+
Returns:
|
165 |
+
type: ASR class
|
166 |
+
|
167 |
+
"""
|
168 |
+
model_class = dynamic_import(module, predefined_asr.get(backend, dict()))
|
169 |
+
assert issubclass(
|
170 |
+
model_class, ASRInterface
|
171 |
+
), f"{module} does not implement ASRInterface"
|
172 |
+
return model_class
|
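# Usage sketch: aliases in predefined_asr resolve to module paths, so
# dynamic_import_asr("transformer", "pytorch") returns the E2E class from
# espnet.nets.pytorch_backend.e2e_asr_transformer, while a full "module:Class"
# string bypasses the alias table.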
espnet/nets/batch_beam_search.py
ADDED
@@ -0,0 +1,348 @@
1 |
+
"""Parallel beam search module."""
|
2 |
+
|
3 |
+
import logging
|
4 |
+
from typing import Any
|
5 |
+
from typing import Dict
|
6 |
+
from typing import List
|
7 |
+
from typing import NamedTuple
|
8 |
+
from typing import Tuple
|
9 |
+
|
10 |
+
import torch
|
11 |
+
from torch.nn.utils.rnn import pad_sequence
|
12 |
+
|
13 |
+
from espnet.nets.beam_search import BeamSearch
|
14 |
+
from espnet.nets.beam_search import Hypothesis
|
15 |
+
|
16 |
+
|
17 |
+
class BatchHypothesis(NamedTuple):
|
18 |
+
"""Batchfied/Vectorized hypothesis data type."""
|
19 |
+
|
20 |
+
yseq: torch.Tensor = torch.tensor([]) # (batch, maxlen)
|
21 |
+
score: torch.Tensor = torch.tensor([]) # (batch,)
|
22 |
+
length: torch.Tensor = torch.tensor([]) # (batch,)
|
23 |
+
scores: Dict[str, torch.Tensor] = dict() # values: (batch,)
|
24 |
+
states: Dict[str, Dict] = dict()
|
25 |
+
|
26 |
+
def __len__(self) -> int:
|
27 |
+
"""Return a batch size."""
|
28 |
+
return len(self.length)
|
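# In this batched representation, yseq holds eos-padded token sequences and `length`
# records each hypothesis's true length, so individual hypotheses can be recovered
# exactly by unbatchfy/_select below.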
29 |
+
|
30 |
+
|
31 |
+
class BatchBeamSearch(BeamSearch):
|
32 |
+
"""Batch beam search implementation."""
|
33 |
+
|
34 |
+
def batchfy(self, hyps: List[Hypothesis]) -> BatchHypothesis:
|
35 |
+
"""Convert list to batch."""
|
36 |
+
if len(hyps) == 0:
|
37 |
+
return BatchHypothesis()
|
38 |
+
return BatchHypothesis(
|
39 |
+
yseq=pad_sequence(
|
40 |
+
[h.yseq for h in hyps], batch_first=True, padding_value=self.eos
|
41 |
+
),
|
42 |
+
length=torch.tensor([len(h.yseq) for h in hyps], dtype=torch.int64),
|
43 |
+
score=torch.tensor([h.score for h in hyps]),
|
44 |
+
scores={k: torch.tensor([h.scores[k] for h in hyps]) for k in self.scorers},
|
45 |
+
states={k: [h.states[k] for h in hyps] for k in self.scorers},
|
46 |
+
)
|
47 |
+
|
48 |
+
def _batch_select(self, hyps: BatchHypothesis, ids: List[int]) -> BatchHypothesis:
|
49 |
+
return BatchHypothesis(
|
50 |
+
yseq=hyps.yseq[ids],
|
51 |
+
score=hyps.score[ids],
|
52 |
+
length=hyps.length[ids],
|
53 |
+
scores={k: v[ids] for k, v in hyps.scores.items()},
|
54 |
+
states={
|
55 |
+
k: [self.scorers[k].select_state(v, i) for i in ids]
|
56 |
+
for k, v in hyps.states.items()
|
57 |
+
},
|
58 |
+
)
|
59 |
+
|
60 |
+
def _select(self, hyps: BatchHypothesis, i: int) -> Hypothesis:
|
61 |
+
return Hypothesis(
|
62 |
+
yseq=hyps.yseq[i, : hyps.length[i]],
|
63 |
+
score=hyps.score[i],
|
64 |
+
scores={k: v[i] for k, v in hyps.scores.items()},
|
65 |
+
states={
|
66 |
+
k: self.scorers[k].select_state(v, i) for k, v in hyps.states.items()
|
67 |
+
},
|
68 |
+
)
|
69 |
+
|
70 |
+
def unbatchfy(self, batch_hyps: BatchHypothesis) -> List[Hypothesis]:
|
71 |
+
"""Revert batch to list."""
|
72 |
+
return [
|
73 |
+
Hypothesis(
|
74 |
+
yseq=batch_hyps.yseq[i][: batch_hyps.length[i]],
|
75 |
+
score=batch_hyps.score[i],
|
76 |
+
scores={k: batch_hyps.scores[k][i] for k in self.scorers},
|
77 |
+
states={
|
78 |
+
k: v.select_state(batch_hyps.states[k], i)
|
79 |
+
for k, v in self.scorers.items()
|
80 |
+
},
|
81 |
+
)
|
82 |
+
for i in range(len(batch_hyps.length))
|
83 |
+
]
|
84 |
+
|
85 |
+
def batch_beam(
|
86 |
+
self, weighted_scores: torch.Tensor, ids: torch.Tensor
|
87 |
+
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
88 |
+
"""Batch-compute topk full token ids and partial token ids.
|
89 |
+
|
90 |
+
Args:
|
91 |
+
weighted_scores (torch.Tensor): The weighted sum scores for each token.
|
92 |
+
Its shape is `(n_beam, self.vocab_size)`.
|
93 |
+
ids (torch.Tensor): The partial token ids to compute topk.
|
94 |
+
Its shape is `(n_beam, self.pre_beam_size)`.
|
95 |
+
|
96 |
+
Returns:
|
97 |
+
Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
98 |
+
The topk full (prev_hyp, new_token) ids
|
99 |
+
and partial (prev_hyp, new_token) ids.
|
100 |
+
Their shapes are all `(self.beam_size,)`
|
101 |
+
|
102 |
+
"""
|
103 |
+
top_ids = weighted_scores.view(-1).topk(self.beam_size)[1]
|
104 |
+
# Because of the flatten above, `top_ids` is organized as:
|
105 |
+
# [hyp1 * V + token1, hyp2 * V + token2, ..., hypK * V + tokenK],
|
106 |
+
# where V is `self.n_vocab` and K is `self.beam_size`
|
107 |
+
prev_hyp_ids = top_ids // self.n_vocab
|
108 |
+
new_token_ids = top_ids % self.n_vocab
|
109 |
+
return prev_hyp_ids, new_token_ids, prev_hyp_ids, new_token_ids
|
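# Worked example: with n_vocab = 5, a flattened top-k index of 12 decodes to
# prev_hyp_id = 12 // 5 = 2 and new_token_id = 12 % 5 = 2, i.e. the third running
# hypothesis extended with token id 2.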
110 |
+
|
111 |
+
def init_hyp(self, x: torch.Tensor) -> BatchHypothesis:
|
112 |
+
"""Get an initial hypothesis data.
|
113 |
+
|
114 |
+
Args:
|
115 |
+
x (torch.Tensor): The encoder output feature
|
116 |
+
|
117 |
+
Returns:
|
118 |
+
Hypothesis: The initial hypothesis.
|
119 |
+
|
120 |
+
"""
|
121 |
+
init_states = dict()
|
122 |
+
init_scores = dict()
|
123 |
+
for k, d in self.scorers.items():
|
124 |
+
init_states[k] = d.batch_init_state(x)
|
125 |
+
init_scores[k] = 0.0
|
126 |
+
return self.batchfy(
|
127 |
+
[
|
128 |
+
Hypothesis(
|
129 |
+
score=0.0,
|
130 |
+
scores=init_scores,
|
131 |
+
states=init_states,
|
132 |
+
yseq=torch.tensor([self.sos], device=x.device),
|
133 |
+
)
|
134 |
+
]
|
135 |
+
)
|
136 |
+
|
137 |
+
def score_full(
|
138 |
+
self, hyp: BatchHypothesis, x: torch.Tensor
|
139 |
+
) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
|
140 |
+
"""Score new hypothesis by `self.full_scorers`.
|
141 |
+
|
142 |
+
Args:
|
143 |
+
hyp (Hypothesis): Hypothesis with prefix tokens to score
|
144 |
+
x (torch.Tensor): Corresponding input feature
|
145 |
+
|
146 |
+
Returns:
|
147 |
+
Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
|
148 |
+
score dict of `hyp` that has string keys of `self.full_scorers`
|
149 |
+
and tensor score values of shape: `(self.n_vocab,)`,
|
150 |
+
and state dict that has string keys
|
151 |
+
and state values of `self.full_scorers`
|
152 |
+
|
153 |
+
"""
|
154 |
+
scores = dict()
|
155 |
+
states = dict()
|
156 |
+
for k, d in self.full_scorers.items():
|
157 |
+
scores[k], states[k] = d.batch_score(hyp.yseq, hyp.states[k], x)
|
158 |
+
return scores, states
|
159 |
+
|
160 |
+
def score_partial(
|
161 |
+
self, hyp: BatchHypothesis, ids: torch.Tensor, x: torch.Tensor
|
162 |
+
) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
|
163 |
+
"""Score new hypothesis by `self.full_scorers`.
|
164 |
+
|
165 |
+
Args:
|
166 |
+
hyp (Hypothesis): Hypothesis with prefix tokens to score
|
167 |
+
ids (torch.Tensor): 2D tensor of new partial tokens to score
|
168 |
+
x (torch.Tensor): Corresponding input feature
|
169 |
+
|
170 |
+
Returns:
|
171 |
+
Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
|
172 |
+
score dict of `hyp` that has string keys of `self.part_scorers`
|
173 |
+
and tensor score values of shape: `(self.n_vocab,)`,
|
174 |
+
and state dict that has string keys
|
175 |
+
and state values of `self.part_scorers`
|
176 |
+
|
177 |
+
"""
|
178 |
+
scores = dict()
|
179 |
+
states = dict()
|
180 |
+
for k, d in self.part_scorers.items():
|
181 |
+
scores[k], states[k] = d.batch_score_partial(
|
182 |
+
hyp.yseq, ids, hyp.states[k], x
|
183 |
+
)
|
184 |
+
return scores, states
|
185 |
+
|
186 |
+
def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any:
|
187 |
+
"""Merge states for new hypothesis.
|
188 |
+
|
189 |
+
Args:
|
190 |
+
states: states of `self.full_scorers`
|
191 |
+
part_states: states of `self.part_scorers`
|
192 |
+
part_idx (int): The new token id for `part_scores`
|
193 |
+
|
194 |
+
Returns:
|
195 |
+
Dict[str, torch.Tensor]: The new score dict.
|
196 |
+
Its keys are names of `self.full_scorers` and `self.part_scorers`.
|
197 |
+
Its values are states of the scorers.
|
198 |
+
|
199 |
+
"""
|
200 |
+
new_states = dict()
|
201 |
+
for k, v in states.items():
|
202 |
+
new_states[k] = v
|
203 |
+
for k, v in part_states.items():
|
204 |
+
new_states[k] = v
|
205 |
+
return new_states
|
206 |
+
|
207 |
+
def search(self, running_hyps: BatchHypothesis, x: torch.Tensor) -> BatchHypothesis:
|
208 |
+
"""Search new tokens for running hypotheses and encoded speech x.
|
209 |
+
|
210 |
+
Args:
|
211 |
+
running_hyps (BatchHypothesis): Running hypotheses on beam
|
212 |
+
x (torch.Tensor): Encoded speech feature (T, D)
|
213 |
+
|
214 |
+
Returns:
|
215 |
+
BatchHypothesis: Best sorted hypotheses
|
216 |
+
|
217 |
+
"""
|
218 |
+
n_batch = len(running_hyps)
|
219 |
+
part_ids = None # no pre-beam
|
220 |
+
# batch scoring
|
221 |
+
weighted_scores = torch.zeros(
|
222 |
+
n_batch, self.n_vocab, dtype=x.dtype, device=x.device
|
223 |
+
)
|
224 |
+
scores, states = self.score_full(running_hyps, x.expand(n_batch, *x.shape))
|
225 |
+
for k in self.full_scorers:
|
226 |
+
weighted_scores += self.weights[k] * scores[k]
|
227 |
+
# partial scoring
|
228 |
+
if self.do_pre_beam:
|
229 |
+
pre_beam_scores = (
|
230 |
+
weighted_scores
|
231 |
+
if self.pre_beam_score_key == "full"
|
232 |
+
else scores[self.pre_beam_score_key]
|
233 |
+
)
|
234 |
+
part_ids = torch.topk(pre_beam_scores, self.pre_beam_size, dim=-1)[1]
|
235 |
+
# NOTE(takaaki-hori): Unlike BeamSearch, we assume that score_partial returns
|
236 |
+
# full-size score matrices, which has non-zero scores for part_ids and zeros
|
237 |
+
# for others.
|
238 |
+
part_scores, part_states = self.score_partial(running_hyps, part_ids, x)
|
239 |
+
for k in self.part_scorers:
|
240 |
+
weighted_scores += self.weights[k] * part_scores[k]
|
241 |
+
# add previous hyp scores
|
242 |
+
weighted_scores += running_hyps.score.to(
|
243 |
+
dtype=x.dtype, device=x.device
|
244 |
+
).unsqueeze(1)
|
245 |
+
|
246 |
+
# TODO(karita): do not use list. use batch instead
|
247 |
+
# see also https://github.com/espnet/espnet/pull/1402#discussion_r354561029
|
248 |
+
# update hyps
|
249 |
+
best_hyps = []
|
250 |
+
prev_hyps = self.unbatchfy(running_hyps)
|
251 |
+
for (
|
252 |
+
full_prev_hyp_id,
|
253 |
+
full_new_token_id,
|
254 |
+
part_prev_hyp_id,
|
255 |
+
part_new_token_id,
|
256 |
+
) in zip(*self.batch_beam(weighted_scores, part_ids)):
|
257 |
+
prev_hyp = prev_hyps[full_prev_hyp_id]
|
258 |
+
best_hyps.append(
|
259 |
+
Hypothesis(
|
260 |
+
score=weighted_scores[full_prev_hyp_id, full_new_token_id],
|
261 |
+
yseq=self.append_token(prev_hyp.yseq, full_new_token_id),
|
262 |
+
scores=self.merge_scores(
|
263 |
+
prev_hyp.scores,
|
264 |
+
{k: v[full_prev_hyp_id] for k, v in scores.items()},
|
265 |
+
full_new_token_id,
|
266 |
+
{k: v[part_prev_hyp_id] for k, v in part_scores.items()},
|
267 |
+
part_new_token_id,
|
268 |
+
),
|
269 |
+
states=self.merge_states(
|
270 |
+
{
|
271 |
+
k: self.full_scorers[k].select_state(v, full_prev_hyp_id)
|
272 |
+
for k, v in states.items()
|
273 |
+
},
|
274 |
+
{
|
275 |
+
k: self.part_scorers[k].select_state(
|
276 |
+
v, part_prev_hyp_id, part_new_token_id
|
277 |
+
)
|
278 |
+
for k, v in part_states.items()
|
279 |
+
},
|
280 |
+
part_new_token_id,
|
281 |
+
),
|
282 |
+
)
|
283 |
+
)
|
284 |
+
return self.batchfy(best_hyps)
|
285 |
+
|
286 |
+
def post_process(
|
287 |
+
self,
|
288 |
+
i: int,
|
289 |
+
maxlen: int,
|
290 |
+
maxlenratio: float,
|
291 |
+
running_hyps: BatchHypothesis,
|
292 |
+
ended_hyps: List[Hypothesis],
|
293 |
+
) -> BatchHypothesis:
|
294 |
+
"""Perform post-processing of beam search iterations.
|
295 |
+
|
296 |
+
Args:
|
297 |
+
i (int): The length of hypothesis tokens.
|
298 |
+
maxlen (int): The maximum length of tokens in beam search.
|
299 |
+
maxlenratio (int): The maximum length ratio in beam search.
|
300 |
+
running_hyps (BatchHypothesis): The running hypotheses in beam search.
|
301 |
+
ended_hyps (List[Hypothesis]): The ended hypotheses in beam search.
|
302 |
+
|
303 |
+
Returns:
|
304 |
+
BatchHypothesis: The new running hypotheses.
|
305 |
+
|
306 |
+
"""
|
307 |
+
n_batch = running_hyps.yseq.shape[0]
|
308 |
+
logging.debug(f"the number of running hypothes: {n_batch}")
|
309 |
+
if self.token_list is not None:
|
310 |
+
logging.debug(
|
311 |
+
"best hypo: "
|
312 |
+
+ "".join(
|
313 |
+
[
|
314 |
+
self.token_list[x]
|
315 |
+
for x in running_hyps.yseq[0, 1 : running_hyps.length[0]]
|
316 |
+
]
|
317 |
+
)
|
318 |
+
)
|
319 |
+
# add eos in the final loop to avoid that there are no ended hyps
|
320 |
+
if i == maxlen - 1:
|
321 |
+
logging.info("adding <eos> in the last position in the loop")
|
322 |
+
yseq_eos = torch.cat(
|
323 |
+
(
|
324 |
+
running_hyps.yseq,
|
325 |
+
torch.full(
|
326 |
+
(n_batch, 1),
|
327 |
+
self.eos,
|
328 |
+
device=running_hyps.yseq.device,
|
329 |
+
dtype=torch.int64,
|
330 |
+
),
|
331 |
+
),
|
332 |
+
1,
|
333 |
+
)
|
334 |
+
running_hyps.yseq.resize_as_(yseq_eos)
|
335 |
+
running_hyps.yseq[:] = yseq_eos
|
336 |
+
running_hyps.length[:] = yseq_eos.shape[1]
|
337 |
+
|
338 |
+
# add ended hypotheses to a final list, and remove them from the current hypotheses
|
339 |
+
# (this can be a problem: the number of hyps becomes smaller than the beam size)
|
340 |
+
is_eos = (
|
341 |
+
running_hyps.yseq[torch.arange(n_batch), running_hyps.length - 1]
|
342 |
+
== self.eos
|
343 |
+
)
|
344 |
+
for b in torch.nonzero(is_eos).view(-1):
|
345 |
+
hyp = self._select(running_hyps, b)
|
346 |
+
ended_hyps.append(hyp)
|
347 |
+
remained_ids = torch.nonzero(is_eos == 0).view(-1)
|
348 |
+
return self._batch_select(running_hyps, remained_ids)
|
espnet/nets/batch_beam_search_online_sim.py
ADDED
@@ -0,0 +1,270 @@
1 |
+
"""Parallel beam search module for online simulation."""
|
2 |
+
|
3 |
+
import logging
|
4 |
+
from pathlib import Path
|
5 |
+
from typing import List
|
6 |
+
|
7 |
+
import yaml
|
8 |
+
|
9 |
+
import torch
|
10 |
+
|
11 |
+
from espnet.nets.batch_beam_search import BatchBeamSearch
|
12 |
+
from espnet.nets.beam_search import Hypothesis
|
13 |
+
from espnet.nets.e2e_asr_common import end_detect
|
14 |
+
|
15 |
+
|
16 |
+
class BatchBeamSearchOnlineSim(BatchBeamSearch):
|
17 |
+
"""Online beam search implementation.
|
18 |
+
|
19 |
+
This simulates streaming decoding.
|
20 |
+
It requires the encoded features of the entire utterance and
|
21 |
+
extracts it block by block, as would be done
|
22 |
+
in streaming processing.
|
23 |
+
This is based on Tsunoo et al, "STREAMING TRANSFORMER ASR
|
24 |
+
WITH BLOCKWISE SYNCHRONOUS BEAM SEARCH"
|
25 |
+
(https://arxiv.org/abs/2006.14941).
|
26 |
+
"""
|
27 |
+
|
28 |
+
def set_streaming_config(self, asr_config: str):
|
29 |
+
"""Set config file for streaming decoding.
|
30 |
+
|
31 |
+
Args:
|
32 |
+
asr_config (str): The config file for asr training
|
33 |
+
|
34 |
+
"""
|
35 |
+
train_config_file = Path(asr_config)
|
36 |
+
self.block_size = None
|
37 |
+
self.hop_size = None
|
38 |
+
self.look_ahead = None
|
39 |
+
config = None
|
40 |
+
with train_config_file.open("r", encoding="utf-8") as f:
|
41 |
+
args = yaml.safe_load(f)
|
42 |
+
if "encoder_conf" in args.keys():
|
43 |
+
if "block_size" in args["encoder_conf"].keys():
|
44 |
+
self.block_size = args["encoder_conf"]["block_size"]
|
45 |
+
if "hop_size" in args["encoder_conf"].keys():
|
46 |
+
self.hop_size = args["encoder_conf"]["hop_size"]
|
47 |
+
if "look_ahead" in args["encoder_conf"].keys():
|
48 |
+
self.look_ahead = args["encoder_conf"]["look_ahead"]
|
49 |
+
elif "config" in args.keys():
|
50 |
+
config = args["config"]
|
51 |
+
if config is None:
|
52 |
+
logging.info(
|
53 |
+
"Cannot find config file for streaming decoding: "
|
54 |
+
+ "apply batch beam search instead."
|
55 |
+
)
|
56 |
+
return
|
57 |
+
if (
|
58 |
+
self.block_size is None or self.hop_size is None or self.look_ahead is None
|
59 |
+
) and config is not None:
|
60 |
+
config_file = Path(config)
|
61 |
+
with config_file.open("r", encoding="utf-8") as f:
|
62 |
+
args = yaml.safe_load(f)
|
63 |
+
if "encoder_conf" in args.keys():
|
64 |
+
enc_args = args["encoder_conf"]
|
65 |
+
if enc_args and "block_size" in enc_args:
|
66 |
+
self.block_size = enc_args["block_size"]
|
67 |
+
if enc_args and "hop_size" in enc_args:
|
68 |
+
self.hop_size = enc_args["hop_size"]
|
69 |
+
if enc_args and "look_ahead" in enc_args:
|
70 |
+
self.look_ahead = enc_args["look_ahead"]
|
71 |
+
|
72 |
+
def set_block_size(self, block_size: int):
|
73 |
+
"""Set block size for streaming decoding.
|
74 |
+
|
75 |
+
Args:
|
76 |
+
block_size (int): The block size of encoder
|
77 |
+
"""
|
78 |
+
self.block_size = block_size
|
79 |
+
|
80 |
+
def set_hop_size(self, hop_size: int):
|
81 |
+
"""Set hop size for streaming decoding.
|
82 |
+
|
83 |
+
Args:
|
84 |
+
hop_size (int): The hop size of encoder
|
85 |
+
"""
|
86 |
+
self.hop_size = hop_size
|
87 |
+
|
88 |
+
def set_look_ahead(self, look_ahead: int):
|
89 |
+
"""Set look ahead size for streaming decoding.
|
90 |
+
|
91 |
+
Args:
|
92 |
+
look_ahead (int): The look ahead size of encoder
|
93 |
+
"""
|
94 |
+
self.look_ahead = look_ahead
|
95 |
+
|
96 |
+
def forward(
|
97 |
+
self, x: torch.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0
|
98 |
+
) -> List[Hypothesis]:
|
99 |
+
"""Perform beam search.
|
100 |
+
|
101 |
+
Args:
|
102 |
+
x (torch.Tensor): Encoded speech feature (T, D)
|
103 |
+
maxlenratio (float): Input length ratio to obtain max output length.
|
104 |
+
If maxlenratio=0.0 (default), it uses a end-detect function
|
105 |
+
to automatically find maximum hypothesis lengths
|
106 |
+
minlenratio (float): Input length ratio to obtain min output length.
|
107 |
+
|
108 |
+
Returns:
|
109 |
+
list[Hypothesis]: N-best decoding results
|
110 |
+
|
111 |
+
"""
|
112 |
+
self.conservative = True # always true
|
113 |
+
|
114 |
+
if self.block_size and self.hop_size and self.look_ahead:
|
115 |
+
cur_end_frame = int(self.block_size - self.look_ahead)
|
116 |
+
else:
|
117 |
+
cur_end_frame = x.shape[0]
|
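# Example of the block arithmetic: with block_size=40, hop_size=16 and look_ahead=16,
# decoding starts on frames [0, 24) and each move to the next block extends the
# visible encoder output by hop_size frames until the whole utterance is covered.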
118 |
+
process_idx = 0
|
119 |
+
if cur_end_frame < x.shape[0]:
|
120 |
+
h = x.narrow(0, 0, cur_end_frame)
|
121 |
+
else:
|
122 |
+
h = x
|
123 |
+
|
124 |
+
# set length bounds
|
125 |
+
if maxlenratio == 0:
|
126 |
+
maxlen = x.shape[0]
|
127 |
+
else:
|
128 |
+
maxlen = max(1, int(maxlenratio * x.size(0)))
|
129 |
+
minlen = int(minlenratio * x.size(0))
|
130 |
+
logging.info("decoder input length: " + str(x.shape[0]))
|
131 |
+
logging.info("max output length: " + str(maxlen))
|
132 |
+
logging.info("min output length: " + str(minlen))
|
133 |
+
|
134 |
+
# main loop of prefix search
|
135 |
+
running_hyps = self.init_hyp(h)
|
136 |
+
prev_hyps = []
|
137 |
+
ended_hyps = []
|
138 |
+
prev_repeat = False
|
139 |
+
|
140 |
+
continue_decode = True
|
141 |
+
|
142 |
+
while continue_decode:
|
143 |
+
move_to_next_block = False
|
144 |
+
if cur_end_frame < x.shape[0]:
|
145 |
+
h = x.narrow(0, 0, cur_end_frame)
|
146 |
+
else:
|
147 |
+
h = x
|
148 |
+
|
149 |
+
# extend states for ctc
|
150 |
+
self.extend(h, running_hyps)
|
151 |
+
|
152 |
+
while process_idx < maxlen:
|
153 |
+
logging.debug("position " + str(process_idx))
|
154 |
+
best = self.search(running_hyps, h)
|
155 |
+
|
156 |
+
if process_idx == maxlen - 1:
|
157 |
+
# end decoding
|
158 |
+
running_hyps = self.post_process(
|
159 |
+
process_idx, maxlen, maxlenratio, best, ended_hyps
|
160 |
+
)
|
161 |
+
n_batch = best.yseq.shape[0]
|
162 |
+
local_ended_hyps = []
|
163 |
+
is_local_eos = (
|
164 |
+
best.yseq[torch.arange(n_batch), best.length - 1] == self.eos
|
165 |
+
)
|
166 |
+
for i in range(is_local_eos.shape[0]):
|
167 |
+
if is_local_eos[i]:
|
168 |
+
hyp = self._select(best, i)
|
169 |
+
local_ended_hyps.append(hyp)
|
170 |
+
# NOTE(tsunoo): check repetitions here
|
171 |
+
# This is an implicit implementation of
|
172 |
+
# Eq (11) in https://arxiv.org/abs/2006.14941
|
173 |
+
# A flag prev_repeat is used instead of using set
|
174 |
+
elif (
|
175 |
+
not prev_repeat
|
176 |
+
and best.yseq[i, -1] in best.yseq[i, :-1]
|
177 |
+
and cur_end_frame < x.shape[0]
|
178 |
+
):
|
179 |
+
move_to_next_block = True
|
180 |
+
prev_repeat = True
|
181 |
+
if maxlenratio == 0.0 and end_detect(
|
182 |
+
[lh.asdict() for lh in local_ended_hyps], process_idx
|
183 |
+
):
|
184 |
+
logging.info(f"end detected at {process_idx}")
|
185 |
+
continue_decode = False
|
186 |
+
break
|
187 |
+
if len(local_ended_hyps) > 0 and cur_end_frame < x.shape[0]:
|
188 |
+
move_to_next_block = True
|
189 |
+
|
190 |
+
if move_to_next_block:
|
191 |
+
if (
|
192 |
+
self.hop_size
|
193 |
+
and cur_end_frame + int(self.hop_size) + int(self.look_ahead)
|
194 |
+
< x.shape[0]
|
195 |
+
):
|
196 |
+
cur_end_frame += int(self.hop_size)
|
197 |
+
else:
|
198 |
+
cur_end_frame = x.shape[0]
|
199 |
+
logging.debug("Going to next block: %d", cur_end_frame)
|
200 |
+
if process_idx > 1 and len(prev_hyps) > 0 and self.conservative:
|
201 |
+
running_hyps = prev_hyps
|
202 |
+
process_idx -= 1
|
203 |
+
prev_hyps = []
|
204 |
+
break
|
205 |
+
|
206 |
+
prev_repeat = False
|
207 |
+
prev_hyps = running_hyps
|
208 |
+
running_hyps = self.post_process(
|
209 |
+
process_idx, maxlen, maxlenratio, best, ended_hyps
|
210 |
+
)
|
211 |
+
|
212 |
+
if cur_end_frame >= x.shape[0]:
|
213 |
+
for hyp in local_ended_hyps:
|
214 |
+
ended_hyps.append(hyp)
|
215 |
+
|
216 |
+
if len(running_hyps) == 0:
|
217 |
+
logging.info("no hypothesis. Finish decoding.")
|
218 |
+
continue_decode = False
|
219 |
+
break
|
220 |
+
else:
|
221 |
+
logging.debug(f"remained hypotheses: {len(running_hyps)}")
|
222 |
+
# increment number
|
223 |
+
process_idx += 1
|
224 |
+
|
225 |
+
nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True)
|
226 |
+
# check the number of hypotheses reaching to eos
|
227 |
+
if len(nbest_hyps) == 0:
|
228 |
+
logging.warning(
|
229 |
+
"there is no N-best results, perform recognition "
|
230 |
+
"again with smaller minlenratio."
|
231 |
+
)
|
232 |
+
return (
|
233 |
+
[]
|
234 |
+
if minlenratio < 0.1
|
235 |
+
else self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1))
|
236 |
+
)
|
237 |
+
|
238 |
+
# report the best result
|
239 |
+
best = nbest_hyps[0]
|
240 |
+
for k, v in best.scores.items():
|
241 |
+
logging.info(
|
242 |
+
f"{v:6.2f} * {self.weights[k]:3} = {v * self.weights[k]:6.2f} for {k}"
|
243 |
+
)
|
244 |
+
logging.info(f"total log probability: {best.score:.2f}")
|
245 |
+
logging.info(f"normalized log probability: {best.score / len(best.yseq):.2f}")
|
246 |
+
logging.info(f"total number of ended hypotheses: {len(nbest_hyps)}")
|
247 |
+
if self.token_list is not None:
|
248 |
+
logging.info(
|
249 |
+
"best hypo: "
|
250 |
+
+ "".join([self.token_list[x] for x in best.yseq[1:-1]])
|
251 |
+
+ "\n"
|
252 |
+
)
|
253 |
+
return nbest_hyps
|
254 |
+
|
255 |
+
def extend(self, x: torch.Tensor, hyps: Hypothesis) -> List[Hypothesis]:
|
256 |
+
"""Extend probabilities and states with more encoded chunks.
|
257 |
+
|
258 |
+
Args:
|
259 |
+
x (torch.Tensor): The extended encoder output feature
|
260 |
+
hyps (Hypothesis): Current list of hypothesis
|
261 |
+
|
262 |
+
Returns:
|
263 |
+
Hypothesis: The extended hypothesis
|
264 |
+
|
265 |
+
"""
|
266 |
+
for k, d in self.scorers.items():
|
267 |
+
if hasattr(d, "extend_prob"):
|
268 |
+
d.extend_prob(x)
|
269 |
+
if hasattr(d, "extend_state"):
|
270 |
+
hyps.states[k] = d.extend_state(hyps.states[k])
|
espnet/nets/beam_search.py
ADDED
@@ -0,0 +1,512 @@
1 |
+
"""Beam search module."""
|
2 |
+
|
3 |
+
from itertools import chain
|
4 |
+
import logging
|
5 |
+
from typing import Any
|
6 |
+
from typing import Dict
|
7 |
+
from typing import List
|
8 |
+
from typing import NamedTuple
|
9 |
+
from typing import Tuple
|
10 |
+
from typing import Union
|
11 |
+
|
12 |
+
import torch
|
13 |
+
|
14 |
+
from espnet.nets.e2e_asr_common import end_detect
|
15 |
+
from espnet.nets.scorer_interface import PartialScorerInterface
|
16 |
+
from espnet.nets.scorer_interface import ScorerInterface
|
17 |
+
|
18 |
+
|
19 |
+
class Hypothesis(NamedTuple):
|
20 |
+
"""Hypothesis data type."""
|
21 |
+
|
22 |
+
yseq: torch.Tensor
|
23 |
+
score: Union[float, torch.Tensor] = 0
|
24 |
+
scores: Dict[str, Union[float, torch.Tensor]] = dict()
|
25 |
+
states: Dict[str, Any] = dict()
|
26 |
+
|
27 |
+
def asdict(self) -> dict:
|
28 |
+
"""Convert data to JSON-friendly dict."""
|
29 |
+
return self._replace(
|
30 |
+
yseq=self.yseq.tolist(),
|
31 |
+
score=float(self.score),
|
32 |
+
scores={k: float(v) for k, v in self.scores.items()},
|
33 |
+
)._asdict()
|
34 |
+
|
35 |
+
|
36 |
+
class BeamSearch(torch.nn.Module):
|
37 |
+
"""Beam search implementation."""
|
38 |
+
|
39 |
+
def __init__(
|
40 |
+
self,
|
41 |
+
scorers: Dict[str, ScorerInterface],
|
42 |
+
weights: Dict[str, float],
|
43 |
+
beam_size: int,
|
44 |
+
vocab_size: int,
|
45 |
+
sos: int,
|
46 |
+
eos: int,
|
47 |
+
token_list: List[str] = None,
|
48 |
+
pre_beam_ratio: float = 1.5,
|
49 |
+
pre_beam_score_key: str = None,
|
50 |
+
):
|
51 |
+
"""Initialize beam search.
|
52 |
+
|
53 |
+
Args:
|
54 |
+
scorers (dict[str, ScorerInterface]): Dict of decoder modules
|
55 |
+
e.g., Decoder, CTCPrefixScorer, LM
|
56 |
+
The scorer will be ignored if it is `None`
|
57 |
+
weights (dict[str, float]): Dict of weights for each scorers
|
58 |
+
The scorer will be ignored if its weight is 0
|
59 |
+
beam_size (int): The number of hypotheses kept during search
|
60 |
+
vocab_size (int): The number of vocabulary
|
61 |
+
sos (int): Start of sequence id
|
62 |
+
eos (int): End of sequence id
|
63 |
+
token_list (list[str]): List of tokens for debug log
|
64 |
+
pre_beam_score_key (str): key of scores to perform pre-beam search
|
65 |
+
pre_beam_ratio (float): beam size in the pre-beam search
|
66 |
+
will be `int(pre_beam_ratio * beam_size)`
|
67 |
+
|
68 |
+
"""
|
69 |
+
super().__init__()
|
70 |
+
# set scorers
|
71 |
+
self.weights = weights
|
72 |
+
self.scorers = dict()
|
73 |
+
self.full_scorers = dict()
|
74 |
+
self.part_scorers = dict()
|
75 |
+
# this module dict is required for recursive cast
|
76 |
+
# `self.to(device, dtype)` in `recog.py`
|
77 |
+
self.nn_dict = torch.nn.ModuleDict()
|
78 |
+
for k, v in scorers.items():
|
79 |
+
w = weights.get(k, 0)
|
80 |
+
if w == 0 or v is None:
|
81 |
+
continue
|
82 |
+
assert isinstance(
|
83 |
+
v, ScorerInterface
|
84 |
+
), f"{k} ({type(v)}) does not implement ScorerInterface"
|
85 |
+
self.scorers[k] = v
|
86 |
+
if isinstance(v, PartialScorerInterface):
|
87 |
+
self.part_scorers[k] = v
|
88 |
+
else:
|
89 |
+
self.full_scorers[k] = v
|
90 |
+
if isinstance(v, torch.nn.Module):
|
91 |
+
self.nn_dict[k] = v
|
92 |
+
|
93 |
+
# set configurations
|
94 |
+
self.sos = sos
|
95 |
+
self.eos = eos
|
96 |
+
self.token_list = token_list
|
97 |
+
self.pre_beam_size = int(pre_beam_ratio * beam_size)
|
98 |
+
self.beam_size = beam_size
|
99 |
+
self.n_vocab = vocab_size
|
100 |
+
if (
|
101 |
+
pre_beam_score_key is not None
|
102 |
+
and pre_beam_score_key != "full"
|
103 |
+
and pre_beam_score_key not in self.full_scorers
|
104 |
+
):
|
105 |
+
raise KeyError(f"{pre_beam_score_key} is not found in {self.full_scorers}")
|
106 |
+
self.pre_beam_score_key = pre_beam_score_key
|
107 |
+
self.do_pre_beam = (
|
108 |
+
self.pre_beam_score_key is not None
|
109 |
+
and self.pre_beam_size < self.n_vocab
|
110 |
+
and len(self.part_scorers) > 0
|
111 |
+
)
|
112 |
+
|
113 |
+
def init_hyp(self, x: torch.Tensor) -> List[Hypothesis]:
|
114 |
+
"""Get an initial hypothesis data.
|
115 |
+
|
116 |
+
Args:
|
117 |
+
x (torch.Tensor): The encoder output feature
|
118 |
+
|
119 |
+
Returns:
|
120 |
+
Hypothesis: The initial hypothesis.
|
121 |
+
|
122 |
+
"""
|
123 |
+
init_states = dict()
|
124 |
+
init_scores = dict()
|
125 |
+
for k, d in self.scorers.items():
|
126 |
+
init_states[k] = d.init_state(x)
|
127 |
+
init_scores[k] = 0.0
|
128 |
+
return [
|
129 |
+
Hypothesis(
|
130 |
+
score=0.0,
|
131 |
+
scores=init_scores,
|
132 |
+
states=init_states,
|
133 |
+
yseq=torch.tensor([self.sos], device=x.device),
|
134 |
+
)
|
135 |
+
]
|
136 |
+
|
137 |
+
@staticmethod
|
138 |
+
def append_token(xs: torch.Tensor, x: int) -> torch.Tensor:
|
139 |
+
"""Append new token to prefix tokens.
|
140 |
+
|
141 |
+
Args:
|
142 |
+
xs (torch.Tensor): The prefix token
|
143 |
+
x (int): The new token to append
|
144 |
+
|
145 |
+
Returns:
|
146 |
+
torch.Tensor: New tensor contains: xs + [x] with xs.dtype and xs.device
|
147 |
+
|
148 |
+
"""
|
149 |
+
x = torch.tensor([x], dtype=xs.dtype, device=xs.device)
|
150 |
+
return torch.cat((xs, x))
|
151 |
+
|
152 |
+
def score_full(
|
153 |
+
self, hyp: Hypothesis, x: torch.Tensor
|
154 |
+
) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
|
155 |
+
"""Score new hypothesis by `self.full_scorers`.
|
156 |
+
|
157 |
+
Args:
|
158 |
+
hyp (Hypothesis): Hypothesis with prefix tokens to score
|
159 |
+
x (torch.Tensor): Corresponding input feature
|
160 |
+
|
161 |
+
Returns:
|
162 |
+
Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
|
163 |
+
score dict of `hyp` that has string keys of `self.full_scorers`
|
164 |
+
and tensor score values of shape: `(self.n_vocab,)`,
|
165 |
+
and state dict that has string keys
|
166 |
+
and state values of `self.full_scorers`
|
167 |
+
|
168 |
+
"""
|
169 |
+
scores = dict()
|
170 |
+
states = dict()
|
171 |
+
for k, d in self.full_scorers.items():
|
172 |
+
scores[k], states[k] = d.score(hyp.yseq, hyp.states[k], x)
|
173 |
+
return scores, states
|
174 |
+
|
175 |
+
def score_partial(
|
176 |
+
self, hyp: Hypothesis, ids: torch.Tensor, x: torch.Tensor
|
177 |
+
) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
|
178 |
+
"""Score new hypothesis by `self.part_scorers`.
|
179 |
+
|
180 |
+
Args:
|
181 |
+
hyp (Hypothesis): Hypothesis with prefix tokens to score
|
182 |
+
ids (torch.Tensor): 1D tensor of new partial tokens to score
|
183 |
+
x (torch.Tensor): Corresponding input feature
|
184 |
+
|
185 |
+
Returns:
|
186 |
+
Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
|
187 |
+
score dict of `hyp` that has string keys of `self.part_scorers`
|
188 |
+
and tensor score values of shape: `(len(ids),)`,
|
189 |
+
and state dict that has string keys
|
190 |
+
and state values of `self.part_scorers`
|
191 |
+
|
192 |
+
"""
|
193 |
+
scores = dict()
|
194 |
+
states = dict()
|
195 |
+
for k, d in self.part_scorers.items():
|
196 |
+
scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k], x)
|
197 |
+
return scores, states
|
198 |
+
|
199 |
+
def beam(
|
200 |
+
self, weighted_scores: torch.Tensor, ids: torch.Tensor
|
201 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
202 |
+
"""Compute topk full token ids and partial token ids.
|
203 |
+
|
204 |
+
Args:
|
205 |
+
weighted_scores (torch.Tensor): The weighted sum scores for each token.
|
206 |
+
Its shape is `(self.n_vocab,)`.
|
207 |
+
ids (torch.Tensor): The partial token ids to compute topk
|
208 |
+
|
209 |
+
Returns:
|
210 |
+
Tuple[torch.Tensor, torch.Tensor]:
|
211 |
+
The topk full token ids and partial token ids.
|
212 |
+
Their shapes are `(self.beam_size,)`
|
213 |
+
|
214 |
+
"""
|
215 |
+
# no pre beam performed
|
216 |
+
if weighted_scores.size(0) == ids.size(0):
|
217 |
+
top_ids = weighted_scores.topk(self.beam_size)[1]
|
218 |
+
return top_ids, top_ids
|
219 |
+
|
220 |
+
# mask pruned in pre-beam not to select in topk
|
221 |
+
tmp = weighted_scores[ids]
|
222 |
+
weighted_scores[:] = -float("inf")
|
223 |
+
weighted_scores[ids] = tmp
|
224 |
+
top_ids = weighted_scores.topk(self.beam_size)[1]
|
225 |
+
local_ids = weighted_scores[ids].topk(self.beam_size)[1]
|
226 |
+
return top_ids, local_ids
|
227 |
+
|
228 |
+
@staticmethod
|
229 |
+
def merge_scores(
|
230 |
+
prev_scores: Dict[str, float],
|
231 |
+
next_full_scores: Dict[str, torch.Tensor],
|
232 |
+
full_idx: int,
|
233 |
+
next_part_scores: Dict[str, torch.Tensor],
|
234 |
+
part_idx: int,
|
235 |
+
) -> Dict[str, torch.Tensor]:
|
236 |
+
"""Merge scores for new hypothesis.
|
237 |
+
|
238 |
+
Args:
|
239 |
+
prev_scores (Dict[str, float]):
|
240 |
+
The previous hypothesis scores by `self.scorers`
|
241 |
+
next_full_scores (Dict[str, torch.Tensor]): scores by `self.full_scorers`
|
242 |
+
full_idx (int): The next token id for `next_full_scores`
|
243 |
+
next_part_scores (Dict[str, torch.Tensor]):
|
244 |
+
scores of partial tokens by `self.part_scorers`
|
245 |
+
part_idx (int): The new token id for `next_part_scores`
|
246 |
+
|
247 |
+
Returns:
|
248 |
+
Dict[str, torch.Tensor]: The new score dict.
|
249 |
+
Its keys are names of `self.full_scorers` and `self.part_scorers`.
|
250 |
+
Its values are scalar tensors by the scorers.
|
251 |
+
|
252 |
+
"""
|
253 |
+
new_scores = dict()
|
254 |
+
for k, v in next_full_scores.items():
|
255 |
+
new_scores[k] = prev_scores[k] + v[full_idx]
|
256 |
+
for k, v in next_part_scores.items():
|
257 |
+
new_scores[k] = prev_scores[k] + v[part_idx]
|
258 |
+
return new_scores
|
259 |
+
|
260 |
+
def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any:
|
261 |
+
"""Merge states for new hypothesis.
|
262 |
+
|
263 |
+
Args:
|
264 |
+
states: states of `self.full_scorers`
|
265 |
+
part_states: states of `self.part_scorers`
|
266 |
+
part_idx (int): The new token id for `part_scores`
|
267 |
+
|
268 |
+
Returns:
|
269 |
+
Dict[str, Any]: The new state dict.
|
270 |
+
Its keys are names of `self.full_scorers` and `self.part_scorers`.
|
271 |
+
Its values are states of the scorers.
|
272 |
+
|
273 |
+
"""
|
274 |
+
new_states = dict()
|
275 |
+
for k, v in states.items():
|
276 |
+
new_states[k] = v
|
277 |
+
for k, d in self.part_scorers.items():
|
278 |
+
new_states[k] = d.select_state(part_states[k], part_idx)
|
279 |
+
return new_states
|
280 |
+
|
281 |
+
def search(
|
282 |
+
self, running_hyps: List[Hypothesis], x: torch.Tensor
|
283 |
+
) -> List[Hypothesis]:
|
284 |
+
"""Search new tokens for running hypotheses and encoded speech x.
|
285 |
+
|
286 |
+
Args:
|
287 |
+
running_hyps (List[Hypothesis]): Running hypotheses on beam
|
288 |
+
x (torch.Tensor): Encoded speech feature (T, D)
|
289 |
+
|
290 |
+
Returns:
|
291 |
+
List[Hypothesis]: Best sorted hypotheses
|
292 |
+
|
293 |
+
"""
|
294 |
+
best_hyps = []
|
295 |
+
part_ids = torch.arange(self.n_vocab, device=x.device) # no pre-beam
|
296 |
+
for hyp in running_hyps:
|
297 |
+
# scoring
|
298 |
+
weighted_scores = torch.zeros(self.n_vocab, dtype=x.dtype, device=x.device)
|
299 |
+
scores, states = self.score_full(hyp, x)
|
300 |
+
for k in self.full_scorers:
|
301 |
+
weighted_scores += self.weights[k] * scores[k]
|
302 |
+
# partial scoring
|
303 |
+
if self.do_pre_beam:
|
304 |
+
pre_beam_scores = (
|
305 |
+
weighted_scores
|
306 |
+
if self.pre_beam_score_key == "full"
|
307 |
+
else scores[self.pre_beam_score_key]
|
308 |
+
)
|
309 |
+
part_ids = torch.topk(pre_beam_scores, self.pre_beam_size)[1]
|
310 |
+
part_scores, part_states = self.score_partial(hyp, part_ids, x)
|
311 |
+
for k in self.part_scorers:
|
312 |
+
weighted_scores[part_ids] += self.weights[k] * part_scores[k]
|
313 |
+
# add previous hyp score
|
314 |
+
weighted_scores += hyp.score
|
315 |
+
|
316 |
+
# update hyps
|
317 |
+
for j, part_j in zip(*self.beam(weighted_scores, part_ids)):
|
318 |
+
# will be (2 x beam at most)
|
319 |
+
best_hyps.append(
|
320 |
+
Hypothesis(
|
321 |
+
score=weighted_scores[j],
|
322 |
+
yseq=self.append_token(hyp.yseq, j),
|
323 |
+
scores=self.merge_scores(
|
324 |
+
hyp.scores, scores, j, part_scores, part_j
|
325 |
+
),
|
326 |
+
states=self.merge_states(states, part_states, part_j),
|
327 |
+
)
|
328 |
+
)
|
329 |
+
|
330 |
+
# sort and prune 2 x beam -> beam
|
331 |
+
best_hyps = sorted(best_hyps, key=lambda x: x.score, reverse=True)[
|
332 |
+
: min(len(best_hyps), self.beam_size)
|
333 |
+
]
|
334 |
+
return best_hyps
|
335 |
+
|
336 |
+
def forward(
|
337 |
+
self, x: torch.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0
|
338 |
+
) -> List[Hypothesis]:
|
339 |
+
"""Perform beam search.
|
340 |
+
|
341 |
+
Args:
|
342 |
+
x (torch.Tensor): Encoded speech feature (T, D)
|
343 |
+
maxlenratio (float): Input length ratio to obtain max output length.
|
344 |
+
If maxlenratio=0.0 (default), it uses an end-detect function
|
345 |
+
to automatically find maximum hypothesis lengths
|
346 |
+
minlenratio (float): Input length ratio to obtain min output length.
|
347 |
+
|
348 |
+
Returns:
|
349 |
+
list[Hypothesis]: N-best decoding results
|
350 |
+
|
351 |
+
"""
|
352 |
+
# set length bounds
|
353 |
+
if maxlenratio == 0:
|
354 |
+
maxlen = x.shape[0]
|
355 |
+
else:
|
356 |
+
maxlen = max(1, int(maxlenratio * x.size(0)))
|
357 |
+
minlen = int(minlenratio * x.size(0))
|
358 |
+
logging.info("decoder input length: " + str(x.shape[0]))
|
359 |
+
logging.info("max output length: " + str(maxlen))
|
360 |
+
logging.info("min output length: " + str(minlen))
|
361 |
+
|
362 |
+
# main loop of prefix search
|
363 |
+
running_hyps = self.init_hyp(x)
|
364 |
+
ended_hyps = []
|
365 |
+
for i in range(maxlen):
|
366 |
+
logging.debug("position " + str(i))
|
367 |
+
best = self.search(running_hyps, x)
|
368 |
+
# post process of one iteration
|
369 |
+
running_hyps = self.post_process(i, maxlen, maxlenratio, best, ended_hyps)
|
370 |
+
# end detection
|
371 |
+
if maxlenratio == 0.0 and end_detect([h.asdict() for h in ended_hyps], i):
|
372 |
+
logging.info(f"end detected at {i}")
|
373 |
+
break
|
374 |
+
if len(running_hyps) == 0:
|
375 |
+
logging.info("no hypothesis. Finish decoding.")
|
376 |
+
break
|
377 |
+
else:
|
378 |
+
logging.debug(f"remained hypotheses: {len(running_hyps)}")
|
379 |
+
|
380 |
+
nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True)
|
381 |
+
# check the number of hypotheses reaching to eos
|
382 |
+
if len(nbest_hyps) == 0:
|
383 |
+
logging.warning(
|
384 |
+
"there is no N-best results, perform recognition "
|
385 |
+
"again with smaller minlenratio."
|
386 |
+
)
|
387 |
+
return (
|
388 |
+
[]
|
389 |
+
if minlenratio < 0.1
|
390 |
+
else self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1))
|
391 |
+
)
|
392 |
+
|
393 |
+
# report the best result
|
394 |
+
best = nbest_hyps[0]
|
395 |
+
for k, v in best.scores.items():
|
396 |
+
logging.info(
|
397 |
+
f"{v:6.2f} * {self.weights[k]:3} = {v * self.weights[k]:6.2f} for {k}"
|
398 |
+
)
|
399 |
+
logging.info(f"total log probability: {best.score:.2f}")
|
400 |
+
logging.info(f"normalized log probability: {best.score / len(best.yseq):.2f}")
|
401 |
+
logging.info(f"total number of ended hypotheses: {len(nbest_hyps)}")
|
402 |
+
if self.token_list is not None:
|
403 |
+
logging.info(
|
404 |
+
"best hypo: "
|
405 |
+
+ "".join([self.token_list[x] for x in best.yseq[1:-1]])
|
406 |
+
+ "\n"
|
407 |
+
)
|
408 |
+
return nbest_hyps
|
409 |
+
|
410 |
+
def post_process(
|
411 |
+
self,
|
412 |
+
i: int,
|
413 |
+
maxlen: int,
|
414 |
+
maxlenratio: float,
|
415 |
+
running_hyps: List[Hypothesis],
|
416 |
+
ended_hyps: List[Hypothesis],
|
417 |
+
) -> List[Hypothesis]:
|
418 |
+
"""Perform post-processing of beam search iterations.
|
419 |
+
|
420 |
+
Args:
|
421 |
+
i (int): The length of hypothesis tokens.
|
422 |
+
maxlen (int): The maximum length of tokens in beam search.
|
423 |
+
maxlenratio (float): The maximum length ratio in beam search.
|
424 |
+
running_hyps (List[Hypothesis]): The running hypotheses in beam search.
|
425 |
+
ended_hyps (List[Hypothesis]): The ended hypotheses in beam search.
|
426 |
+
|
427 |
+
Returns:
|
428 |
+
List[Hypothesis]: The new running hypotheses.
|
429 |
+
|
430 |
+
"""
|
431 |
+
logging.debug(f"the number of running hypotheses: {len(running_hyps)}")
|
432 |
+
if self.token_list is not None:
|
433 |
+
logging.debug(
|
434 |
+
"best hypo: "
|
435 |
+
+ "".join([self.token_list[x] for x in running_hyps[0].yseq[1:]])
|
436 |
+
)
|
437 |
+
# add eos in the final loop to avoid that there are no ended hyps
|
438 |
+
if i == maxlen - 1:
|
439 |
+
logging.info("adding <eos> in the last position in the loop")
|
440 |
+
running_hyps = [
|
441 |
+
h._replace(yseq=self.append_token(h.yseq, self.eos))
|
442 |
+
for h in running_hyps
|
443 |
+
]
|
444 |
+
|
445 |
+
# add ended hypotheses to a final list, and remove them from current hypotheses
|
446 |
+
# (this can be a problem: the number of hyps may drop below the beam size)
|
447 |
+
remained_hyps = []
|
448 |
+
for hyp in running_hyps:
|
449 |
+
if hyp.yseq[-1] == self.eos:
|
450 |
+
# e.g., Word LM needs to add final <eos> score
|
451 |
+
for k, d in chain(self.full_scorers.items(), self.part_scorers.items()):
|
452 |
+
s = d.final_score(hyp.states[k])
|
453 |
+
hyp.scores[k] += s
|
454 |
+
hyp = hyp._replace(score=hyp.score + self.weights[k] * s)
|
455 |
+
ended_hyps.append(hyp)
|
456 |
+
else:
|
457 |
+
remained_hyps.append(hyp)
|
458 |
+
return remained_hyps
|
459 |
+
|
460 |
+
|
461 |
+
def beam_search(
|
462 |
+
x: torch.Tensor,
|
463 |
+
sos: int,
|
464 |
+
eos: int,
|
465 |
+
beam_size: int,
|
466 |
+
vocab_size: int,
|
467 |
+
scorers: Dict[str, ScorerInterface],
|
468 |
+
weights: Dict[str, float],
|
469 |
+
token_list: List[str] = None,
|
470 |
+
maxlenratio: float = 0.0,
|
471 |
+
minlenratio: float = 0.0,
|
472 |
+
pre_beam_ratio: float = 1.5,
|
473 |
+
pre_beam_score_key: str = "full",
|
474 |
+
) -> list:
|
475 |
+
"""Perform beam search with scorers.
|
476 |
+
|
477 |
+
Args:
|
478 |
+
x (torch.Tensor): Encoded speech feature (T, D)
|
479 |
+
sos (int): Start of sequence id
|
480 |
+
eos (int): End of sequence id
|
481 |
+
beam_size (int): The number of hypotheses kept during search
|
482 |
+
vocab_size (int): The size of the vocabulary
|
483 |
+
scorers (dict[str, ScorerInterface]): Dict of decoder modules
|
484 |
+
e.g., Decoder, CTCPrefixScorer, LM
|
485 |
+
The scorer will be ignored if it is `None`
|
486 |
+
weights (dict[str, float]): Dict of weights for each scorers
|
487 |
+
The scorer will be ignored if its weight is 0
|
488 |
+
token_list (list[str]): List of tokens for debug log
|
489 |
+
maxlenratio (float): Input length ratio to obtain max output length.
|
490 |
+
If maxlenratio=0.0 (default), it uses an end-detect function
|
491 |
+
to automatically find maximum hypothesis lengths
|
492 |
+
minlenratio (float): Input length ratio to obtain min output length.
|
493 |
+
pre_beam_score_key (str): key of scores to perform pre-beam search
|
494 |
+
pre_beam_ratio (float): beam size in the pre-beam search
|
495 |
+
will be `int(pre_beam_ratio * beam_size)`
|
496 |
+
|
497 |
+
Returns:
|
498 |
+
list: N-best decoding results
|
499 |
+
|
500 |
+
"""
|
501 |
+
ret = BeamSearch(
|
502 |
+
scorers,
|
503 |
+
weights,
|
504 |
+
beam_size=beam_size,
|
505 |
+
vocab_size=vocab_size,
|
506 |
+
pre_beam_ratio=pre_beam_ratio,
|
507 |
+
pre_beam_score_key=pre_beam_score_key,
|
508 |
+
sos=sos,
|
509 |
+
eos=eos,
|
510 |
+
token_list=token_list,
|
511 |
+
).forward(x=x, maxlenratio=maxlenratio, minlenratio=minlenratio)
|
512 |
+
return [h.asdict() for h in ret]
|
espnet/nets/beam_search_transducer.py
ADDED
@@ -0,0 +1,629 @@
1 |
+
"""Search algorithms for transducer models."""
|
2 |
+
|
3 |
+
from typing import List
|
4 |
+
from typing import Union
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
import torch
|
8 |
+
|
9 |
+
from espnet.nets.pytorch_backend.transducer.utils import create_lm_batch_state
|
10 |
+
from espnet.nets.pytorch_backend.transducer.utils import init_lm_state
|
11 |
+
from espnet.nets.pytorch_backend.transducer.utils import is_prefix
|
12 |
+
from espnet.nets.pytorch_backend.transducer.utils import recombine_hyps
|
13 |
+
from espnet.nets.pytorch_backend.transducer.utils import select_lm_state
|
14 |
+
from espnet.nets.pytorch_backend.transducer.utils import substract
|
15 |
+
from espnet.nets.transducer_decoder_interface import Hypothesis
|
16 |
+
from espnet.nets.transducer_decoder_interface import NSCHypothesis
|
17 |
+
from espnet.nets.transducer_decoder_interface import TransducerDecoderInterface
|
18 |
+
|
19 |
+
|
20 |
+
class BeamSearchTransducer:
|
21 |
+
"""Beam search implementation for transducer."""
|
22 |
+
|
23 |
+
def __init__(
|
24 |
+
self,
|
25 |
+
decoder: Union[TransducerDecoderInterface, torch.nn.Module],
|
26 |
+
joint_network: torch.nn.Module,
|
27 |
+
beam_size: int,
|
28 |
+
lm: torch.nn.Module = None,
|
29 |
+
lm_weight: float = 0.1,
|
30 |
+
search_type: str = "default",
|
31 |
+
max_sym_exp: int = 2,
|
32 |
+
u_max: int = 50,
|
33 |
+
nstep: int = 1,
|
34 |
+
prefix_alpha: int = 1,
|
35 |
+
score_norm: bool = True,
|
36 |
+
nbest: int = 1,
|
37 |
+
):
|
38 |
+
"""Initialize transducer beam search.
|
39 |
+
|
40 |
+
Args:
|
41 |
+
decoder: Decoder class to use
|
42 |
+
joint_network: Joint Network class
|
43 |
+
beam_size: Number of hypotheses kept during search
|
44 |
+
lm: LM class to use
|
45 |
+
lm_weight: lm weight for soft fusion
|
46 |
+
search_type: type of algorithm to use for search
|
47 |
+
max_sym_exp: maximum number of symbol expansions at each time step ("tsd")
|
48 |
+
u_max: maximum output sequence length ("alsd")
|
49 |
+
nstep: maximum number of expansion steps at each time step ("nsc")
|
50 |
+
prefix_alpha: maximum prefix length in prefix search ("nsc")
|
51 |
+
score_norm: normalize final scores by length ("default")
|
52 |
+
nbest: number of returned final hypothesis
|
53 |
+
"""
|
54 |
+
self.decoder = decoder
|
55 |
+
self.joint_network = joint_network
|
56 |
+
|
57 |
+
self.beam_size = beam_size
|
58 |
+
self.hidden_size = decoder.dunits
|
59 |
+
self.vocab_size = decoder.odim
|
60 |
+
self.blank = decoder.blank
|
61 |
+
|
62 |
+
if self.beam_size <= 1:
|
63 |
+
self.search_algorithm = self.greedy_search
|
64 |
+
elif search_type == "default":
|
65 |
+
self.search_algorithm = self.default_beam_search
|
66 |
+
elif search_type == "tsd":
|
67 |
+
self.search_algorithm = self.time_sync_decoding
|
68 |
+
elif search_type == "alsd":
|
69 |
+
self.search_algorithm = self.align_length_sync_decoding
|
70 |
+
elif search_type == "nsc":
|
71 |
+
self.search_algorithm = self.nsc_beam_search
|
72 |
+
else:
|
73 |
+
raise NotImplementedError
|
74 |
+
|
75 |
+
self.lm = lm
|
76 |
+
self.lm_weight = lm_weight
|
77 |
+
|
78 |
+
if lm is not None:
|
79 |
+
self.use_lm = True
|
80 |
+
self.is_wordlm = True if hasattr(lm.predictor, "wordlm") else False
|
81 |
+
self.lm_predictor = lm.predictor.wordlm if self.is_wordlm else lm.predictor
|
82 |
+
self.lm_layers = len(self.lm_predictor.rnn)
|
83 |
+
else:
|
84 |
+
self.use_lm = False
|
85 |
+
|
86 |
+
self.max_sym_exp = max_sym_exp
|
87 |
+
self.u_max = u_max
|
88 |
+
self.nstep = nstep
|
89 |
+
self.prefix_alpha = prefix_alpha
|
90 |
+
self.score_norm = score_norm
|
91 |
+
|
92 |
+
self.nbest = nbest
|
93 |
+
|
94 |
+
def __call__(self, h: torch.Tensor) -> Union[List[Hypothesis], List[NSCHypothesis]]:
|
95 |
+
"""Perform beam search.
|
96 |
+
|
97 |
+
Args:
|
98 |
+
h: Encoded speech features (T_max, D_enc)
|
99 |
+
|
100 |
+
Returns:
|
101 |
+
nbest_hyps: N-best decoding results
|
102 |
+
|
103 |
+
"""
|
104 |
+
self.decoder.set_device(h.device)
|
105 |
+
|
106 |
+
if not hasattr(self.decoder, "decoders"):
|
107 |
+
self.decoder.set_data_type(h.dtype)
|
108 |
+
|
109 |
+
nbest_hyps = self.search_algorithm(h)
|
110 |
+
|
111 |
+
return nbest_hyps
|
112 |
+
|
113 |
+
def sort_nbest(
|
114 |
+
self, hyps: Union[List[Hypothesis], List[NSCHypothesis]]
|
115 |
+
) -> Union[List[Hypothesis], List[NSCHypothesis]]:
|
116 |
+
"""Sort hypotheses by score or score given sequence length.
|
117 |
+
|
118 |
+
Args:
|
119 |
+
hyps: list of hypotheses
|
120 |
+
|
121 |
+
Return:
|
122 |
+
hyps: sorted list of hypotheses
|
123 |
+
|
124 |
+
"""
|
125 |
+
if self.score_norm:
|
126 |
+
hyps.sort(key=lambda x: x.score / len(x.yseq), reverse=True)
|
127 |
+
else:
|
128 |
+
hyps.sort(key=lambda x: x.score, reverse=True)
|
129 |
+
|
130 |
+
return hyps[: self.nbest]
|
131 |
+
|
132 |
+
def greedy_search(self, h: torch.Tensor) -> List[Hypothesis]:
|
133 |
+
"""Greedy search implementation for transformer-transducer.
|
134 |
+
|
135 |
+
Args:
|
136 |
+
h: Encoded speech features (T_max, D_enc)
|
137 |
+
|
138 |
+
Returns:
|
139 |
+
hyp: 1-best decoding results
|
140 |
+
|
141 |
+
"""
|
142 |
+
dec_state = self.decoder.init_state(1)
|
143 |
+
|
144 |
+
hyp = Hypothesis(score=0.0, yseq=[self.blank], dec_state=dec_state)
|
145 |
+
cache = {}
|
146 |
+
|
147 |
+
y, state, _ = self.decoder.score(hyp, cache)
|
148 |
+
|
149 |
+
for i, hi in enumerate(h):
|
150 |
+
ytu = torch.log_softmax(self.joint_network(hi, y), dim=-1)
|
151 |
+
logp, pred = torch.max(ytu, dim=-1)
|
152 |
+
|
153 |
+
if pred != self.blank:
|
154 |
+
hyp.yseq.append(int(pred))
|
155 |
+
hyp.score += float(logp)
|
156 |
+
|
157 |
+
hyp.dec_state = state
|
158 |
+
|
159 |
+
y, state, _ = self.decoder.score(hyp, cache)
|
160 |
+
|
161 |
+
return [hyp]
|
162 |
+
|
163 |
+
def default_beam_search(self, h: torch.Tensor) -> List[Hypothesis]:
|
164 |
+
"""Beam search implementation.
|
165 |
+
|
166 |
+
Args:
|
167 |
+
h: Encoded speech features (T_max, D_enc)
|
168 |
+
|
169 |
+
Returns:
|
170 |
+
nbest_hyps: N-best decoding results
|
171 |
+
|
172 |
+
"""
|
173 |
+
beam = min(self.beam_size, self.vocab_size)
|
174 |
+
beam_k = min(beam, (self.vocab_size - 1))
|
175 |
+
|
176 |
+
dec_state = self.decoder.init_state(1)
|
177 |
+
|
178 |
+
kept_hyps = [Hypothesis(score=0.0, yseq=[self.blank], dec_state=dec_state)]
|
179 |
+
cache = {}
|
180 |
+
|
181 |
+
for hi in h:
|
182 |
+
hyps = kept_hyps
|
183 |
+
kept_hyps = []
|
184 |
+
|
185 |
+
while True:
|
186 |
+
max_hyp = max(hyps, key=lambda x: x.score)
|
187 |
+
hyps.remove(max_hyp)
|
188 |
+
|
189 |
+
y, state, lm_tokens = self.decoder.score(max_hyp, cache)
|
190 |
+
|
191 |
+
ytu = torch.log_softmax(self.joint_network(hi, y), dim=-1)
|
192 |
+
top_k = ytu[1:].topk(beam_k, dim=-1)
|
193 |
+
|
194 |
+
kept_hyps.append(
|
195 |
+
Hypothesis(
|
196 |
+
score=(max_hyp.score + float(ytu[0:1])),
|
197 |
+
yseq=max_hyp.yseq[:],
|
198 |
+
dec_state=max_hyp.dec_state,
|
199 |
+
lm_state=max_hyp.lm_state,
|
200 |
+
)
|
201 |
+
)
|
202 |
+
|
203 |
+
if self.use_lm:
|
204 |
+
lm_state, lm_scores = self.lm.predict(max_hyp.lm_state, lm_tokens)
|
205 |
+
else:
|
206 |
+
lm_state = max_hyp.lm_state
|
207 |
+
|
208 |
+
for logp, k in zip(*top_k):
|
209 |
+
score = max_hyp.score + float(logp)
|
210 |
+
|
211 |
+
if self.use_lm:
|
212 |
+
score += self.lm_weight * lm_scores[0][k + 1]
|
213 |
+
|
214 |
+
hyps.append(
|
215 |
+
Hypothesis(
|
216 |
+
score=score,
|
217 |
+
yseq=max_hyp.yseq[:] + [int(k + 1)],
|
218 |
+
dec_state=state,
|
219 |
+
lm_state=lm_state,
|
220 |
+
)
|
221 |
+
)
|
222 |
+
|
223 |
+
hyps_max = float(max(hyps, key=lambda x: x.score).score)
|
224 |
+
kept_most_prob = sorted(
|
225 |
+
[hyp for hyp in kept_hyps if hyp.score > hyps_max],
|
226 |
+
key=lambda x: x.score,
|
227 |
+
)
|
228 |
+
if len(kept_most_prob) >= beam:
|
229 |
+
kept_hyps = kept_most_prob
|
230 |
+
break
|
231 |
+
|
232 |
+
return self.sort_nbest(kept_hyps)
|
233 |
+
|
234 |
+
def time_sync_decoding(self, h: torch.Tensor) -> List[Hypothesis]:
|
235 |
+
"""Time synchronous beam search implementation.
|
236 |
+
|
237 |
+
Based on https://ieeexplore.ieee.org/document/9053040
|
238 |
+
|
239 |
+
Args:
|
240 |
+
h: Encoded speech features (T_max, D_enc)
|
241 |
+
|
242 |
+
Returns:
|
243 |
+
nbest_hyps: N-best decoding results
|
244 |
+
|
245 |
+
"""
|
246 |
+
beam = min(self.beam_size, self.vocab_size)
|
247 |
+
|
248 |
+
beam_state = self.decoder.init_state(beam)
|
249 |
+
|
250 |
+
B = [
|
251 |
+
Hypothesis(
|
252 |
+
yseq=[self.blank],
|
253 |
+
score=0.0,
|
254 |
+
dec_state=self.decoder.select_state(beam_state, 0),
|
255 |
+
)
|
256 |
+
]
|
257 |
+
cache = {}
|
258 |
+
|
259 |
+
if self.use_lm and not self.is_wordlm:
|
260 |
+
B[0].lm_state = init_lm_state(self.lm_predictor)
|
261 |
+
|
262 |
+
for hi in h:
|
263 |
+
A = []
|
264 |
+
C = B
|
265 |
+
|
266 |
+
h_enc = hi.unsqueeze(0)
|
267 |
+
|
268 |
+
for v in range(self.max_sym_exp):
|
269 |
+
D = []
|
270 |
+
|
271 |
+
beam_y, beam_state, beam_lm_tokens = self.decoder.batch_score(
|
272 |
+
C,
|
273 |
+
beam_state,
|
274 |
+
cache,
|
275 |
+
self.use_lm,
|
276 |
+
)
|
277 |
+
|
278 |
+
beam_logp = torch.log_softmax(self.joint_network(h_enc, beam_y), dim=-1)
|
279 |
+
beam_topk = beam_logp[:, 1:].topk(beam, dim=-1)
|
280 |
+
|
281 |
+
seq_A = [h.yseq for h in A]
|
282 |
+
|
283 |
+
for i, hyp in enumerate(C):
|
284 |
+
if hyp.yseq not in seq_A:
|
285 |
+
A.append(
|
286 |
+
Hypothesis(
|
287 |
+
score=(hyp.score + float(beam_logp[i, 0])),
|
288 |
+
yseq=hyp.yseq[:],
|
289 |
+
dec_state=hyp.dec_state,
|
290 |
+
lm_state=hyp.lm_state,
|
291 |
+
)
|
292 |
+
)
|
293 |
+
else:
|
294 |
+
dict_pos = seq_A.index(hyp.yseq)
|
295 |
+
|
296 |
+
A[dict_pos].score = np.logaddexp(
|
297 |
+
A[dict_pos].score, (hyp.score + float(beam_logp[i, 0]))
|
298 |
+
)
|
299 |
+
|
300 |
+
if v < (self.max_sym_exp - 1):
|
301 |
+
if self.use_lm:
|
302 |
+
beam_lm_states = create_lm_batch_state(
|
303 |
+
[c.lm_state for c in C], self.lm_layers, self.is_wordlm
|
304 |
+
)
|
305 |
+
|
306 |
+
beam_lm_states, beam_lm_scores = self.lm.buff_predict(
|
307 |
+
beam_lm_states, beam_lm_tokens, len(C)
|
308 |
+
)
|
309 |
+
|
310 |
+
for i, hyp in enumerate(C):
|
311 |
+
for logp, k in zip(beam_topk[0][i], beam_topk[1][i] + 1):
|
312 |
+
new_hyp = Hypothesis(
|
313 |
+
score=(hyp.score + float(logp)),
|
314 |
+
yseq=(hyp.yseq + [int(k)]),
|
315 |
+
dec_state=self.decoder.select_state(beam_state, i),
|
316 |
+
lm_state=hyp.lm_state,
|
317 |
+
)
|
318 |
+
|
319 |
+
if self.use_lm:
|
320 |
+
new_hyp.score += self.lm_weight * beam_lm_scores[i, k]
|
321 |
+
|
322 |
+
new_hyp.lm_state = select_lm_state(
|
323 |
+
beam_lm_states, i, self.lm_layers, self.is_wordlm
|
324 |
+
)
|
325 |
+
|
326 |
+
D.append(new_hyp)
|
327 |
+
|
328 |
+
C = sorted(D, key=lambda x: x.score, reverse=True)[:beam]
|
329 |
+
|
330 |
+
B = sorted(A, key=lambda x: x.score, reverse=True)[:beam]
|
331 |
+
|
332 |
+
return self.sort_nbest(B)
|
333 |
+
|
334 |
+
def align_length_sync_decoding(self, h: torch.Tensor) -> List[Hypothesis]:
|
335 |
+
"""Alignment-length synchronous beam search implementation.
|
336 |
+
|
337 |
+
Based on https://ieeexplore.ieee.org/document/9053040
|
338 |
+
|
339 |
+
Args:
|
340 |
+
h: Encoded speech features (T_max, D_enc)
|
341 |
+
|
342 |
+
Returns:
|
343 |
+
nbest_hyps: N-best decoding results
|
344 |
+
|
345 |
+
"""
|
346 |
+
beam = min(self.beam_size, self.vocab_size)
|
347 |
+
|
348 |
+
h_length = int(h.size(0))
|
349 |
+
u_max = min(self.u_max, (h_length - 1))
|
350 |
+
|
351 |
+
beam_state = self.decoder.init_state(beam)
|
352 |
+
|
353 |
+
B = [
|
354 |
+
Hypothesis(
|
355 |
+
yseq=[self.blank],
|
356 |
+
score=0.0,
|
357 |
+
dec_state=self.decoder.select_state(beam_state, 0),
|
358 |
+
)
|
359 |
+
]
|
360 |
+
final = []
|
361 |
+
cache = {}
|
362 |
+
|
363 |
+
if self.use_lm and not self.is_wordlm:
|
364 |
+
B[0].lm_state = init_lm_state(self.lm_predictor)
|
365 |
+
|
366 |
+
for i in range(h_length + u_max):
|
367 |
+
A = []
|
368 |
+
|
369 |
+
B_ = []
|
370 |
+
h_states = []
|
371 |
+
for hyp in B:
|
372 |
+
u = len(hyp.yseq) - 1
|
373 |
+
t = i - u + 1
|
374 |
+
|
375 |
+
if t > (h_length - 1):
|
376 |
+
continue
|
377 |
+
|
378 |
+
B_.append(hyp)
|
379 |
+
h_states.append((t, h[t]))
|
380 |
+
|
381 |
+
if B_:
|
382 |
+
beam_y, beam_state, beam_lm_tokens = self.decoder.batch_score(
|
383 |
+
B_,
|
384 |
+
beam_state,
|
385 |
+
cache,
|
386 |
+
self.use_lm,
|
387 |
+
)
|
388 |
+
|
389 |
+
h_enc = torch.stack([h[1] for h in h_states])
|
390 |
+
|
391 |
+
beam_logp = torch.log_softmax(self.joint_network(h_enc, beam_y), dim=-1)
|
392 |
+
beam_topk = beam_logp[:, 1:].topk(beam, dim=-1)
|
393 |
+
|
394 |
+
if self.use_lm:
|
395 |
+
beam_lm_states = create_lm_batch_state(
|
396 |
+
[b.lm_state for b in B_], self.lm_layers, self.is_wordlm
|
397 |
+
)
|
398 |
+
|
399 |
+
beam_lm_states, beam_lm_scores = self.lm.buff_predict(
|
400 |
+
beam_lm_states, beam_lm_tokens, len(B_)
|
401 |
+
)
|
402 |
+
|
403 |
+
for i, hyp in enumerate(B_):
|
404 |
+
new_hyp = Hypothesis(
|
405 |
+
score=(hyp.score + float(beam_logp[i, 0])),
|
406 |
+
yseq=hyp.yseq[:],
|
407 |
+
dec_state=hyp.dec_state,
|
408 |
+
lm_state=hyp.lm_state,
|
409 |
+
)
|
410 |
+
|
411 |
+
A.append(new_hyp)
|
412 |
+
|
413 |
+
if h_states[i][0] == (h_length - 1):
|
414 |
+
final.append(new_hyp)
|
415 |
+
|
416 |
+
for logp, k in zip(beam_topk[0][i], beam_topk[1][i] + 1):
|
417 |
+
new_hyp = Hypothesis(
|
418 |
+
score=(hyp.score + float(logp)),
|
419 |
+
yseq=(hyp.yseq[:] + [int(k)]),
|
420 |
+
dec_state=self.decoder.select_state(beam_state, i),
|
421 |
+
lm_state=hyp.lm_state,
|
422 |
+
)
|
423 |
+
|
424 |
+
if self.use_lm:
|
425 |
+
new_hyp.score += self.lm_weight * beam_lm_scores[i, k]
|
426 |
+
|
427 |
+
new_hyp.lm_state = select_lm_state(
|
428 |
+
beam_lm_states, i, self.lm_layers, self.is_wordlm
|
429 |
+
)
|
430 |
+
|
431 |
+
A.append(new_hyp)
|
432 |
+
|
433 |
+
B = sorted(A, key=lambda x: x.score, reverse=True)[:beam]
|
434 |
+
B = recombine_hyps(B)
|
435 |
+
|
436 |
+
if final:
|
437 |
+
return self.sort_nbest(final)
|
438 |
+
else:
|
439 |
+
return B
|
440 |
+
|
441 |
+
def nsc_beam_search(self, h: torch.Tensor) -> List[NSCHypothesis]:
|
442 |
+
"""N-step constrained beam search implementation.
|
443 |
+
|
444 |
+
Based on and modified from https://arxiv.org/pdf/2002.03577.pdf.
|
445 |
+
Please reference ESPnet (b-flo, PR #2444) for any usage outside ESPnet
|
446 |
+
until further modifications.
|
447 |
+
|
448 |
+
Note: the algorithm is not in its "complete" form but works almost as
|
449 |
+
intended.
|
450 |
+
|
451 |
+
Args:
|
452 |
+
h: Encoded speech features (T_max, D_enc)
|
453 |
+
|
454 |
+
Returns:
|
455 |
+
nbest_hyps: N-best decoding results
|
456 |
+
|
457 |
+
"""
|
458 |
+
beam = min(self.beam_size, self.vocab_size)
|
459 |
+
beam_k = min(beam, (self.vocab_size - 1))
|
460 |
+
|
461 |
+
beam_state = self.decoder.init_state(beam)
|
462 |
+
|
463 |
+
init_tokens = [
|
464 |
+
NSCHypothesis(
|
465 |
+
yseq=[self.blank],
|
466 |
+
score=0.0,
|
467 |
+
dec_state=self.decoder.select_state(beam_state, 0),
|
468 |
+
)
|
469 |
+
]
|
470 |
+
|
471 |
+
cache = {}
|
472 |
+
|
473 |
+
beam_y, beam_state, beam_lm_tokens = self.decoder.batch_score(
|
474 |
+
init_tokens,
|
475 |
+
beam_state,
|
476 |
+
cache,
|
477 |
+
self.use_lm,
|
478 |
+
)
|
479 |
+
|
480 |
+
state = self.decoder.select_state(beam_state, 0)
|
481 |
+
|
482 |
+
if self.use_lm:
|
483 |
+
beam_lm_states, beam_lm_scores = self.lm.buff_predict(
|
484 |
+
None, beam_lm_tokens, 1
|
485 |
+
)
|
486 |
+
lm_state = select_lm_state(
|
487 |
+
beam_lm_states, 0, self.lm_layers, self.is_wordlm
|
488 |
+
)
|
489 |
+
lm_scores = beam_lm_scores[0]
|
490 |
+
else:
|
491 |
+
lm_state = None
|
492 |
+
lm_scores = None
|
493 |
+
|
494 |
+
kept_hyps = [
|
495 |
+
NSCHypothesis(
|
496 |
+
yseq=[self.blank],
|
497 |
+
score=0.0,
|
498 |
+
dec_state=state,
|
499 |
+
y=[beam_y[0]],
|
500 |
+
lm_state=lm_state,
|
501 |
+
lm_scores=lm_scores,
|
502 |
+
)
|
503 |
+
]
|
504 |
+
|
505 |
+
for hi in h:
|
506 |
+
hyps = sorted(kept_hyps, key=lambda x: len(x.yseq), reverse=True)
|
507 |
+
kept_hyps = []
|
508 |
+
|
509 |
+
h_enc = hi.unsqueeze(0)
|
510 |
+
|
511 |
+
for j, hyp_j in enumerate(hyps[:-1]):
|
512 |
+
for hyp_i in hyps[(j + 1) :]:
|
513 |
+
curr_id = len(hyp_j.yseq)
|
514 |
+
next_id = len(hyp_i.yseq)
|
515 |
+
|
516 |
+
if (
|
517 |
+
is_prefix(hyp_j.yseq, hyp_i.yseq)
|
518 |
+
and (curr_id - next_id) <= self.prefix_alpha
|
519 |
+
):
|
520 |
+
ytu = torch.log_softmax(
|
521 |
+
self.joint_network(hi, hyp_i.y[-1]), dim=-1
|
522 |
+
)
|
523 |
+
|
524 |
+
curr_score = hyp_i.score + float(ytu[hyp_j.yseq[next_id]])
|
525 |
+
|
526 |
+
for k in range(next_id, (curr_id - 1)):
|
527 |
+
ytu = torch.log_softmax(
|
528 |
+
self.joint_network(hi, hyp_j.y[k]), dim=-1
|
529 |
+
)
|
530 |
+
|
531 |
+
curr_score += float(ytu[hyp_j.yseq[k + 1]])
|
532 |
+
|
533 |
+
hyp_j.score = np.logaddexp(hyp_j.score, curr_score)
|
534 |
+
|
535 |
+
S = []
|
536 |
+
V = []
|
537 |
+
for n in range(self.nstep):
|
538 |
+
beam_y = torch.stack([hyp.y[-1] for hyp in hyps])
|
539 |
+
|
540 |
+
beam_logp = torch.log_softmax(self.joint_network(h_enc, beam_y), dim=-1)
|
541 |
+
beam_topk = beam_logp[:, 1:].topk(beam_k, dim=-1)
|
542 |
+
|
543 |
+
for i, hyp in enumerate(hyps):
|
544 |
+
S.append(
|
545 |
+
NSCHypothesis(
|
546 |
+
yseq=hyp.yseq[:],
|
547 |
+
score=hyp.score + float(beam_logp[i, 0:1]),
|
548 |
+
y=hyp.y[:],
|
549 |
+
dec_state=hyp.dec_state,
|
550 |
+
lm_state=hyp.lm_state,
|
551 |
+
lm_scores=hyp.lm_scores,
|
552 |
+
)
|
553 |
+
)
|
554 |
+
|
555 |
+
for logp, k in zip(beam_topk[0][i], beam_topk[1][i] + 1):
|
556 |
+
score = hyp.score + float(logp)
|
557 |
+
|
558 |
+
if self.use_lm:
|
559 |
+
score += self.lm_weight * float(hyp.lm_scores[k])
|
560 |
+
|
561 |
+
V.append(
|
562 |
+
NSCHypothesis(
|
563 |
+
yseq=hyp.yseq[:] + [int(k)],
|
564 |
+
score=score,
|
565 |
+
y=hyp.y[:],
|
566 |
+
dec_state=hyp.dec_state,
|
567 |
+
lm_state=hyp.lm_state,
|
568 |
+
lm_scores=hyp.lm_scores,
|
569 |
+
)
|
570 |
+
)
|
571 |
+
|
572 |
+
V.sort(key=lambda x: x.score, reverse=True)
|
573 |
+
V = substract(V, hyps)[:beam]
|
574 |
+
|
575 |
+
beam_state = self.decoder.create_batch_states(
|
576 |
+
beam_state,
|
577 |
+
[v.dec_state for v in V],
|
578 |
+
[v.yseq for v in V],
|
579 |
+
)
|
580 |
+
beam_y, beam_state, beam_lm_tokens = self.decoder.batch_score(
|
581 |
+
V,
|
582 |
+
beam_state,
|
583 |
+
cache,
|
584 |
+
self.use_lm,
|
585 |
+
)
|
586 |
+
|
587 |
+
if self.use_lm:
|
588 |
+
beam_lm_states = create_lm_batch_state(
|
589 |
+
[v.lm_state for v in V], self.lm_layers, self.is_wordlm
|
590 |
+
)
|
591 |
+
beam_lm_states, beam_lm_scores = self.lm.buff_predict(
|
592 |
+
beam_lm_states, beam_lm_tokens, len(V)
|
593 |
+
)
|
594 |
+
|
595 |
+
if n < (self.nstep - 1):
|
596 |
+
for i, v in enumerate(V):
|
597 |
+
v.y.append(beam_y[i])
|
598 |
+
|
599 |
+
v.dec_state = self.decoder.select_state(beam_state, i)
|
600 |
+
|
601 |
+
if self.use_lm:
|
602 |
+
v.lm_state = select_lm_state(
|
603 |
+
beam_lm_states, i, self.lm_layers, self.is_wordlm
|
604 |
+
)
|
605 |
+
v.lm_scores = beam_lm_scores[i]
|
606 |
+
|
607 |
+
hyps = V[:]
|
608 |
+
else:
|
609 |
+
beam_logp = torch.log_softmax(
|
610 |
+
self.joint_network(h_enc, beam_y), dim=-1
|
611 |
+
)
|
612 |
+
|
613 |
+
for i, v in enumerate(V):
|
614 |
+
if self.nstep != 1:
|
615 |
+
v.score += float(beam_logp[i, 0])
|
616 |
+
|
617 |
+
v.y.append(beam_y[i])
|
618 |
+
|
619 |
+
v.dec_state = self.decoder.select_state(beam_state, i)
|
620 |
+
|
621 |
+
if self.use_lm:
|
622 |
+
v.lm_state = select_lm_state(
|
623 |
+
beam_lm_states, i, self.lm_layers, self.is_wordlm
|
624 |
+
)
|
625 |
+
v.lm_scores = beam_lm_scores[i]
|
626 |
+
|
627 |
+
kept_hyps = sorted((S + V), key=lambda x: x.score, reverse=True)[:beam]
|
628 |
+
|
629 |
+
return self.sort_nbest(kept_hyps)
|
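
As a small aside, the length normalization applied in `sort_nbest` above can be illustrated with toy data. The `ToyHyp` class below is hypothetical and stands in for the transducer `Hypothesis`; it shows how the normalized ranking (score_norm=True) can prefer a longer hypothesis whose total log-probability is lower.

from dataclasses import dataclass
from typing import List


@dataclass
class ToyHyp:
    yseq: List[int]   # blank-initial token sequence, as in the transducer search
    score: float      # accumulated log-probability


def sort_nbest(hyps: List[ToyHyp], score_norm: bool = True, nbest: int = 1) -> List[ToyHyp]:
    # same ranking criterion as BeamSearchTransducer.sort_nbest
    if score_norm:
        hyps.sort(key=lambda x: x.score / len(x.yseq), reverse=True)
    else:
        hyps.sort(key=lambda x: x.score, reverse=True)
    return hyps[:nbest]


short = ToyHyp(yseq=[0, 5], score=-2.0)        # per-token score: -1.0
long = ToyHyp(yseq=[0, 5, 7, 3], score=-3.2)   # per-token score: -0.8
print(sort_nbest([short, long], score_norm=True)[0].yseq)   # -> [0, 5, 7, 3]
print(sort_nbest([short, long], score_norm=False)[0].yseq)  # -> [0, 5]
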
espnet/nets/chainer_backend/__init__.py
ADDED
@@ -0,0 +1 @@
1 |
+
"""Initialize sub package."""
|
espnet/nets/chainer_backend/asr_interface.py
ADDED
@@ -0,0 +1,29 @@
1 |
+
"""ASR Interface module."""
|
2 |
+
import chainer
|
3 |
+
|
4 |
+
from espnet.nets.asr_interface import ASRInterface
|
5 |
+
|
6 |
+
|
7 |
+
class ChainerASRInterface(ASRInterface, chainer.Chain):
|
8 |
+
"""ASR Interface for ESPnet model implementation."""
|
9 |
+
|
10 |
+
@staticmethod
|
11 |
+
def custom_converter(*args, **kw):
|
12 |
+
"""Get customconverter of the model (Chainer only)."""
|
13 |
+
raise NotImplementedError("custom converter method is not implemented")
|
14 |
+
|
15 |
+
@staticmethod
|
16 |
+
def custom_updater(*args, **kw):
|
17 |
+
"""Get custom_updater of the model (Chainer only)."""
|
18 |
+
raise NotImplementedError("custom updater method is not implemented")
|
19 |
+
|
20 |
+
@staticmethod
|
21 |
+
def custom_parallel_updater(*args, **kw):
|
22 |
+
"""Get custom_parallel_updater of the model (Chainer only)."""
|
23 |
+
raise NotImplementedError("custom parallel updater method is not implemented")
|
24 |
+
|
25 |
+
def get_total_subsampling_factor(self):
|
26 |
+
"""Get total subsampling factor."""
|
27 |
+
raise NotImplementedError(
|
28 |
+
"get_total_subsampling_factor method is not implemented"
|
29 |
+
)
|
espnet/nets/chainer_backend/ctc.py
ADDED
@@ -0,0 +1,184 @@
1 |
+
import logging
|
2 |
+
|
3 |
+
import chainer
|
4 |
+
from chainer import cuda
|
5 |
+
import chainer.functions as F
|
6 |
+
import chainer.links as L
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
|
10 |
+
class CTC(chainer.Chain):
|
11 |
+
"""Chainer implementation of ctc layer.
|
12 |
+
|
13 |
+
Args:
|
14 |
+
odim (int): The output dimension.
|
15 |
+
eprojs (int | None): Dimension of input vectors from encoder.
|
16 |
+
dropout_rate (float): Dropout rate.
|
17 |
+
|
18 |
+
"""
|
19 |
+
|
20 |
+
def __init__(self, odim, eprojs, dropout_rate):
|
21 |
+
super(CTC, self).__init__()
|
22 |
+
self.dropout_rate = dropout_rate
|
23 |
+
self.loss = None
|
24 |
+
|
25 |
+
with self.init_scope():
|
26 |
+
self.ctc_lo = L.Linear(eprojs, odim)
|
27 |
+
|
28 |
+
def __call__(self, hs, ys):
|
29 |
+
"""CTC forward.
|
30 |
+
|
31 |
+
Args:
|
32 |
+
hs (list of chainer.Variable | N-dimension array):
|
33 |
+
Input variable from encoder.
|
34 |
+
ys (list of chainer.Variable | N-dimension array):
|
35 |
+
Input variable of decoder.
|
36 |
+
|
37 |
+
Returns:
|
38 |
+
chainer.Variable: A variable holding a scalar value of the CTC loss.
|
39 |
+
|
40 |
+
"""
|
41 |
+
self.loss = None
|
42 |
+
ilens = [x.shape[0] for x in hs]
|
43 |
+
olens = [x.shape[0] for x in ys]
|
44 |
+
|
45 |
+
# zero padding for hs
|
46 |
+
y_hat = self.ctc_lo(
|
47 |
+
F.dropout(F.pad_sequence(hs), ratio=self.dropout_rate), n_batch_axes=2
|
48 |
+
)
|
49 |
+
y_hat = F.separate(y_hat, axis=1) # ilen list of batch x hdim
|
50 |
+
|
51 |
+
# zero padding for ys
|
52 |
+
y_true = F.pad_sequence(ys, padding=-1) # batch x olen
|
53 |
+
|
54 |
+
# get length info
|
55 |
+
input_length = chainer.Variable(self.xp.array(ilens, dtype=np.int32))
|
56 |
+
label_length = chainer.Variable(self.xp.array(olens, dtype=np.int32))
|
57 |
+
logging.info(
|
58 |
+
self.__class__.__name__ + " input lengths: " + str(input_length.data)
|
59 |
+
)
|
60 |
+
logging.info(
|
61 |
+
self.__class__.__name__ + " output lengths: " + str(label_length.data)
|
62 |
+
)
|
63 |
+
|
64 |
+
# get ctc loss
|
65 |
+
self.loss = F.connectionist_temporal_classification(
|
66 |
+
y_hat, y_true, 0, input_length, label_length
|
67 |
+
)
|
68 |
+
logging.info("ctc loss:" + str(self.loss.data))
|
69 |
+
|
70 |
+
return self.loss
|
71 |
+
|
72 |
+
def log_softmax(self, hs):
|
73 |
+
"""Log_softmax of frame activations.
|
74 |
+
|
75 |
+
Args:
|
76 |
+
hs (list of chainer.Variable | N-dimension array):
|
77 |
+
Input variable from encoder.
|
78 |
+
|
79 |
+
Returns:
|
80 |
+
chainer.Variable: A n-dimension float array.
|
81 |
+
|
82 |
+
"""
|
83 |
+
y_hat = self.ctc_lo(F.pad_sequence(hs), n_batch_axes=2)
|
84 |
+
return F.log_softmax(y_hat.reshape(-1, y_hat.shape[-1])).reshape(y_hat.shape)
|
85 |
+
|
86 |
+
|
87 |
+
class WarpCTC(chainer.Chain):
|
88 |
+
"""Chainer implementation of warp-ctc layer.
|
89 |
+
|
90 |
+
Args:
|
91 |
+
odim (int): The output dimension.
|
92 |
+
eprojs (int | None): Dimension of input vectors from encoder.
|
93 |
+
dropout_rate (float): Dropout rate.
|
94 |
+
|
95 |
+
"""
|
96 |
+
|
97 |
+
def __init__(self, odim, eprojs, dropout_rate):
|
98 |
+
super(WarpCTC, self).__init__()
|
99 |
+
self.dropout_rate = dropout_rate
|
100 |
+
self.loss = None
|
101 |
+
|
102 |
+
with self.init_scope():
|
103 |
+
self.ctc_lo = L.Linear(eprojs, odim)
|
104 |
+
|
105 |
+
def __call__(self, hs, ys):
|
106 |
+
"""Core function of the Warp-CTC layer.
|
107 |
+
|
108 |
+
Args:
|
109 |
+
hs (iterable of chainer.Variable | N-dimension array):
|
110 |
+
Input variable from encoder.
|
111 |
+
ys (iterable of chainer.Variable | N-dimension array):
|
112 |
+
Input variable of decoder.
|
113 |
+
|
114 |
+
Returns:
|
115 |
+
chainer.Variable: A variable holding a scalar value of the CTC loss.
|
116 |
+
|
117 |
+
"""
|
118 |
+
self.loss = None
|
119 |
+
ilens = [x.shape[0] for x in hs]
|
120 |
+
olens = [x.shape[0] for x in ys]
|
121 |
+
|
122 |
+
# zero padding for hs
|
123 |
+
y_hat = self.ctc_lo(
|
124 |
+
F.dropout(F.pad_sequence(hs), ratio=self.dropout_rate), n_batch_axes=2
|
125 |
+
)
|
126 |
+
y_hat = y_hat.transpose(1, 0, 2) # batch x frames x hdim
|
127 |
+
|
128 |
+
# get length info
|
129 |
+
logging.info(self.__class__.__name__ + " input lengths: " + str(ilens))
|
130 |
+
logging.info(self.__class__.__name__ + " output lengths: " + str(olens))
|
131 |
+
|
132 |
+
# get ctc loss
|
133 |
+
from chainer_ctc.warpctc import ctc as warp_ctc
|
134 |
+
|
135 |
+
self.loss = warp_ctc(y_hat, ilens, [cuda.to_cpu(y.data) for y in ys])[0]
|
136 |
+
logging.info("ctc loss:" + str(self.loss.data))
|
137 |
+
|
138 |
+
return self.loss
|
139 |
+
|
140 |
+
def log_softmax(self, hs):
|
141 |
+
"""Log_softmax of frame activations.
|
142 |
+
|
143 |
+
Args:
|
144 |
+
hs (list of chainer.Variable | N-dimension array):
|
145 |
+
Input variable from encoder.
|
146 |
+
|
147 |
+
Returns:
|
148 |
+
chainer.Variable: A n-dimension float array.
|
149 |
+
|
150 |
+
"""
|
151 |
+
y_hat = self.ctc_lo(F.pad_sequence(hs), n_batch_axes=2)
|
152 |
+
return F.log_softmax(y_hat.reshape(-1, y_hat.shape[-1])).reshape(y_hat.shape)
|
153 |
+
|
154 |
+
def argmax(self, hs_pad):
|
155 |
+
"""argmax of frame activations
|
156 |
+
|
157 |
+
:param chainer variable hs_pad: 3d tensor (B, Tmax, eprojs)
|
158 |
+
:return: argmax applied 2d tensor (B, Tmax)
|
159 |
+
:rtype: chainer.Variable
|
160 |
+
"""
|
161 |
+
return F.argmax(self.ctc_lo(F.pad_sequence(hs_pad), n_batch_axes=2), axis=-1)
|
162 |
+
|
163 |
+
|
164 |
+
def ctc_for(args, odim):
|
165 |
+
"""Return the CTC layer corresponding to the args.
|
166 |
+
|
167 |
+
Args:
|
168 |
+
args (Namespace): The program arguments.
|
169 |
+
odim (int): The output dimension.
|
170 |
+
|
171 |
+
Returns:
|
172 |
+
The CTC module.
|
173 |
+
|
174 |
+
"""
|
175 |
+
ctc_type = args.ctc_type
|
176 |
+
if ctc_type == "builtin":
|
177 |
+
logging.info("Using chainer CTC implementation")
|
178 |
+
ctc = CTC(odim, args.eprojs, args.dropout_rate)
|
179 |
+
elif ctc_type == "warpctc":
|
180 |
+
logging.info("Using warpctc CTC implementation")
|
181 |
+
ctc = WarpCTC(odim, args.eprojs, args.dropout_rate)
|
182 |
+
else:
|
183 |
+
raise ValueError('ctc_type must be "builtin" or "warpctc": {}'.format(ctc_type))
|
184 |
+
return ctc
|
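
A brief sketch (with a hypothetical argument namespace) of how `ctc_for` above selects the CTC implementation from the training arguments; "builtin" builds the Chainer `CTC` class, while "warpctc" additionally requires the chainer_ctc package at call time.

from types import SimpleNamespace

from espnet.nets.chainer_backend.ctc import ctc_for  # requires chainer to be installed

# hypothetical subset of the training arguments consumed by ctc_for
args = SimpleNamespace(ctc_type="builtin", eprojs=320, dropout_rate=0.1)
ctc = ctc_for(args, odim=52)  # odim: output vocabulary size including blank
print(type(ctc).__name__)     # -> "CTC"
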
espnet/nets/chainer_backend/deterministic_embed_id.py
ADDED
@@ -0,0 +1,253 @@
1 |
+
import numpy
|
2 |
+
import six
|
3 |
+
|
4 |
+
import chainer
|
5 |
+
from chainer import cuda
|
6 |
+
from chainer import function_node
|
7 |
+
from chainer.initializers import normal
|
8 |
+
|
9 |
+
# from chainer.functions.connection import embed_id
|
10 |
+
from chainer import link
|
11 |
+
from chainer.utils import type_check
|
12 |
+
from chainer import variable
|
13 |
+
|
14 |
+
"""Deterministic EmbedID link and function
|
15 |
+
|
16 |
+
copied from chainer/links/connection/embed_id.py
|
17 |
+
and chainer/functions/connection/embed_id.py,
|
18 |
+
and modified not to use atomicAdd operation
|
19 |
+
"""
|
20 |
+
|
21 |
+
|
22 |
+
class EmbedIDFunction(function_node.FunctionNode):
|
23 |
+
def __init__(self, ignore_label=None):
|
24 |
+
self.ignore_label = ignore_label
|
25 |
+
self._w_shape = None
|
26 |
+
|
27 |
+
def check_type_forward(self, in_types):
|
28 |
+
type_check.expect(in_types.size() == 2)
|
29 |
+
x_type, w_type = in_types
|
30 |
+
type_check.expect(
|
31 |
+
x_type.dtype.kind == "i",
|
32 |
+
x_type.ndim >= 1,
|
33 |
+
)
|
34 |
+
type_check.expect(w_type.dtype == numpy.float32, w_type.ndim == 2)
|
35 |
+
|
36 |
+
def forward(self, inputs):
|
37 |
+
self.retain_inputs((0,))
|
38 |
+
x, W = inputs
|
39 |
+
self._w_shape = W.shape
|
40 |
+
|
41 |
+
if not type_check.same_types(*inputs):
|
42 |
+
raise ValueError(
|
43 |
+
"numpy and cupy must not be used together\n"
|
44 |
+
"type(W): {0}, type(x): {1}".format(type(W), type(x))
|
45 |
+
)
|
46 |
+
|
47 |
+
xp = cuda.get_array_module(*inputs)
|
48 |
+
if chainer.is_debug():
|
49 |
+
valid_x = xp.logical_and(0 <= x, x < len(W))
|
50 |
+
if self.ignore_label is not None:
|
51 |
+
valid_x = xp.logical_or(valid_x, x == self.ignore_label)
|
52 |
+
if not valid_x.all():
|
53 |
+
raise ValueError(
|
54 |
+
"Each not ignored `x` value need to satisfy" "`0 <= x < len(W)`"
|
55 |
+
)
|
56 |
+
|
57 |
+
if self.ignore_label is not None:
|
58 |
+
mask = x == self.ignore_label
|
59 |
+
return (xp.where(mask[..., None], 0, W[xp.where(mask, 0, x)]),)
|
60 |
+
|
61 |
+
return (W[x],)
|
62 |
+
|
63 |
+
def backward(self, indexes, grad_outputs):
|
64 |
+
inputs = self.get_retained_inputs()
|
65 |
+
gW = EmbedIDGrad(self._w_shape, self.ignore_label).apply(inputs + grad_outputs)[
|
66 |
+
0
|
67 |
+
]
|
68 |
+
return None, gW
|
69 |
+
|
70 |
+
|
71 |
+
class EmbedIDGrad(function_node.FunctionNode):
|
72 |
+
def __init__(self, w_shape, ignore_label=None):
|
73 |
+
self.w_shape = w_shape
|
74 |
+
self.ignore_label = ignore_label
|
75 |
+
self._gy_shape = None
|
76 |
+
|
77 |
+
def forward(self, inputs):
|
78 |
+
self.retain_inputs((0,))
|
79 |
+
xp = cuda.get_array_module(*inputs)
|
80 |
+
x, gy = inputs
|
81 |
+
self._gy_shape = gy.shape
|
82 |
+
gW = xp.zeros(self.w_shape, dtype=gy.dtype)
|
83 |
+
|
84 |
+
if xp is numpy:
|
85 |
+
# It is equivalent to `numpy.add.at(gW, x, gy)` but ufunc.at is
|
86 |
+
# too slow.
|
87 |
+
for ix, igy in six.moves.zip(x.ravel(), gy.reshape(x.size, -1)):
|
88 |
+
if ix == self.ignore_label:
|
89 |
+
continue
|
90 |
+
gW[ix] += igy
|
91 |
+
else:
|
92 |
+
"""
|
93 |
+
# original code based on cuda elementwise method
|
94 |
+
if self.ignore_label is None:
|
95 |
+
cuda.elementwise(
|
96 |
+
'T gy, S x, S n_out', 'raw T gW',
|
97 |
+
'ptrdiff_t w_ind[] = {x, i % n_out};'
|
98 |
+
'atomicAdd(&gW[w_ind], gy)',
|
99 |
+
'embed_id_bwd')(
|
100 |
+
gy, xp.expand_dims(x, -1), gW.shape[1], gW)
|
101 |
+
else:
|
102 |
+
cuda.elementwise(
|
103 |
+
'T gy, S x, S n_out, S ignore', 'raw T gW',
|
104 |
+
'''
|
105 |
+
if (x != ignore) {
|
106 |
+
ptrdiff_t w_ind[] = {x, i % n_out};
|
107 |
+
atomicAdd(&gW[w_ind], gy);
|
108 |
+
}
|
109 |
+
''',
|
110 |
+
'embed_id_bwd_ignore_label')(
|
111 |
+
gy, xp.expand_dims(x, -1), gW.shape[1],
|
112 |
+
self.ignore_label, gW)
|
113 |
+
"""
|
114 |
+
# EmbedID gradient alternative without atomicAdd, which simply
|
115 |
+
# creates a one-hot vector and applies dot product
|
116 |
+
xi = xp.zeros((x.size, len(gW)), dtype=numpy.float32)
|
117 |
+
idx = xp.arange(x.size, dtype=numpy.int32) * len(gW) + x.ravel()
|
118 |
+
xi.ravel()[idx] = 1.0
|
119 |
+
if self.ignore_label is not None:
|
120 |
+
xi[:, self.ignore_label] = 0.0
|
121 |
+
gW = xi.T.dot(gy.reshape(x.size, -1)).astype(gW.dtype, copy=False)
|
122 |
+
|
123 |
+
return (gW,)
|
124 |
+
|
125 |
+
def backward(self, indexes, grads):
|
126 |
+
xp = cuda.get_array_module(*grads)
|
127 |
+
x = self.get_retained_inputs()[0].data
|
128 |
+
ggW = grads[0]
|
129 |
+
|
130 |
+
if self.ignore_label is not None:
|
131 |
+
mask = x == self.ignore_label
|
132 |
+
# To prevent index out of bounds, we need to check if ignore_label
|
133 |
+
# is inside of W.
|
134 |
+
if not (0 <= self.ignore_label < self.w_shape[1]):
|
135 |
+
x = xp.where(mask, 0, x)
|
136 |
+
|
137 |
+
ggy = ggW[x]
|
138 |
+
|
139 |
+
if self.ignore_label is not None:
|
140 |
+
mask, zero, _ = xp.broadcast_arrays(
|
141 |
+
mask[..., None], xp.zeros((), "f"), ggy.data
|
142 |
+
)
|
143 |
+
ggy = chainer.functions.where(mask, zero, ggy)
|
144 |
+
return None, ggy
|
145 |
+
|
146 |
+
|
147 |
+
def embed_id(x, W, ignore_label=None):
|
148 |
+
r"""Efficient linear function for one-hot input.
|
149 |
+
|
150 |
+
This function implements so called *word embeddings*. It takes two
|
151 |
+
arguments: a set of IDs (words) ``x`` in :math:`B` dimensional integer
|
152 |
+
vector, and a set of all ID (word) embeddings ``W`` in :math:`V \\times d`
|
153 |
+
float32 matrix. It outputs :math:`B \\times d` matrix whose ``i``-th
|
154 |
+
column is the ``x[i]``-th column of ``W``.
|
155 |
+
This function is only differentiable on the input ``W``.
|
156 |
+
|
157 |
+
Args:
|
158 |
+
x (chainer.Variable | np.ndarray): Batch vectors of IDs. Each
|
159 |
+
element must be signed integer.
|
160 |
+
W (chainer.Variable | np.ndarray): Distributed representation
|
161 |
+
of each ID (a.k.a. word embeddings).
|
162 |
+
ignore_label (int): If ignore_label is an int value, i-th column
|
163 |
+
of return value is filled with 0.
|
164 |
+
|
165 |
+
Returns:
|
166 |
+
chainer.Variable: Embedded variable.
|
167 |
+
|
168 |
+
|
169 |
+
.. rubric:: :class:`~chainer.links.EmbedID`
|
170 |
+
|
171 |
+
Examples:
|
172 |
+
|
173 |
+
>>> x = np.array([2, 1]).astype('i')
|
174 |
+
>>> x
|
175 |
+
array([2, 1], dtype=int32)
|
176 |
+
>>> W = np.array([[0, 0, 0],
|
177 |
+
... [1, 1, 1],
|
178 |
+
... [2, 2, 2]]).astype('f')
|
179 |
+
>>> W
|
180 |
+
array([[ 0., 0., 0.],
|
181 |
+
[ 1., 1., 1.],
|
182 |
+
[ 2., 2., 2.]], dtype=float32)
|
183 |
+
>>> F.embed_id(x, W).data
|
184 |
+
array([[ 2., 2., 2.],
|
185 |
+
[ 1., 1., 1.]], dtype=float32)
|
186 |
+
>>> F.embed_id(x, W, ignore_label=1).data
|
187 |
+
array([[ 2., 2., 2.],
|
188 |
+
[ 0., 0., 0.]], dtype=float32)
|
189 |
+
|
190 |
+
"""
|
191 |
+
return EmbedIDFunction(ignore_label=ignore_label).apply((x, W))[0]
|
192 |
+
|
193 |
+
|
194 |
+
class EmbedID(link.Link):
|
195 |
+
"""Efficient linear layer for one-hot input.
|
196 |
+
|
197 |
+
This is a link that wraps the :func:`~chainer.functions.embed_id` function.
|
198 |
+
This link holds the ID (word) embedding matrix ``W`` as a parameter.
|
199 |
+
|
200 |
+
Args:
|
201 |
+
in_size (int): Number of different identifiers (a.k.a. vocabulary size).
|
202 |
+
out_size (int): Output dimension.
|
203 |
+
initialW (Initializer): Initializer to initialize the weight.
|
204 |
+
ignore_label (int): If `ignore_label` is an int value, i-th column of
|
205 |
+
return value is filled with 0.
|
206 |
+
|
207 |
+
.. rubric:: :func:`~chainer.functions.embed_id`
|
208 |
+
|
209 |
+
Attributes:
|
210 |
+
W (~chainer.Variable): Embedding parameter matrix.
|
211 |
+
|
212 |
+
Examples:
|
213 |
+
|
214 |
+
>>> W = np.array([[0, 0, 0],
|
215 |
+
... [1, 1, 1],
|
216 |
+
... [2, 2, 2]]).astype('f')
|
217 |
+
>>> W
|
218 |
+
array([[ 0., 0., 0.],
|
219 |
+
[ 1., 1., 1.],
|
220 |
+
[ 2., 2., 2.]], dtype=float32)
|
221 |
+
>>> l = L.EmbedID(W.shape[0], W.shape[1], initialW=W)
|
222 |
+
>>> x = np.array([2, 1]).astype('i')
|
223 |
+
>>> x
|
224 |
+
array([2, 1], dtype=int32)
|
225 |
+
>>> y = l(x)
|
226 |
+
>>> y.data
|
227 |
+
array([[ 2., 2., 2.],
|
228 |
+
[ 1., 1., 1.]], dtype=float32)
|
229 |
+
|
230 |
+
"""
|
231 |
+
|
232 |
+
ignore_label = None
|
233 |
+
|
234 |
+
def __init__(self, in_size, out_size, initialW=None, ignore_label=None):
|
235 |
+
super(EmbedID, self).__init__()
|
236 |
+
self.ignore_label = ignore_label
|
237 |
+
|
238 |
+
with self.init_scope():
|
239 |
+
if initialW is None:
|
240 |
+
initialW = normal.Normal(1.0)
|
241 |
+
self.W = variable.Parameter(initialW, (in_size, out_size))
|
242 |
+
|
243 |
+
def __call__(self, x):
|
244 |
+
"""Extracts the word embedding of given IDs.
|
245 |
+
|
246 |
+
Args:
|
247 |
+
x (chainer.Variable): Batch vectors of IDs.
|
248 |
+
|
249 |
+
Returns:
|
250 |
+
chainer.Variable: Batch of corresponding embeddings.
|
251 |
+
|
252 |
+
"""
|
253 |
+
return embed_id(x, self.W, ignore_label=self.ignore_label)
|
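
The comment in EmbedIDGrad.forward describes replacing the atomicAdd-based scatter with a one-hot matrix product. The standalone numpy sketch below (toy shapes, not part of the file above) checks that the two formulations agree.

import numpy as np

n_vocab, dim = 5, 3
x = np.array([2, 1, 2], dtype=np.int32)                 # token IDs (with a repeat)
gy = np.random.randn(len(x), dim).astype(np.float32)    # upstream gradients

# reference: scatter-add of each gradient row into its embedding row
gW_ref = np.zeros((n_vocab, dim), dtype=np.float32)
np.add.at(gW_ref, x, gy)

# one-hot formulation, as in the CUDA-friendly branch of EmbedIDGrad
xi = np.zeros((x.size, n_vocab), dtype=np.float32)
xi[np.arange(x.size), x] = 1.0
gW_onehot = xi.T.dot(gy)

assert np.allclose(gW_ref, gW_onehot)
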