camenduru's picture
thanks to NVIDIA ❤
7934b29
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# Task 1: Speech Command Recognition
## Preparing the dataset
Use the `process_speech_commands_data.py` script under <NEMO_ROOT>/scripts/dataset_processing in order to prepare the dataset.
```sh
python <NEMO_ROOT>/scripts/dataset_processing/process_speech_commands_data.py \
--data_root=<absolute path to where the data should be stored> \
--data_version=<either 1 or 2, indicating version of the dataset> \
--class_split=<either "all" or "sub", indicates whether all 30/35 classes should be used, or the 10+2 split should be used> \
--rebalance \
--log
```
## Train to convergence
```sh
python speech_to_label.py \
# (Optional: --config-path=<path to dir of configs> --config-name=<name of config without .yaml>) \
model.train_ds.manifest_filepath="<path to train manifest>" \
model.validation_ds.manifest_filepath=["<path to val manifest>","<path to test manifest>"] \
trainer.devices=2 \
trainer.accelerator="gpu" \
trainer.strategy="ddp" \
trainer.max_epochs=200 \
exp_manager.create_wandb_logger=True \
exp_manager.wandb_logger_kwargs.name="MatchboxNet-3x1x64-v1" \
exp_manager.wandb_logger_kwargs.project="MatchboxNet-v1" \
+trainer.precision=16 \
+trainer.amp_level=O1 # needed if using PyTorch < 1.6
```
# Task 2: Voice Activity Detection
## Preparing the dataset
Use the `process_vad_data.py` script under <NEMO_ROOT>/scripts/dataset_processing in order to prepare the dataset.
```sh
python process_vad_data.py \
--out_dir=<output path to where the generated manifest should be stored> \
--speech_data_root=<path where the speech data are stored> \
--background_data_root=<path where the background data are stored> \
--rebalance_method=<'under' or 'over' or 'fixed'> \
--log
(Optional --demo (for demonstration in tutorial). If you want to use your own background noise data, make sure to delete --demo)
```
## Train to convergence
```sh
python speech_to_label.py \
--config-path=<path to dir of configs e.g. "conf"> \
--config-name=<name of config without .yaml e.g. "matchboxnet_3x1x64_vad"> \
model.train_ds.manifest_filepath="<path to train manifest>" \
model.validation_ds.manifest_filepath=["<path to val manifest>","<path to test manifest>"] \
trainer.devices=2 \
trainer.accelerator="gpu" \
trainer.strategy="ddp" \
trainer.max_epochs=200 \
exp_manager.create_wandb_logger=True \
exp_manager.wandb_logger_kwargs.name="MatchboxNet-3x1x64-vad" \
exp_manager.wandb_logger_kwargs.project="MatchboxNet-vad" \
+trainer.precision=16 \
+trainer.amp_level=O1 # needed if using PyTorch < 1.6
```
# Task 3: Language Identification
## Preparing the dataset
Use the `filelist_to_manifest.py` script under <NEMO_ROOT>/scripts/speaker_tasks in order to prepare the dataset.
## Train to convergence
```sh
python speech_to_label.py \
--config-path=<path to dir of configs e.g. "../conf/lang_id"> \
--config-name=<name of config without .yaml e.g. "titanet_large"> \
model.train_ds.manifest_filepath="<path to train manifest>" \
model.validation_ds.manifest_filepath="<path to val manifest>" \
model.train_ds.augmentor.noise.manifest_path="<path to noise manifest>" \
model.train_ds.augmentor.impulse.manifest_path="<path to impulse manifest>" \
model.decoder.num_classes=<num of languages> \
trainer.devices=2 \
trainer.max_epochs=40 \
exp_manager.create_wandb_logger=True \
exp_manager.wandb_logger_kwargs.name="titanet" \
exp_manager.wandb_logger_kwargs.project="langid" \
+exp_manager.checkpoint_callback_params.monitor="val_acc_macro" \
+exp_manager.checkpoint_callback_params.mode="max" \
+trainer.precision=16
```
# Optional: Use tarred dataset to speed up data loading. Apply to both tasks.
## Prepare tarred dataset.
Prepare ONE manifest that contains all training data you would like to include. Validation should use non-tarred dataset.
Note that a tarred dataset may impact validation scores because it drops samples in order to keep the same number of files in every tar file;
scores might be off since some data is missing.
Use the `convert_to_tarred_audio_dataset.py` script under <NEMO_ROOT>/scripts/speech_recognition in order to prepare tarred audio dataset.
For details, please see TarredAudioToClassificationLabelDataset in <NEMO_ROOT>/nemo/collections/asr/data/audio_to_label.py
python speech_to_label.py \
--config-path=<path to dir of configs e.g. "conf"> \
--config-name=<name of config without .yaml e.g. "matchboxnet_3x1x64_vad"> \
model.train_ds.manifest_filepath=<path to train tarred_audio_manifest.json> \
model.train_ds.is_tarred=True \
model.train_ds.tarred_audio_filepaths=<path to train tarred audio dataset e.g. audio_{0..2}.tar> \
+model.train_ds.num_worker=<num_shards used generating tarred dataset> \
model.validation_ds.manifest_filepath=<path to validation audio_manifest.json> \
trainer.devices=2 \
trainer.accelerator="gpu" \
trainer.strategy="ddp" \
trainer.max_epochs=200 \
exp_manager.create_wandb_logger=True \
exp_manager.wandb_logger_kwargs.name="MatchboxNet-3x1x64-vad" \
exp_manager.wandb_logger_kwargs.project="MatchboxNet-vad" \
+trainer.precision=16 \
+trainer.amp_level=O1 # needed if using PyTorch < 1.6
# Fine-tune a model
For documentation on fine-tuning this model, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/configs.html#fine-tuning-configurations
# Pretrained Models
For documentation on existing pretrained models, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speech_classification/results.html#
"""
import pytorch_lightning as pl
import torch
from omegaconf import OmegaConf
from nemo.collections.asr.models import EncDecClassificationModel, EncDecSpeakerLabelModel
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager
@hydra_runner(config_path="../conf/matchboxnet", config_name="matchboxnet_3x1x64_v1")
def main(cfg):
    """Train a speech classification model and optionally run it on a test set.

    Builds a PyTorch Lightning trainer from ``cfg.trainer``, hands it to
    ``exp_manager``, instantiates ``EncDecSpeakerLabelModel`` when ``cfg.name``
    contains "titanet" (case-insensitive) or ``EncDecClassificationModel``
    otherwise, and fits the model. When ``cfg.model.test_ds.manifest_filepath``
    is set, the global-zero process builds a new single-device trainer and runs
    the test loop if ``model.prepare_test`` returns a truthy value.

    Args:
        cfg: Configuration object; this function reads ``cfg.name``,
            ``cfg.trainer``, ``cfg.model`` and the optional ``cfg.exp_manager``
            section.
    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    if 'titanet' in cfg.name.lower():
        model = EncDecSpeakerLabelModel(cfg=cfg.model, trainer=trainer)
    else:
        model = EncDecClassificationModel(cfg=cfg.model, trainer=trainer)

    # Initialize the weights of the model from another model, if provided via config
    model.maybe_init_from_pretrained_checkpoint(cfg)
    trainer.fit(model)

    # Tear down the default process group only if one was actually created:
    # single-process runs never initialise torch.distributed, and destroying a
    # non-existent group raises instead of being a no-op.
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        torch.distributed.destroy_process_group()

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
        if trainer.is_global_zero:
            # Testing uses a new trainer restricted to one device.
            trainer = pl.Trainer(devices=1, accelerator=cfg.trainer.accelerator, strategy=cfg.trainer.strategy)
            if model.prepare_test(trainer):
                trainer.test(model)
if __name__ == "__main__":
    # `main` is called without arguments; the `hydra_runner` decorator is
    # expected to supply `cfg`, hence the lint suppression below.
    main()  # noqa pylint: disable=no-value-for-parameter