# Experiment name. Anchored as &name so it can be reused further down
# (exp_manager.name aliases it).
name: &name "QuartzNet15x5"

# Model section.
# NOTE(review): this nesting was rebuilt from a flattened copy in which every
# line sat at column 0 with a trailing " |". The hierarchy follows from the
# keys themselves (repeated `name`/`sample_rate`/`labels` keys can only live in
# separate sub-mappings); confirm it against the schema the loader expects.
model:
  # Shared values, anchored once and aliased by the sub-sections below.
  sample_rate: &sample_rate 16000
  repeat: &repeat 5
  dropout: &dropout 0.0
  separable: &separable true
  # 34 output symbols: space, a-z, then the accented set below.
  labels: &labels [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k",
    "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
    "á", "é", "í", "ñ", "ó", "ú", "ü"]

  # Training data.
  train_ds:
    # `???` is left unset on purpose and has to be supplied by the caller
    # (presumably the OmegaConf mandatory-value marker - confirm the loader).
    manifest_filepath: ???
    sample_rate: *sample_rate
    labels: *labels
    batch_size: 16
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    num_workers: 8
    pin_memory: true

    # Tarred-dataset options (disabled here).
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048

    # Bucketing options.
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  # Validation data: same audio settings as train_ds, no shuffling.
  validation_ds:
    manifest_filepath: ???
    sample_rate: *sample_rate
    labels: *labels
    batch_size: 16
    shuffle: false
    num_workers: 8
    pin_memory: true

  # Audio front end. `features` is anchored as &n_mels and consumed by
  # encoder.feat_in, so the two always agree.
  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    normalize: "per_feature"
    window_size: 0.02
    sample_rate: *sample_rate
    window_stride: 0.01
    window: "hann"
    features: &n_mels 64
    n_fft: 512
    frame_splicing: 1
    dither: 1.0e-05

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    rect_freq: 50
    rect_masks: 5
    rect_time: 120

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: relu
    conv_mask: true

    # 18 block specs: one stride-2 prologue, 15 residual blocks (each repeated
    # *repeat times, in five groups of three by kernel width), then two
    # non-residual epilogue blocks.
    jasper:
      # Prologue: the only block with stride 2.
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [33]
        repeat: 1
        residual: false
        separable: *separable
        stride: [2]

      # Group 1: 256 filters, kernel 33.
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [33]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [33]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [33]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Group 2: 256 filters, kernel 39.
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [39]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [39]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [39]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Group 3: 512 filters, kernel 51.
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [51]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [51]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [51]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Group 4: 512 filters, kernel 63.
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [63]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [63]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [63]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Group 5: 512 filters, kernel 75.
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [75]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [75]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [75]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Epilogue 1: the only dilated block (dilation 2, kernel 87).
      - dilation: [2]
        dropout: *dropout
        filters: 512
        kernel: [87]
        repeat: 1
        residual: false
        separable: *separable
        stride: [1]

      # Epilogue 2: kernel-1 block. It has no `separable` key in the source,
      # and its width is anchored as &enc_filters for decoder.feat_in.
      - dilation: [1]
        dropout: *dropout
        filters: &enc_filters 1024
        kernel: [1]
        repeat: 1
        residual: false
        stride: [1]

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: *enc_filters
    # Keep equal to the number of entries in `labels` (34).
    num_classes: 34
    vocabulary: *labels

  optim:
    name: novograd
    lr: 0.0012
    betas: [0.8, 0.5]
    weight_decay: 0.001

    # NOTE(review): `sched` is placed under `optim` because it carries its own
    # `name` key and directly follows the optimizer arguments - confirm.
    sched:
      name: CosineAnnealing
      warmup_steps: null
      warmup_ratio: null
      min_lr: 0.0
      last_epoch: -1

# Trainer settings: one GPU device on one node, 5 epochs.
trainer:
  devices: 1
  max_epochs: 5
  max_steps: -1
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  # Checkpointing and logging are switched off here; exp_manager enables its
  # own checkpoint callback and TensorBoard logger (presumably it takes over
  # both - confirm).
  enable_checkpointing: false
  logger: false
  log_every_n_steps: 1
  val_check_interval: 1.0
  benchmark: false

# Experiment manager: logging and checkpoint bookkeeping.
exp_manager:
  exp_dir: null
  # Reuses the top-level experiment name through its anchor.
  name: *name
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  # Checkpoints are ranked on "val_wer", lower is better.
  checkpoint_callback_params:
    monitor: "val_wer"
    mode: "min"
  # Weights & Biases logging is off; the kwargs stay unset.
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null