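# Page header text captured together with this file. It is not part of the
# configuration and is kept as comments so that the document parses as YAML.
#
# Automatic Speech Recognition
# NeMo
# PyTorch
# Icelandic
# speech
# audio
# CTC
# NeMo
# QuartzNet
# QuartzNet15x5
# icelandic
# Eval Results
# stt_is_quartznet15x5_ft_ep56_875h / QuartzNet_FT15x5_Icelandic.yaml
# carlosdanielhernandezmena's picture
# Adding the acoustic model (.nemo) and the architecture (.yaml) to the Repo
# bbb3d6f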
# Top-level name; anchored as `name` and aliased by `exp_manager.name` below.
name: &name "QuartzNet15x5"
model:
  # Shared scalars. Each one is anchored so that later sections can alias it
  # instead of repeating the literal value.
  sample_rate: &sample_rate 16000
  repeat: &repeat 5
  dropout: &dropout 0.0
  separable: &separable true
  # Output vocabulary (37 entries): space, a-z, and ten additional letters.
  labels: &labels [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
    "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
    "á", "æ", "é", "í", "ð", "ó", "ö", "ú", "ý", "þ"]

  # Training data loader settings.
  train_ds:
    # `???` is a placeholder; presumably it has to be supplied at launch time
    # (OmegaConf mandatory-value marker) -- TODO confirm.
    manifest_filepath: ???
    sample_rate: 16000
    labels: *labels
    # NOTE(review): this value was visually flagged in the original file.
    batch_size: 16
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    num_workers: 8
    pin_memory: true
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  # Validation data loader settings.
  validation_ds:
    # `???` is a placeholder; presumably it has to be supplied at launch time
    # (OmegaConf mandatory-value marker) -- TODO confirm.
    manifest_filepath: ???
    sample_rate: 16000
    labels: *labels
    # NOTE(review): this value was visually flagged in the original file.
    batch_size: 16
    shuffle: false
    num_workers: 8
    pin_memory: true

  # Audio front end. `features` is anchored as `n_mels` and reused as the
  # encoder's `feat_in`.
  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    normalize: "per_feature"
    window_size: 0.02
    sample_rate: *sample_rate
    window_stride: 0.01
    window: "hann"
    features: &n_mels 64
    n_fft: 512
    frame_splicing: 1
    dither: 1.0e-05

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    rect_freq: 50
    rect_masks: 5
    rect_time: 120

  # Encoder: 18 entries in `jasper`. Entries 2-16 alias the shared `repeat`
  # value and set `residual: true`; entries 1, 17 and 18 use `repeat: 1` and
  # `residual: false`.
  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: relu
    conv_mask: true

    jasper:
      # Block 1 (the only entry with stride 2)
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [33]
        repeat: 1
        residual: false
        separable: *separable
        stride: [2]

      # Block 2
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [33]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Block 3
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [33]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Block 4
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [33]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Block 5
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [39]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Block 6
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [39]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Block 7
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [39]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Block 8 (filters increase from 256 to 512 here)
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [51]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Block 9
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [51]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Block 10
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [51]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Block 11
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [63]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Block 12
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [63]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Block 13
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [63]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Block 14
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [75]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Block 15
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [75]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Block 16
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [75]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]

      # Block 17 (the only entry with dilation 2)
      - dilation: [2]
        dropout: *dropout
        filters: 512
        kernel: [87]
        repeat: 1
        residual: false
        separable: *separable
        stride: [1]

      # Block 18 (kernel size 1; no `separable` key is set on this entry).
      # Its filter count is anchored as `enc_filters` for the decoder.
      - dilation: [1]
        dropout: *dropout
        filters: &enc_filters 1024
        kernel: [1]
        repeat: 1
        residual: false
        stride: [1]

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: *enc_filters
    # Equals the number of entries in `labels` above.
    num_classes: 37
    vocabulary: *labels

  optim:
    name: novograd
    # _target_: nemo.core.optim.optimizers.Novograd
    lr: 0.0012
    # optimizer arguments
    betas: [0.95, 0.25]
    weight_decay: 0.001

    # scheduler setup
    sched:
      name: CosineAnnealing

      # pytorch lightning args
      # monitor: val_loss
      # reduce_on_plateau: false

      # Scheduler params
      warmup_steps: null
      warmup_ratio: null
      min_lr: 0.0
      last_epoch: -1
# Trainer settings. The inline comments on `enable_checkpointing` and `logger`
# state that those features are provided by exp_manager instead.
trainer:
  devices: 1  # number of gpus
  max_epochs: 5
  max_steps: -1  # computed at runtime if not set
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  log_every_n_steps: 1  # Interval of logging.
  val_check_interval: 1.0  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  benchmark: false  # needs to be false for models with variable-length speech input as it slows down training
# Experiment manager settings: output directory, run name, loggers and the
# checkpoint callback.
exp_manager:
  exp_dir: null
  # Alias of the top-level `name` anchor.
  name: *name
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  # Metric watched by the checkpoint callback and the direction ("min") used
  # when comparing its values.
  checkpoint_callback_params:
    monitor: "val_wer"
    mode: "min"
  create_wandb_logger: false
  # Only relevant when `create_wandb_logger` is enabled -- presumably; confirm.
  wandb_logger_kwargs:
    name: null
    project: null