# File-viewer metadata: file size 8,222 bytes, revision 39a3709
# Generated 2022-10-03 from:
# /netscratch/sagar/thesis/speechbrain/recipes/CommonVoice_de/ASR-Libri/seq2seq/hparams/train.yaml
# yamllint disable
# ############################################################################
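# Model: end-to-end ASR with an attention-based encoder-decoder
# Encoder: CRDNN model
# Decoder: GRU + beamsearch + RNNLM
# Tokens: BPE with unigram
# Losses: CTC + NLL
# Training: Librispeech 960h (header text; the data paths in this file point
# to CommonVoice with `language: de`)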
# Authors: Ju-Chieh Chou, Mirco Ravanelli, Abdel Heba, Peter Plantinga,
# Samuele Cornell 2020
# ############################################################################
# Seed needs to be set at top of yaml, before objects with parameters
# (the `!apply:` entry calls torch.manual_seed with the same value as `seed`).
seed: 1200
__set_seed: !apply:torch.manual_seed [1200]
# Experiment outputs; every path below lives under the seed-specific folder.
output_folder: results/CRDNN_BPE_960h_LM/1200
wer_file: results/CRDNN_BPE_960h_LM/1200/wer.txt
save_folder: results/CRDNN_BPE_960h_LM/1200/save
train_log: results/CRDNN_BPE_960h_LM/1200/train_log.txt
# Language model (LM) pretraining
# NB: To avoid a mismatch, the speech recognizer must be trained with the same
# tokenizer that was used for LM training. The tokenizer and LM can be fetched
# from the SpeechBrain HuggingFace repository, or a local path pointing to a
# directory containing the LM checkpoint and tokenizer can be given instead,
# e.g., if you want to use your own LM / tokenizer. The two entries below are
# local paths.
# bos/eos ids are 0/0 in this file, so the tokenizer and LM must also use
# bos id 0 and eos id 0.
pretrained_tokenizer_path: ../../Tokenizer/results/unigram/
pretrained_lm_path: ../../LM/results/RNN/2995/save/CKPT+2022-08-18+18-22-18+00
# Data files
data_folder: ../../CommonVoice # !PLACEHOLDER
# e.g., /path/to/CommonVoice
# noise/RIRs dataset will automatically be downloaded
# CommonVoice split manifests (.tsv) and language settings
train_tsv_file: ../../CommonVoice/train.tsv # Standard CommonVoice .tsv files
dev_tsv_file: ../../CommonVoice/dev.tsv # Standard CommonVoice .tsv files
test_tsv_file: ../../CommonVoice/test.tsv # Standard CommonVoice .tsv files
accented_letters: true
language: de
ckpt_interval_minutes: 15 # save checkpoint every N min
# Prepared .csv manifests; all three live in `csv_dir`.
csv_dir: ../../cv_de_acc
data_folder_rirs: ../../cv_de_acc # where to store noisy data for augment (change it if needed)
train_csv: ../../cv_de_acc/train.csv
valid_csv: ../../cv_de_acc/dev.csv
test_csv: ../../cv_de_acc/test.csv
# NOTE(review): presumably skips data preparation when true; confirm in the
# training script.
skip_prep: false
# Training parameters
# number_of_epochs has the same value as epoch_counter's `limit` (25).
number_of_epochs: 25
number_of_ctc_epochs: 5
# The three batch sizes are repeated in the *_dataloader_opts entries.
batch_size: 8
valid_batch_size: 8
test_batch_size: 8
# Same value as opt_class `lr` and lr_annealing `initial_value`.
lr: 1.0
ctc_weight: 0.5
sorting: ascending
dynamic_batching: false
# Dynamic batching parameters, if used (`dynamic_batching` is false in this
# file). The entries are nested so they form one mapping instead of leaking
# out as separate top-level keys.
dynamic_batch_sampler:
  feats_hop_size: 0.01
  max_batch_len: 20000  # in terms of frames
  shuffle_ex: true
  batch_ordering: random
  num_buckets: 20
# Feature parameters
# The same three values are passed to `compute_features` (Fbank).
sample_rate: 16000
n_fft: 400
n_mels: 40
# Optimizer factory: `!name:` binds the keyword arguments below to
# torch.optim.Adadelta without instantiating it. The arguments must be nested
# under the tag; at column 0 they would duplicate the top-level `lr` key.
opt_class: !name:torch.optim.Adadelta
  lr: 1.0
  rho: 0.95
  eps: 1.e-8
# Dataloader options
# One mapping per split. Each `batch_size` is nested under its own mapping;
# left at column 0 the three entries would collapse into one duplicated
# top-level `batch_size` key and the *_dataloader_opts values would be empty.
train_dataloader_opts:
  batch_size: 8
valid_dataloader_opts:
  batch_size: 8
test_dataloader_opts:
  batch_size: 8
# Model parameters
# `activation` and `rnn_class` are bare `!name:` references (anchored as
# id001 / id002 and reused by `enc`). The keys that follow them are separate
# top-level hyperparameters; the same values are repeated in the `enc`, `emb`,
# `dec`, `ctc_lin` and `seq_lin` entries.
activation: &id001 !name:torch.nn.LeakyReLU
dropout: 0.15
cnn_blocks: 2
cnn_channels: (128, 256)
inter_layer_pooling_size: (2, 2)
cnn_kernelsize: (3, 3)
time_pooling_size: 4
rnn_class: &id002 !name:speechbrain.nnet.RNN.LSTM
rnn_layers: 4
rnn_neurons: 1024
rnn_bidirectional: true
dnn_blocks: 2
dnn_neurons: 512
emb_size: 128
dec_neurons: 1024
output_neurons: 1000 # Number of tokens (same as LM)
# blank, bos and eos all share index 0 in this configuration.
blank_index: 0
bos_index: 0
eos_index: 0
# Decoding parameters
# These values are repeated literally in `valid_search` and `test_search`;
# `lm_weight`, `ctc_weight_decode` (as `ctc_weight`) and `temperature_lm`
# appear only in `test_search`.
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_beam_size: 80
test_beam_size: 80
eos_threshold: 1.5
using_max_attn_shift: true
max_attn_shift: 240
lm_weight: 0.50
ctc_weight_decode: 0.0
coverage_penalty: 1.5
temperature: 1.25
temperature_lm: 1.25
# Epoch counter object (anchored as id013 and registered as `counter` in the
# checkpointer). `limit` is its constructor argument and equals
# number_of_epochs.
epoch_counter: &id013 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 25
# Input normalization object (anchored as id008; reused in `modules` and as
# `normalizer` in the checkpointer). `norm_type` is its constructor argument.
normalize: &id008 !new:speechbrain.processing.features.InputNormalization
  norm_type: global
# Fbank feature extractor. Its arguments repeat the top-level feature
# parameters and must be nested, otherwise they duplicate the top-level
# `sample_rate`, `n_fft` and `n_mels` keys.
compute_features: !new:speechbrain.lobes.features.Fbank
  sample_rate: 16000
  n_fft: 400
  n_mels: 40
# Environmental corruption (anchored as id009; reused in `modules`).
# With these probabilities only additive noise is enabled: babble and reverb
# are 0.0, noise is 1.0. `openrir_folder` equals `data_folder_rirs`.
# NOTE(review): the SNR bounds are presumably in dB; confirm against the
# EnvCorrupt documentation.
env_corrupt: &id009 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: ../../cv_de_acc
  babble_prob: 0.0
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
# Time-domain augmentation object; `sample_rate` and `speeds` are its
# constructor arguments and are nested so they do not duplicate the top-level
# `sample_rate` key.
# NOTE(review): `speeds` presumably lists speed-perturbation percentages;
# confirm against the TimeDomainSpecAugment documentation.
augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: 16000
  speeds: [95, 100, 105]
# Encoder: CRDNN object (anchored as id003; reused in `modules` and `model`).
# All entries below are constructor arguments. They repeat the top-level model
# parameters and reuse the `activation` (id001) and `rnn_class` (id002)
# anchors. The last input dimension (40) equals `n_mels`.
enc: &id003 !new:speechbrain.lobes.models.CRDNN.CRDNN
  input_shape: [null, null, 40]
  activation: *id001
  dropout: 0.15
  cnn_blocks: 2
  cnn_channels: (128, 256)
  cnn_kernelsize: (3, 3)
  inter_layer_pooling_size: (2, 2)
  time_pooling: true
  using_2d_pooling: false
  time_pooling_size: 4
  rnn_class: *id002
  rnn_layers: 4
  rnn_neurons: 1024
  rnn_bidirectional: true
  rnn_re_init: true
  dnn_blocks: 2
  dnn_neurons: 512
  use_rnnp: false
# Token embedding object (anchored as id004; reused in `modules`, `model` and
# both beam searchers). `num_embeddings` equals `output_neurons` and
# `embedding_dim` equals `emb_size`.
emb: &id004 !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: 1000
  embedding_dim: 128
# Decoder: attentional RNN decoder object (anchored as id005; reused in
# `modules`, `model` and both beam searchers). All entries are constructor
# arguments. `enc_dim` equals the encoder's `dnn_neurons` (512), `input_size`
# equals `emb_size` (128) and `hidden_size` equals `dec_neurons` (1024).
dec: &id005 !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
  enc_dim: 512
  input_size: 128
  rnn_type: gru
  attn_type: location
  hidden_size: 1024
  attn_dim: 1024
  num_layers: 1
  scaling: 1.0
  channels: 10
  kernel_size: 100
  re_init: true
  dropout: 0.15
# Output projections. Both produce `output_neurons` (1000) outputs.
# ctc_lin (id006) takes inputs of size 512, the encoder's `dnn_neurons`;
# seq_lin (id007) takes inputs of size 1024, equal to `dec_neurons`.
# Each object's arguments are nested under it; at column 0 the two pairs would
# collide as duplicate `input_size` / `n_neurons` keys.
ctc_lin: &id006 !new:speechbrain.nnet.linear.Linear
  input_size: 512
  n_neurons: 1000
seq_lin: &id007 !new:speechbrain.nnet.linear.Linear
  input_size: 1024
  n_neurons: 1000
# Softmax activation object constructed with `apply_log: true`.
log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: true
# Loss functions, bound with `!name:` so the keyword argument is preset and
# the function is called later by the training code.
# `blank_index` here equals the top-level `blank_index` (0).
ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
  blank_index: 0
seq_cost: !name:speechbrain.nnet.losses.nll_loss
  label_smoothing: 0.1
# This is the RNNLM used for LM-fused decoding (anchored as id010; reused in
# `modules`, `test_search` and the pretrainer's `loadables`).
# NB: The architecture has to match the pre-trained RNNLM checkpoint!
# All entries are constructor arguments; `output_neurons` equals the ASR
# `output_neurons` (1000).
lm_model: &id010 !new:speechbrain.lobes.models.RNNLM.RNNLM
  output_neurons: 1000
  embedding_dim: 128
  activation: !name:torch.nn.LeakyReLU
  dropout: 0.0
  rnn_layers: 2
  rnn_neurons: 2048
  dnn_blocks: 1
  dnn_neurons: 512
  return_hidden: true  # For inference
# SentencePiece tokenizer object, created without arguments; anchored as
# id014 so the pretrainer's `loadables` can reference it.
tokenizer: &id014 !new:sentencepiece.SentencePieceProcessor
# Models
# Name -> object mapping built from the anchors defined earlier in the file.
# The entries are nested under `modules`; at column 0 each would redefine the
# top-level key of the same name (`enc`, `emb`, ...).
modules:
  enc: *id003
  emb: *id004
  dec: *id005
  ctc_lin: *id006
  seq_lin: *id007
  normalize: *id008
  env_corrupt: *id009
  lm_model: *id010
# ModuleList grouping enc, emb, dec, ctc_lin and seq_lin (anchored as id011;
# stored as `model` in the checkpointer). The nested sequence supplies the
# constructor's single positional argument, the list of modules.
model: &id011 !new:torch.nn.ModuleList
  - [*id003, *id004, *id005, *id006, *id007]
# Beam searcher without a language model (S2SRNNBeamSearcher). It reuses the
# emb / dec / seq_lin / ctc_lin objects through their anchors, and the
# numeric arguments repeat the top-level decoding parameters.
# NOTE(review): the key name suggests this searcher is used for validation;
# confirm in the training script.
valid_search: !new:speechbrain.decoders.S2SRNNBeamSearcher
  embedding: *id004
  decoder: *id005
  linear: *id007
  ctc_linear: *id006
  bos_index: 0
  eos_index: 0
  blank_index: 0
  min_decode_ratio: 0.0
  max_decode_ratio: 1.0
  beam_size: 80
  eos_threshold: 1.5
  using_max_attn_shift: true
  max_attn_shift: 240
  coverage_penalty: 1.5
  temperature: 1.25
# Beam searcher with RNNLM fusion (S2SRNNBeamSearchLM). Compared with
# `valid_search` it adds `language_model` (the id010 RNNLM), `lm_weight`,
# `ctc_weight` and `temperature_lm`. `ctc_weight` here carries the value of
# the top-level `ctc_weight_decode` (0.0), not the training `ctc_weight`
# (0.5).
# NOTE(review): the key name suggests this searcher is used for testing;
# confirm in the training script.
test_search: !new:speechbrain.decoders.S2SRNNBeamSearchLM
  embedding: *id004
  decoder: *id005
  linear: *id007
  ctc_linear: *id006
  language_model: *id010
  bos_index: 0
  eos_index: 0
  blank_index: 0
  min_decode_ratio: 0.0
  max_decode_ratio: 1.0
  beam_size: 80
  eos_threshold: 1.5
  using_max_attn_shift: true
  max_attn_shift: 240
  coverage_penalty: 1.5
  lm_weight: 0.50
  ctc_weight: 0.0
  temperature: 1.25
  temperature_lm: 1.25
# Learning-rate scheduler object (NewBobScheduler; anchored as id012 and
# stored as `scheduler` in the checkpointer). `initial_value` equals the
# top-level `lr` (1.0).
lr_annealing: &id012 !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: 1.0
  improvement_threshold: 0.0025
  annealing_factor: 0.8
  patient: 0
# Checkpointer object. `checkpoints_dir` equals `save_folder`. `recoverables`
# is a nested mapping from a name to the object to save and restore: the
# ModuleList (id011), the LR scheduler (id012), the input normalizer (id008)
# and the epoch counter (id013).
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: results/CRDNN_BPE_960h_LM/1200/save
  recoverables:
    model: *id011
    scheduler: *id012
    normalizer: *id008
    counter: *id013
# File-based training logger; `save_file` equals the top-level `train_log`.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: results/CRDNN_BPE_960h_LM/1200/train_log.txt
# Error-rate metric factories, both bound to ErrorRateStats via `!name:`.
# `error_rate_computer` takes no preset arguments; `cer_computer` presets
# `split_tokens: true`, which must be nested under it rather than left as a
# stray top-level key.
error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: true
# The pretrainer maps pretrained files to instances declared in this yaml.
# `loadables` names the target objects (the RNNLM id010 and the tokenizer
# id014); `paths` gives the file for each name, using the same keys. Here both
# files are local: the LM checkpoint is loaded into "lm", which points to the
# <lm_model> defined before, and the SentencePiece model into "tokenizer".
# `collect_in` equals `save_folder`.
# The double slash in the tokenizer path is kept as is: it is
# `pretrained_tokenizer_path` (which ends in "/") followed by
# "/1000_unigram.model".
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  collect_in: results/CRDNN_BPE_960h_LM/1200/save
  loadables:
    lm: *id010
    tokenizer: *id014
  paths:
    lm: ../../LM/results/RNN/2995/save/CKPT+2022-08-18+18-22-18+00/model.ckpt
    tokenizer: ../../Tokenizer/results/unigram//1000_unigram.model