Upload speechbrain IC model
Browse files- .gitattributes +9 -0
- fluent-speech-commands/README.md +58 -0
- fluent-speech-commands/Tokenizer/hparams/tokenizer_bpe51.yaml +32 -0
- fluent-speech-commands/Tokenizer/prepare.py +1 -0
- fluent-speech-commands/Tokenizer/train.py +53 -0
- fluent-speech-commands/direct/__pycache__/prepare.cpython-37.pyc +0 -0
- fluent-speech-commands/direct/hparams/train.yaml +204 -0
- fluent-speech-commands/direct/prepare.py +1 -0
- fluent-speech-commands/direct/pretrained_models/EncoderDecoderASR-6406358104753086746/asr.ckpt +1 -0
- fluent-speech-commands/direct/pretrained_models/EncoderDecoderASR-6406358104753086746/hyperparams.yaml +1 -0
- fluent-speech-commands/direct/pretrained_models/EncoderDecoderASR-6406358104753086746/lm.ckpt +1 -0
- fluent-speech-commands/direct/pretrained_models/EncoderDecoderASR-6406358104753086746/normalizer.ckpt +1 -0
- fluent-speech-commands/direct/pretrained_models/EncoderDecoderASR-6406358104753086746/tokenizer.ckpt +1 -0
- fluent-speech-commands/direct/results/BPE51/112011/env.log +434 -0
- fluent-speech-commands/direct/results/BPE51/112011/hyperparams.yaml +200 -0
- fluent-speech-commands/direct/results/BPE51/112011/log.txt +454 -0
- fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/CKPT.yaml +4 -0
- fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/brain.ckpt +2 -0
- fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/counter.ckpt +1 -0
- fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/dataloader-TRAIN.ckpt +1 -0
- fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/model.ckpt +3 -0
- fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/optimizer.ckpt +3 -0
- fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/scheduler.ckpt +0 -0
- fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/CKPT.yaml +4 -0
- fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/brain.ckpt +2 -0
- fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/counter.ckpt +1 -0
- fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/dataloader-TRAIN.ckpt +1 -0
- fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/model.ckpt +3 -0
- fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/optimizer.ckpt +3 -0
- fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/scheduler.ckpt +0 -0
- fluent-speech-commands/direct/results/BPE51/112011/save/FSC_tokenizer/tokenizer.ckpt +0 -0
- fluent-speech-commands/direct/results/BPE51/112011/test.csv +0 -0
- fluent-speech-commands/direct/results/BPE51/112011/train.csv +0 -0
- fluent-speech-commands/direct/results/BPE51/112011/train.py +347 -0
- fluent-speech-commands/direct/results/BPE51/112011/train_log.txt +5 -0
- fluent-speech-commands/direct/results/BPE51/112011/valid.csv +0 -0
- fluent-speech-commands/direct/results/BPE51/112011/wer_test.txt +0 -0
- fluent-speech-commands/direct/train.py +347 -0
- fluent-speech-commands/extra_requirements.txt +1 -0
- fluent-speech-commands/prepare.py +103 -0
- pretrained_models/EncoderDecoderASR--5348169877143464308/asr.ckpt +1 -0
- pretrained_models/EncoderDecoderASR--5348169877143464308/hyperparams.yaml +1 -0
- pretrained_models/EncoderDecoderASR--5348169877143464308/lm.ckpt +1 -0
- pretrained_models/EncoderDecoderASR--5348169877143464308/normalizer.ckpt +1 -0
- pretrained_models/EncoderDecoderASR--5348169877143464308/tokenizer.ckpt +1 -0
- pretrained_models/EndToEndSLU-7990244956535603082/hyperparams.yaml +1 -0
- pretrained_models/EndToEndSLU-7990244956535603082/model.ckpt +1 -0
- pretrained_models/EndToEndSLU-7990244956535603082/tokenizer.ckpt +1 -0
.gitattributes
CHANGED
@@ -25,3 +25,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
fluent-speech-commands/direct/pretrained_models/EncoderDecoderASR-6406358104753086746/asr.ckpt filter=lfs diff=lfs merge=lfs -text
|
29 |
+
fluent-speech-commands/direct/pretrained_models/EncoderDecoderASR-6406358104753086746/lm.ckpt filter=lfs diff=lfs merge=lfs -text
|
30 |
+
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/model.ckpt filter=lfs diff=lfs merge=lfs -text
|
31 |
+
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/optimizer.ckpt filter=lfs diff=lfs merge=lfs -text
|
32 |
+
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/model.ckpt filter=lfs diff=lfs merge=lfs -text
|
33 |
+
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/optimizer.ckpt filter=lfs diff=lfs merge=lfs -text
|
34 |
+
pretrained_models/EncoderDecoderASR--5348169877143464308/asr.ckpt filter=lfs diff=lfs merge=lfs -text
|
35 |
+
pretrained_models/EncoderDecoderASR--5348169877143464308/lm.ckpt filter=lfs diff=lfs merge=lfs -text
|
36 |
+
pretrained_models/EndToEndSLU-7990244956535603082/model.ckpt filter=lfs diff=lfs merge=lfs -text
|
fluent-speech-commands/README.md
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# SLU recipes for Fluent Speech Commands
|
2 |
+
This folder contains recipes for spoken language understanding (SLU) with [Fluent Speech Commands](https://fluent.ai/research/fluent-speech-commands/).
|
3 |
+
|
4 |
+
### Tokenizer recipe
|
5 |
+
(You don't need to run this because the other recipes download a tokenizer, but you can run this if you want to train a new tokenizer for Fluent Speech Commands.)
|
6 |
+
|
7 |
+
Run this to train the tokenizer:
|
8 |
+
|
9 |
+
```
|
10 |
+
cd Tokenizer
|
11 |
+
python train.py hparams/tokenizer_bpe51.yaml
|
12 |
+
```
|
13 |
+
|
14 |
+
### Direct recipe
|
15 |
+
The "direct" recipe maps the input speech directly to semantics using a seq2seq model.
|
16 |
+
The encoder is pre-trained using the LibriSpeech seq2seq recipe.
|
17 |
+
|
18 |
+
```
|
19 |
+
cd direct
|
20 |
+
python train.py hparams/train.yaml
|
21 |
+
```
|
22 |
+
|
23 |
+
# Results
|
24 |
+
|
25 |
+
| Release | hyperparams file | Test Acc | Model link | GPUs |
|
26 |
+
|:-------------:|:---------------------------:| -----:| -----:| --------:|
|
27 |
+
| 21-06-03 | train.yaml | 99.60% | https://drive.google.com/drive/folders/13t2PYdedrPQoNYo_QSf6s04WXu2_vAb-?usp=sharing | 1xV100 32GB |
|
28 |
+
|
29 |
+
|
30 |
+
# PreTrained Model + Easy-Inference
|
31 |
+
You can find the pre-trained model with an easy-inference function on [HuggingFace](https://huggingface.co/speechbrain/slu-direct-fluent-speech-commands-librispeech-asr).
|
32 |
+
|
33 |
+
|
34 |
+
# Training Time
|
35 |
+
About 15 minutes for each epoch with a TESLA V100.
|
36 |
+
|
37 |
+
|
38 |
+
# **About SpeechBrain**
|
39 |
+
- Website: https://speechbrain.github.io/
|
40 |
+
- Code: https://github.com/speechbrain/speechbrain/
|
41 |
+
- HuggingFace: https://huggingface.co/speechbrain/
|
42 |
+
|
43 |
+
|
44 |
+
# **Citing SpeechBrain**
|
45 |
+
Please cite SpeechBrain if you use it for your research or business.
|
46 |
+
|
47 |
+
```bibtex
|
48 |
+
@misc{speechbrain,
|
49 |
+
title={{SpeechBrain}: A General-Purpose Speech Toolkit},
|
50 |
+
author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio},
|
51 |
+
year={2021},
|
52 |
+
eprint={2106.04624},
|
53 |
+
archivePrefix={arXiv},
|
54 |
+
primaryClass={eess.AS},
|
55 |
+
note={arXiv:2106.04624}
|
56 |
+
}
|
57 |
+
```
|
58 |
+
|
fluent-speech-commands/Tokenizer/hparams/tokenizer_bpe51.yaml
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ############################################################################
|
2 |
+
# Tokenizer: SentencePiece unigram subword model with 51 tokens
|
3 |
+
# Training: Fluent Speech Commands
|
4 |
+
# Authors: Abdel Heba 2021
|
5 |
+
# ############################################################################
|
6 |
+
|
7 |
+
output_folder: !ref results/tokenizer_bpe51/
|
8 |
+
train_log: !ref <output_folder>/train_log.txt
|
9 |
+
|
10 |
+
# Data files
|
11 |
+
data_folder: !PLACEHOLDER # e.g., /localscratch/fluent_speech_commands_dataset
|
12 |
+
train_csv: !ref <output_folder>/train.csv
|
13 |
+
valid_csv: !ref <output_folder>/valid.csv
|
14 |
+
skip_prep: False
|
15 |
+
|
16 |
+
# Training parameters
|
17 |
+
token_type: unigram # ["unigram", "bpe", "char"]
|
18 |
+
token_output: 51 # index(blank/eos/bos/unk) = 0
|
19 |
+
character_coverage: 1.0
|
20 |
+
num_sequences: 10000
|
21 |
+
csv_read: semantics
|
22 |
+
|
23 |
+
|
24 |
+
tokenizer: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
|
25 |
+
model_dir: !ref <output_folder>
|
26 |
+
vocab_size: !ref <token_output>
|
27 |
+
annotation_train: !ref <train_csv>
|
28 |
+
annotation_read: !ref <csv_read>
|
29 |
+
model_type: !ref <token_type> # ["unigram", "bpe", "char"]
|
30 |
+
character_coverage: !ref <character_coverage>
|
31 |
+
num_sequences: !ref <num_sequences>
|
32 |
+
annotation_list_to_check: [!ref <train_csv>, !ref <valid_csv>]
|
fluent-speech-commands/Tokenizer/prepare.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
../prepare.py
|
fluent-speech-commands/Tokenizer/train.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""Recipe for training a BPE tokenizer for Fluent Speech Commands.
|
3 |
+
The tokenizer converts semantics into sub-word units that can
|
4 |
+
be used to train a language (LM) or an acoustic model (AM).
|
5 |
+
|
6 |
+
To run this recipe, do the following:
|
7 |
+
> python train.py hparams/tokenizer_bpe51.yaml
|
8 |
+
|
9 |
+
|
10 |
+
Authors
|
11 |
+
* Abdel Heba 2021
|
12 |
+
* Mirco Ravanelli 2021
|
13 |
+
* Loren Lugosch 2021
|
14 |
+
"""
|
15 |
+
|
16 |
+
import sys
|
17 |
+
import speechbrain as sb
|
18 |
+
from hyperpyyaml import load_hyperpyyaml
|
19 |
+
from speechbrain.utils.distributed import run_on_main
|
20 |
+
|
21 |
+
if __name__ == "__main__":
|
22 |
+
|
23 |
+
# CLI:
|
24 |
+
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
|
25 |
+
with open(hparams_file) as fin:
|
26 |
+
hparams = load_hyperpyyaml(fin, overrides)
|
27 |
+
|
28 |
+
# If distributed_launch=True then
|
29 |
+
# create ddp_group with the right communication protocol
|
30 |
+
sb.utils.distributed.ddp_init_group(run_opts)
|
31 |
+
|
32 |
+
# Create experiment directory
|
33 |
+
sb.create_experiment_directory(
|
34 |
+
experiment_directory=hparams["output_folder"],
|
35 |
+
hyperparams_to_save=hparams_file,
|
36 |
+
overrides=overrides,
|
37 |
+
)
|
38 |
+
|
39 |
+
# 1. Dataset prep
|
40 |
+
from prepare import prepare_FSC # noqa
|
41 |
+
|
42 |
+
# multi-gpu (ddp) safe data preparation
|
43 |
+
run_on_main(
|
44 |
+
prepare_FSC,
|
45 |
+
kwargs={
|
46 |
+
"data_folder": hparams["data_folder"],
|
47 |
+
"save_folder": hparams["output_folder"],
|
48 |
+
"skip_prep": hparams["skip_prep"],
|
49 |
+
},
|
50 |
+
)
|
51 |
+
|
52 |
+
# Train tokenizer
|
53 |
+
hparams["tokenizer"]()
|
fluent-speech-commands/direct/__pycache__/prepare.cpython-37.pyc
ADDED
Binary file (2.05 kB). View file
|
|
fluent-speech-commands/direct/hparams/train.yaml
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ############################################################################
|
2 |
+
# Model: Direct SLU
|
3 |
+
# Encoder: Pre-trained ASR encoder -> LSTM
|
4 |
+
# Decoder: GRU + beamsearch
|
5 |
+
# Tokens: BPE with unigram
|
6 |
+
# losses: NLL
|
7 |
+
# Training: Fluent Speech Commands
|
8 |
+
# Authors: Loren Lugosch, Mirco Ravanelli 2020
|
9 |
+
# ############################################################################
|
10 |
+
|
11 |
+
# Seed needs to be set at top of yaml, before objects with parameters are made
|
12 |
+
seed: 112011
|
13 |
+
__set_seed: !apply:torch.manual_seed [!ref <seed>]
|
14 |
+
output_folder: !ref results/BPE51/<seed>
|
15 |
+
save_folder: !ref <output_folder>/save
|
16 |
+
train_log: !ref <output_folder>/train_log.txt
|
17 |
+
|
18 |
+
# Data files
|
19 |
+
data_folder: !PLACEHOLDER # e.g., /localscratch/fluent_speech_commands_dataset
|
20 |
+
rir_folder: !ref <data_folder> # Change it if needed
|
21 |
+
csv_train: !ref <output_folder>/train.csv
|
22 |
+
csv_valid: !ref <output_folder>/valid.csv
|
23 |
+
csv_test: !ref <output_folder>/test.csv
|
24 |
+
tokenizer_file: https://www.dropbox.com/s/hvf2huofnq0sjbn/51_unigram.model?dl=1
|
25 |
+
skip_prep: False
|
26 |
+
# Training parameters
|
27 |
+
number_of_epochs: 4
|
28 |
+
batch_size: 8
|
29 |
+
lr: 0.0003
|
30 |
+
token_type: unigram # ["unigram", "bpe", "char"]
|
31 |
+
sorting: random
|
32 |
+
|
33 |
+
# Model parameters
|
34 |
+
sample_rate: 16000
|
35 |
+
emb_size: 128
|
36 |
+
dec_neurons: 512
|
37 |
+
output_neurons: 51 # index(eos/bos) = 0
|
38 |
+
ASR_encoder_dim: 512
|
39 |
+
encoder_dim: 256
|
40 |
+
|
41 |
+
# Decoding parameters
|
42 |
+
bos_index: 0
|
43 |
+
eos_index: 0
|
44 |
+
min_decode_ratio: 0.0
|
45 |
+
max_decode_ratio: 10.0
|
46 |
+
slu_beam_size: 80
|
47 |
+
eos_threshold: 1.5
|
48 |
+
temperature: 1.25
|
49 |
+
|
50 |
+
dataloader_opts:
|
51 |
+
batch_size: !ref <batch_size>
|
52 |
+
shuffle: True
|
53 |
+
|
54 |
+
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
|
55 |
+
limit: !ref <number_of_epochs>
|
56 |
+
|
57 |
+
# Models
|
58 |
+
asr_model: !apply:speechbrain.pretrained.EncoderDecoderASR.from_hparams
|
59 |
+
source: speechbrain/asr-crdnn-rnnlm-librispeech
|
60 |
+
run_opts: {"device":"cuda:0"}
|
61 |
+
|
62 |
+
slu_enc: !new:speechbrain.nnet.containers.Sequential
|
63 |
+
input_shape: [null, null, !ref <ASR_encoder_dim>]
|
64 |
+
lstm: !new:speechbrain.nnet.RNN.LSTM
|
65 |
+
input_size: !ref <ASR_encoder_dim>
|
66 |
+
bidirectional: True
|
67 |
+
hidden_size: !ref <encoder_dim>
|
68 |
+
num_layers: 2
|
69 |
+
linear: !new:speechbrain.nnet.linear.Linear
|
70 |
+
input_size: !ref <encoder_dim> * 2
|
71 |
+
n_neurons: !ref <encoder_dim>
|
72 |
+
|
73 |
+
output_emb: !new:speechbrain.nnet.embedding.Embedding
|
74 |
+
num_embeddings: !ref <output_neurons>
|
75 |
+
embedding_dim: !ref <emb_size>
|
76 |
+
|
77 |
+
dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
|
78 |
+
enc_dim: !ref <encoder_dim>
|
79 |
+
input_size: !ref <emb_size>
|
80 |
+
rnn_type: gru
|
81 |
+
attn_type: keyvalue
|
82 |
+
hidden_size: !ref <dec_neurons>
|
83 |
+
attn_dim: 512
|
84 |
+
num_layers: 3
|
85 |
+
scaling: 1.0
|
86 |
+
dropout: 0.0
|
87 |
+
|
88 |
+
seq_lin: !new:speechbrain.nnet.linear.Linear
|
89 |
+
input_size: !ref <dec_neurons>
|
90 |
+
n_neurons: !ref <output_neurons>
|
91 |
+
|
92 |
+
augment_wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
|
93 |
+
sample_rate: !ref <sample_rate>
|
94 |
+
speeds: [100]
|
95 |
+
|
96 |
+
augment_speed: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
|
97 |
+
sample_rate: !ref <sample_rate>
|
98 |
+
speeds: [95, 100, 105]
|
99 |
+
|
100 |
+
add_rev: !new:speechbrain.lobes.augment.EnvCorrupt
|
101 |
+
openrir_folder: !ref <rir_folder>
|
102 |
+
openrir_max_noise_len: 3.0 # seconds
|
103 |
+
reverb_prob: 1.0
|
104 |
+
noise_prob: 0.0
|
105 |
+
noise_snr_low: 0
|
106 |
+
noise_snr_high: 15
|
107 |
+
rir_scale_factor: 1.0
|
108 |
+
|
109 |
+
add_noise: !new:speechbrain.lobes.augment.EnvCorrupt
|
110 |
+
openrir_folder: !ref <rir_folder>
|
111 |
+
openrir_max_noise_len: 3.0 # seconds
|
112 |
+
reverb_prob: 0.0
|
113 |
+
noise_prob: 1.0
|
114 |
+
noise_snr_low: 0
|
115 |
+
noise_snr_high: 15
|
116 |
+
rir_scale_factor: 1.0
|
117 |
+
|
118 |
+
add_rev_noise: !new:speechbrain.lobes.augment.EnvCorrupt
|
119 |
+
openrir_folder: !ref <rir_folder>
|
120 |
+
openrir_max_noise_len: 3.0 # seconds
|
121 |
+
reverb_prob: 1.0
|
122 |
+
noise_prob: 1.0
|
123 |
+
noise_snr_low: 0
|
124 |
+
noise_snr_high: 15
|
125 |
+
rir_scale_factor: 1.0
|
126 |
+
|
127 |
+
|
128 |
+
augment_pipeline: [
|
129 |
+
!ref <augment_wavedrop>,
|
130 |
+
!ref <augment_speed>,
|
131 |
+
!ref <add_rev>,
|
132 |
+
!ref <add_noise>,
|
133 |
+
!ref <add_rev_noise>
|
134 |
+
]
|
135 |
+
|
136 |
+
|
137 |
+
modules:
|
138 |
+
augment_wavedrop: !ref <augment_wavedrop>
|
139 |
+
augment_speed: !ref <augment_speed>
|
140 |
+
add_rev: !ref <add_rev>
|
141 |
+
add_noise: !ref <add_noise>
|
142 |
+
add_rev_noise: !ref <add_rev_noise>
|
143 |
+
slu_enc: !ref <slu_enc>
|
144 |
+
output_emb: !ref <output_emb>
|
145 |
+
dec: !ref <dec>
|
146 |
+
seq_lin: !ref <seq_lin>
|
147 |
+
|
148 |
+
model: !new:torch.nn.ModuleList
|
149 |
+
- [!ref <slu_enc>, !ref <output_emb>,
|
150 |
+
!ref <dec>, !ref <seq_lin>]
|
151 |
+
|
152 |
+
tokenizer: !new:sentencepiece.SentencePieceProcessor
|
153 |
+
|
154 |
+
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
155 |
+
collect_in: !ref <save_folder>/FSC_tokenizer
|
156 |
+
loadables:
|
157 |
+
tokenizer: !ref <tokenizer>
|
158 |
+
paths:
|
159 |
+
tokenizer: !ref <tokenizer_file>
|
160 |
+
|
161 |
+
beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
|
162 |
+
embedding: !ref <output_emb>
|
163 |
+
decoder: !ref <dec>
|
164 |
+
linear: !ref <seq_lin>
|
165 |
+
bos_index: !ref <bos_index>
|
166 |
+
eos_index: !ref <eos_index>
|
167 |
+
min_decode_ratio: !ref <min_decode_ratio>
|
168 |
+
max_decode_ratio: !ref <max_decode_ratio>
|
169 |
+
beam_size: !ref <slu_beam_size>
|
170 |
+
eos_threshold: !ref <eos_threshold>
|
171 |
+
temperature: !ref <temperature>
|
172 |
+
using_max_attn_shift: False
|
173 |
+
max_attn_shift: 30
|
174 |
+
coverage_penalty: 0.
|
175 |
+
|
176 |
+
opt_class: !name:torch.optim.Adam
|
177 |
+
lr: !ref <lr>
|
178 |
+
|
179 |
+
lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
|
180 |
+
initial_value: !ref <lr>
|
181 |
+
improvement_threshold: 0.0025
|
182 |
+
annealing_factor: 0.8
|
183 |
+
patient: 0
|
184 |
+
|
185 |
+
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
|
186 |
+
checkpoints_dir: !ref <save_folder>
|
187 |
+
recoverables:
|
188 |
+
model: !ref <model>
|
189 |
+
scheduler: !ref <lr_annealing>
|
190 |
+
counter: !ref <epoch_counter>
|
191 |
+
|
192 |
+
log_softmax: !new:speechbrain.nnet.activations.Softmax
|
193 |
+
apply_log: True
|
194 |
+
|
195 |
+
seq_cost: !name:speechbrain.nnet.losses.nll_loss
|
196 |
+
label_smoothing: 0.1
|
197 |
+
|
198 |
+
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
|
199 |
+
save_file: !ref <train_log>
|
200 |
+
|
201 |
+
error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
|
202 |
+
|
203 |
+
cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
|
204 |
+
split_tokens: True
|
fluent-speech-commands/direct/prepare.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
../prepare.py
|
fluent-speech-commands/direct/pretrained_models/EncoderDecoderASR-6406358104753086746/asr.ckpt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/root/.cache/huggingface/hub/83e944252a91fe1d0883daa1e87077df4d64c35fffb45e22fff924faace4a59c.7fdf4aabd8400c69a6228ccc17c83b7a8ebf34c5d76f23497b7cf0d7a1baaea3
|
fluent-speech-commands/direct/pretrained_models/EncoderDecoderASR-6406358104753086746/hyperparams.yaml
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/root/.cache/huggingface/hub/7aac72d39109ee19b4004d94239c2924caf33de6d85b0aff9296d844982210cb.d14310ea63844fb38520a592ea3a92e4f131b5f4683f8fa08e27b1e403c92293
|
fluent-speech-commands/direct/pretrained_models/EncoderDecoderASR-6406358104753086746/lm.ckpt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/root/.cache/huggingface/hub/651df066b5d0b2efef7208f51df93d3a0a65bedc3a3a2500cd7b8faf064e631e.b438b9af3f549a23c4458bb066c11cd51dc1cfe9bfef30d3eb66b472e93b1e8c
|
fluent-speech-commands/direct/pretrained_models/EncoderDecoderASR-6406358104753086746/normalizer.ckpt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/root/.cache/huggingface/hub/e733854cce680bcb58ce4b86bacb3cab5222880933b7b85ab17758aa5b10e9da.587fb748e80e719ed5721d5e0098c5feb2a901017135271ce2b2c6baea7e9f6e
|
fluent-speech-commands/direct/pretrained_models/EncoderDecoderASR-6406358104753086746/tokenizer.ckpt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/root/.cache/huggingface/hub/f39208eba495042a59a8404b5703ca08a39a85e4d2bf707e197b90a3323f92ab.cd7af7ea8cfcfbf0f6dd61514c361972eb82b3b76f12b0e9ee0b371f36fdc078
|
fluent-speech-commands/direct/results/BPE51/112011/env.log
ADDED
@@ -0,0 +1,434 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
SpeechBrain system description
|
2 |
+
==============================
|
3 |
+
Python version:
|
4 |
+
3.7.12 (default, Jan 15 2022, 18:48:18)
|
5 |
+
[GCC 7.5.0]
|
6 |
+
==============================
|
7 |
+
Installed Python packages:
|
8 |
+
absl-py==1.0.0
|
9 |
+
aiohttp==3.8.1
|
10 |
+
aiosignal==1.2.0
|
11 |
+
alabaster==0.7.12
|
12 |
+
albumentations==0.1.12
|
13 |
+
altair==4.2.0
|
14 |
+
appdirs==1.4.4
|
15 |
+
argon2-cffi==21.3.0
|
16 |
+
argon2-cffi-bindings==21.2.0
|
17 |
+
arviz==0.11.4
|
18 |
+
astor==0.8.1
|
19 |
+
astropy==4.3.1
|
20 |
+
astunparse==1.6.3
|
21 |
+
async-timeout==4.0.2
|
22 |
+
asynctest==0.13.0
|
23 |
+
atari-py==0.2.9
|
24 |
+
atomicwrites==1.4.0
|
25 |
+
attrs==21.4.0
|
26 |
+
audioread==2.1.9
|
27 |
+
autograd==1.3
|
28 |
+
Babel==2.9.1
|
29 |
+
backcall==0.2.0
|
30 |
+
beautifulsoup4==4.6.3
|
31 |
+
black==19.10b0
|
32 |
+
bleach==4.1.0
|
33 |
+
blis==0.4.1
|
34 |
+
bokeh==2.3.3
|
35 |
+
Bottleneck==1.3.2
|
36 |
+
branca==0.4.2
|
37 |
+
bs4==0.0.1
|
38 |
+
CacheControl==0.12.10
|
39 |
+
cached-property==1.5.2
|
40 |
+
cachetools==4.2.4
|
41 |
+
catalogue==1.0.0
|
42 |
+
certifi==2021.10.8
|
43 |
+
cffi==1.15.0
|
44 |
+
cfgv==3.3.1
|
45 |
+
cftime==1.5.2
|
46 |
+
chardet==3.0.4
|
47 |
+
charset-normalizer==2.0.11
|
48 |
+
click==7.1.2
|
49 |
+
cloudpickle==1.3.0
|
50 |
+
cmake==3.12.0
|
51 |
+
cmdstanpy==0.9.5
|
52 |
+
colorcet==3.0.0
|
53 |
+
colorlover==0.3.0
|
54 |
+
community==1.0.0b1
|
55 |
+
contextlib2==0.5.5
|
56 |
+
convertdate==2.4.0
|
57 |
+
coverage==3.7.1
|
58 |
+
coveralls==0.5
|
59 |
+
crcmod==1.7
|
60 |
+
cufflinks==0.17.3
|
61 |
+
cupy-cuda111==9.4.0
|
62 |
+
cvxopt==1.2.7
|
63 |
+
cvxpy==1.0.31
|
64 |
+
cycler==0.11.0
|
65 |
+
cymem==2.0.6
|
66 |
+
Cython==0.29.27
|
67 |
+
daft==0.0.4
|
68 |
+
dask==2.12.0
|
69 |
+
datascience==0.10.6
|
70 |
+
datasets==1.18.3
|
71 |
+
debugpy==1.0.0
|
72 |
+
decorator==4.4.2
|
73 |
+
defusedxml==0.7.1
|
74 |
+
descartes==1.1.0
|
75 |
+
dill==0.3.4
|
76 |
+
distlib==0.3.4
|
77 |
+
distributed==1.25.3
|
78 |
+
dlib @ file:///dlib-19.18.0-cp37-cp37m-linux_x86_64.whl
|
79 |
+
dm-tree==0.1.6
|
80 |
+
docopt==0.6.2
|
81 |
+
docutils==0.17.1
|
82 |
+
dopamine-rl==1.0.5
|
83 |
+
earthengine-api==0.1.297
|
84 |
+
easydict==1.9
|
85 |
+
ecos==2.0.10
|
86 |
+
editdistance==0.5.3
|
87 |
+
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz
|
88 |
+
entrypoints==0.3
|
89 |
+
ephem==4.1.3
|
90 |
+
et-xmlfile==1.1.0
|
91 |
+
fa2==0.3.5
|
92 |
+
fastai==1.0.61
|
93 |
+
fastdtw==0.3.4
|
94 |
+
fastprogress==1.0.0
|
95 |
+
fastrlock==0.8
|
96 |
+
fbprophet==0.7.1
|
97 |
+
feather-format==0.4.1
|
98 |
+
filelock==3.4.2
|
99 |
+
firebase-admin==4.4.0
|
100 |
+
fix-yahoo-finance==0.0.22
|
101 |
+
flake8==3.7.9
|
102 |
+
Flask==1.1.4
|
103 |
+
flatbuffers==2.0
|
104 |
+
folium==0.8.3
|
105 |
+
frozenlist==1.3.0
|
106 |
+
fsspec==2022.1.0
|
107 |
+
future==0.16.0
|
108 |
+
gast==0.4.0
|
109 |
+
GDAL==2.2.2
|
110 |
+
gdown==4.2.1
|
111 |
+
gensim==3.6.0
|
112 |
+
geographiclib==1.52
|
113 |
+
geopy==1.17.0
|
114 |
+
gin-config==0.5.0
|
115 |
+
glob2==0.7
|
116 |
+
google==2.0.3
|
117 |
+
google-api-core==1.26.3
|
118 |
+
google-api-python-client==1.12.10
|
119 |
+
google-auth==1.35.0
|
120 |
+
google-auth-httplib2==0.0.4
|
121 |
+
google-auth-oauthlib==0.4.6
|
122 |
+
google-cloud-bigquery==1.21.0
|
123 |
+
google-cloud-bigquery-storage==1.1.0
|
124 |
+
google-cloud-core==1.0.3
|
125 |
+
google-cloud-datastore==1.8.0
|
126 |
+
google-cloud-firestore==1.7.0
|
127 |
+
google-cloud-language==1.2.0
|
128 |
+
google-cloud-storage==1.18.1
|
129 |
+
google-cloud-translate==1.5.0
|
130 |
+
google-colab @ file:///colabtools/dist/google-colab-1.0.0.tar.gz
|
131 |
+
google-pasta==0.2.0
|
132 |
+
google-resumable-media==0.4.1
|
133 |
+
googleapis-common-protos==1.54.0
|
134 |
+
googledrivedownloader==0.4
|
135 |
+
graphviz==0.10.1
|
136 |
+
greenlet==1.1.2
|
137 |
+
grpcio==1.43.0
|
138 |
+
gspread==3.4.2
|
139 |
+
gspread-dataframe==3.0.8
|
140 |
+
gym==0.17.3
|
141 |
+
h5py==3.1.0
|
142 |
+
HeapDict==1.0.1
|
143 |
+
hijri-converter==2.2.2
|
144 |
+
holidays==0.10.5.2
|
145 |
+
holoviews==1.14.7
|
146 |
+
html5lib==1.0.1
|
147 |
+
httpimport==0.5.18
|
148 |
+
httplib2==0.17.4
|
149 |
+
httplib2shim==0.0.3
|
150 |
+
huggingface-hub==0.4.0
|
151 |
+
humanize==0.5.1
|
152 |
+
hyperopt==0.1.2
|
153 |
+
HyperPyYAML==1.0.0
|
154 |
+
ideep4py==2.0.0.post3
|
155 |
+
identify==2.4.10
|
156 |
+
idna==2.10
|
157 |
+
imageio==2.4.1
|
158 |
+
imagesize==1.3.0
|
159 |
+
imbalanced-learn==0.8.1
|
160 |
+
imblearn==0.0
|
161 |
+
imgaug==0.2.9
|
162 |
+
importlib-metadata==4.10.1
|
163 |
+
importlib-resources==5.4.0
|
164 |
+
imutils==0.5.4
|
165 |
+
inflect==2.1.0
|
166 |
+
iniconfig==1.1.1
|
167 |
+
intel-openmp==2022.0.2
|
168 |
+
intervaltree==2.1.0
|
169 |
+
ipykernel==4.10.1
|
170 |
+
ipython==5.5.0
|
171 |
+
ipython-genutils==0.2.0
|
172 |
+
ipython-sql==0.3.9
|
173 |
+
ipywidgets==7.6.5
|
174 |
+
itsdangerous==1.1.0
|
175 |
+
jax==0.2.25
|
176 |
+
jaxlib @ https://storage.googleapis.com/jax-releases/cuda111/jaxlib-0.1.71+cuda111-cp37-none-manylinux2010_x86_64.whl
|
177 |
+
jedi==0.18.1
|
178 |
+
jieba==0.42.1
|
179 |
+
Jinja2==2.11.3
|
180 |
+
joblib==1.1.0
|
181 |
+
jpeg4py==0.1.4
|
182 |
+
jsonschema==4.3.3
|
183 |
+
jupyter==1.0.0
|
184 |
+
jupyter-client==5.3.5
|
185 |
+
jupyter-console==5.2.0
|
186 |
+
jupyter-core==4.9.1
|
187 |
+
jupyterlab-pygments==0.1.2
|
188 |
+
jupyterlab-widgets==1.0.2
|
189 |
+
kaggle==1.5.12
|
190 |
+
kapre==0.3.7
|
191 |
+
keras==2.7.0
|
192 |
+
Keras-Preprocessing==1.1.2
|
193 |
+
keras-vis==0.4.1
|
194 |
+
kiwisolver==1.3.2
|
195 |
+
korean-lunar-calendar==0.2.1
|
196 |
+
libclang==13.0.0
|
197 |
+
librosa==0.9.0
|
198 |
+
lightgbm==2.2.3
|
199 |
+
llvmlite==0.34.0
|
200 |
+
lmdb==0.99
|
201 |
+
LunarCalendar==0.0.9
|
202 |
+
lxml==4.2.6
|
203 |
+
Markdown==3.3.6
|
204 |
+
MarkupSafe==2.0.1
|
205 |
+
matplotlib==3.2.2
|
206 |
+
matplotlib-inline==0.1.3
|
207 |
+
matplotlib-venn==0.11.6
|
208 |
+
mccabe==0.6.1
|
209 |
+
missingno==0.5.0
|
210 |
+
mistune==0.8.4
|
211 |
+
mizani==0.6.0
|
212 |
+
mkl==2019.0
|
213 |
+
mlxtend==0.14.0
|
214 |
+
more-itertools==8.12.0
|
215 |
+
moviepy==0.2.3.5
|
216 |
+
mpmath==1.2.1
|
217 |
+
msgpack==1.0.3
|
218 |
+
multidict==6.0.2
|
219 |
+
multiprocess==0.70.12.2
|
220 |
+
multitasking==0.0.10
|
221 |
+
murmurhash==1.0.6
|
222 |
+
music21==5.5.0
|
223 |
+
natsort==5.5.0
|
224 |
+
nbclient==0.5.10
|
225 |
+
nbconvert==5.6.1
|
226 |
+
nbformat==5.1.3
|
227 |
+
nest-asyncio==1.5.4
|
228 |
+
netCDF4==1.5.8
|
229 |
+
networkx==2.6.3
|
230 |
+
nibabel==3.0.2
|
231 |
+
nltk==3.2.5
|
232 |
+
nodeenv==1.6.0
|
233 |
+
notebook==5.3.1
|
234 |
+
numba==0.51.2
|
235 |
+
numexpr==2.8.1
|
236 |
+
numpy==1.19.5
|
237 |
+
nvidia-ml-py3==7.352.0
|
238 |
+
oauth2client==4.1.3
|
239 |
+
oauthlib==3.2.0
|
240 |
+
okgrade==0.4.3
|
241 |
+
opencv-contrib-python==4.1.2.30
|
242 |
+
opencv-python==4.1.2.30
|
243 |
+
openpyxl==3.0.9
|
244 |
+
opt-einsum==3.3.0
|
245 |
+
osqp==0.6.2.post0
|
246 |
+
packaging==21.3
|
247 |
+
palettable==3.3.0
|
248 |
+
pandas==1.3.5
|
249 |
+
pandas-datareader==0.9.0
|
250 |
+
pandas-gbq==0.13.3
|
251 |
+
pandas-profiling==1.4.1
|
252 |
+
pandocfilters==1.5.0
|
253 |
+
panel==0.12.1
|
254 |
+
param==1.12.0
|
255 |
+
parso==0.8.3
|
256 |
+
pathlib==1.0.1
|
257 |
+
pathspec==0.9.0
|
258 |
+
patsy==0.5.2
|
259 |
+
pep517==0.12.0
|
260 |
+
pexpect==4.8.0
|
261 |
+
pickleshare==0.7.5
|
262 |
+
Pillow==7.1.2
|
263 |
+
pip-tools==6.2.0
|
264 |
+
plac==1.1.3
|
265 |
+
platformdirs==2.5.0
|
266 |
+
plotly==5.5.0
|
267 |
+
plotnine==0.6.0
|
268 |
+
pluggy==0.13.1
|
269 |
+
pooch==1.6.0
|
270 |
+
portpicker==1.3.9
|
271 |
+
pre-commit==2.17.0
|
272 |
+
prefetch-generator==1.0.1
|
273 |
+
preshed==3.0.6
|
274 |
+
prettytable==3.0.0
|
275 |
+
progressbar2==3.38.0
|
276 |
+
prometheus-client==0.13.1
|
277 |
+
promise==2.3
|
278 |
+
prompt-toolkit==1.0.18
|
279 |
+
protobuf==3.17.3
|
280 |
+
psutil==5.4.8
|
281 |
+
psycopg2==2.7.6.1
|
282 |
+
ptyprocess==0.7.0
|
283 |
+
py==1.11.0
|
284 |
+
pyarrow==6.0.1
|
285 |
+
pyasn1==0.4.8
|
286 |
+
pyasn1-modules==0.2.8
|
287 |
+
pycocotools==2.0.4
|
288 |
+
pycodestyle==2.5.0
|
289 |
+
pycparser==2.21
|
290 |
+
pyct==0.4.8
|
291 |
+
pydata-google-auth==1.3.0
|
292 |
+
pydot==1.3.0
|
293 |
+
pydot-ng==2.0.0
|
294 |
+
pydotplus==2.0.2
|
295 |
+
PyDrive==1.3.1
|
296 |
+
pyemd==0.5.1
|
297 |
+
pyerfa==2.0.0.1
|
298 |
+
pyflakes==2.1.1
|
299 |
+
pyglet==1.5.0
|
300 |
+
Pygments==2.6.1
|
301 |
+
pygobject==3.26.1
|
302 |
+
pymc3==3.11.4
|
303 |
+
PyMeeus==0.5.11
|
304 |
+
pymongo==4.0.1
|
305 |
+
pymystem3==0.2.0
|
306 |
+
PyOpenGL==3.1.5
|
307 |
+
pyparsing==3.0.7
|
308 |
+
pyrsistent==0.18.1
|
309 |
+
pysndfile==1.3.8
|
310 |
+
PySocks==1.7.1
|
311 |
+
pystan==2.19.1.1
|
312 |
+
pytest==5.4.1
|
313 |
+
python-apt==0.0.0
|
314 |
+
python-chess==0.23.11
|
315 |
+
python-dateutil==2.8.2
|
316 |
+
python-louvain==0.16
|
317 |
+
python-slugify==5.0.2
|
318 |
+
python-utils==3.1.0
|
319 |
+
pytz==2018.9
|
320 |
+
pyviz-comms==2.1.0
|
321 |
+
PyWavelets==1.2.0
|
322 |
+
PyYAML==6.0
|
323 |
+
pyzmq==22.3.0
|
324 |
+
qdldl==0.1.5.post0
|
325 |
+
qtconsole==5.2.2
|
326 |
+
QtPy==2.0.1
|
327 |
+
regex==2019.12.20
|
328 |
+
requests==2.23.0
|
329 |
+
requests-oauthlib==1.3.1
|
330 |
+
resampy==0.2.2
|
331 |
+
rpy2==3.4.5
|
332 |
+
rsa==4.8
|
333 |
+
ruamel.yaml==0.17.21
|
334 |
+
ruamel.yaml.clib==0.2.6
|
335 |
+
sacremoses==0.0.47
|
336 |
+
scikit-image==0.18.3
|
337 |
+
scikit-learn==1.0.2
|
338 |
+
scipy==1.4.1
|
339 |
+
screen-resolution-extra==0.0.0
|
340 |
+
scs==3.1.0
|
341 |
+
seaborn==0.11.2
|
342 |
+
semver==2.13.0
|
343 |
+
Send2Trash==1.8.0
|
344 |
+
sentencepiece==0.1.96
|
345 |
+
setuptools-git==1.2
|
346 |
+
Shapely==1.8.0
|
347 |
+
simplegeneric==0.8.1
|
348 |
+
six==1.15.0
|
349 |
+
sklearn==0.0
|
350 |
+
sklearn-pandas==1.8.0
|
351 |
+
smart-open==5.2.1
|
352 |
+
snowballstemmer==2.2.0
|
353 |
+
sortedcontainers==2.4.0
|
354 |
+
SoundFile==0.10.3.post1
|
355 |
+
spacy==2.2.4
|
356 |
+
speechbrain==0.5.11
|
357 |
+
Sphinx==1.8.6
|
358 |
+
sphinxcontrib-serializinghtml==1.1.5
|
359 |
+
sphinxcontrib-websupport==1.2.4
|
360 |
+
SQLAlchemy==1.4.31
|
361 |
+
sqlparse==0.4.2
|
362 |
+
srsly==1.0.5
|
363 |
+
statsmodels==0.10.2
|
364 |
+
sympy==1.7.1
|
365 |
+
tables==3.7.0
|
366 |
+
tabulate==0.8.9
|
367 |
+
tblib==1.7.0
|
368 |
+
tenacity==8.0.1
|
369 |
+
tensorboard==2.7.0
|
370 |
+
tensorboard-data-server==0.6.1
|
371 |
+
tensorboard-plugin-wit==1.8.1
|
372 |
+
tensorflow @ file:///tensorflow-2.7.0-cp37-cp37m-linux_x86_64.whl
|
373 |
+
tensorflow-datasets==4.0.1
|
374 |
+
tensorflow-estimator==2.7.0
|
375 |
+
tensorflow-gcs-config==2.7.0
|
376 |
+
tensorflow-hub==0.12.0
|
377 |
+
tensorflow-io-gcs-filesystem==0.24.0
|
378 |
+
tensorflow-metadata==1.6.0
|
379 |
+
tensorflow-probability==0.15.0
|
380 |
+
termcolor==1.1.0
|
381 |
+
terminado==0.13.1
|
382 |
+
testpath==0.5.0
|
383 |
+
text-unidecode==1.3
|
384 |
+
textblob==0.15.3
|
385 |
+
Theano-PyMC==1.1.2
|
386 |
+
thinc==7.4.0
|
387 |
+
threadpoolctl==3.1.0
|
388 |
+
tifffile==2021.11.2
|
389 |
+
tokenizers==0.11.4
|
390 |
+
toml==0.10.2
|
391 |
+
tomli==2.0.0
|
392 |
+
toolz==0.11.2
|
393 |
+
torch @ https://download.pytorch.org/whl/cu111/torch-1.10.0%2Bcu111-cp37-cp37m-linux_x86_64.whl
|
394 |
+
torchaudio @ https://download.pytorch.org/whl/cu111/torchaudio-0.10.0%2Bcu111-cp37-cp37m-linux_x86_64.whl
|
395 |
+
torchsummary==1.5.1
|
396 |
+
torchtext==0.11.0
|
397 |
+
torchvision @ https://download.pytorch.org/whl/cu111/torchvision-0.11.1%2Bcu111-cp37-cp37m-linux_x86_64.whl
|
398 |
+
tornado==5.1.1
|
399 |
+
tqdm==4.62.3
|
400 |
+
traitlets==5.1.1
|
401 |
+
transformers==4.16.2
|
402 |
+
tweepy==3.10.0
|
403 |
+
typed-ast==1.5.2
|
404 |
+
typeguard==2.7.1
|
405 |
+
typing-extensions==3.10.0.2
|
406 |
+
tzlocal==1.5.1
|
407 |
+
uritemplate==3.0.1
|
408 |
+
urllib3==1.24.3
|
409 |
+
vega-datasets==0.9.0
|
410 |
+
virtualenv==20.13.1
|
411 |
+
wasabi==0.9.0
|
412 |
+
wcwidth==0.2.5
|
413 |
+
webencodings==0.5.1
|
414 |
+
Werkzeug==1.0.1
|
415 |
+
widgetsnbextension==3.5.2
|
416 |
+
wordcloud==1.5.0
|
417 |
+
wrapt==1.13.3
|
418 |
+
xarray==0.18.2
|
419 |
+
xgboost==0.90
|
420 |
+
xkit==0.0.0
|
421 |
+
xlrd==1.1.0
|
422 |
+
xlwt==1.3.0
|
423 |
+
xxhash==2.0.2
|
424 |
+
yamllint==1.23.0
|
425 |
+
yarl==1.7.2
|
426 |
+
yellowbrick==1.3.post1
|
427 |
+
zict==2.0.0
|
428 |
+
zipp==3.7.0
|
429 |
+
==============================
|
430 |
+
Git revision:
|
431 |
+
9d56d508
|
432 |
+
==============================
|
433 |
+
Cuda version:
|
434 |
+
11.1
|
fluent-speech-commands/direct/results/BPE51/112011/hyperparams.yaml
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Generated 2022-02-15 from:
|
2 |
+
# /content/speechbrain/recipes/fluent-speech-commands/direct/hparams/train.yaml
|
3 |
+
# yamllint disable
|
4 |
+
# ############################################################################
|
5 |
+
# Model: Direct SLU
|
6 |
+
# Encoder: Pre-trained ASR encoder -> LSTM
|
7 |
+
# Decoder: GRU + beamsearch
|
8 |
+
# Tokens: BPE with unigram
|
9 |
+
# losses: NLL
|
10 |
+
# Training: Fluent Speech Commands
|
11 |
+
# Authors: Loren Lugosch, Mirco Ravanelli 2020
|
12 |
+
# ############################################################################
|
13 |
+
|
14 |
+
# Seed needs to be set at top of yaml, before objects with parameters are made
|
15 |
+
seed: 112011
|
16 |
+
__set_seed: !apply:torch.manual_seed [112011]
|
17 |
+
output_folder: results/BPE51/112011
|
18 |
+
save_folder: results/BPE51/112011/save
|
19 |
+
train_log: results/BPE51/112011/train_log.txt
|
20 |
+
|
21 |
+
# Data files
|
22 |
+
data_folder: /content/fluent_speech_commands_dataset
|
23 |
+
# e.g, /localscratch/fluent_speech_commands_dataset
|
24 |
+
rir_folder: /content/fluent_speech_commands_dataset # Change it if needed
|
25 |
+
csv_train: results/BPE51/112011/train.csv
|
26 |
+
csv_valid: results/BPE51/112011/valid.csv
|
27 |
+
csv_test: results/BPE51/112011/test.csv
|
28 |
+
tokenizer_file: https://www.dropbox.com/s/hvf2huofnq0sjbn/51_unigram.model?dl=1
|
29 |
+
skip_prep: false
|
30 |
+
# Training parameters
|
31 |
+
number_of_epochs: 4
|
32 |
+
batch_size: 8
|
33 |
+
lr: 0.0003
|
34 |
+
token_type: unigram # ["unigram", "bpe", "char"]
|
35 |
+
sorting: random
|
36 |
+
|
37 |
+
# Model parameters
|
38 |
+
sample_rate: 16000
|
39 |
+
emb_size: 128
|
40 |
+
dec_neurons: 512
|
41 |
+
output_neurons: 51 # index(eos/bos) = 0
|
42 |
+
ASR_encoder_dim: 512
|
43 |
+
encoder_dim: 256
|
44 |
+
|
45 |
+
# Decoding parameters
|
46 |
+
bos_index: 0
|
47 |
+
eos_index: 0
|
48 |
+
min_decode_ratio: 0.0
|
49 |
+
max_decode_ratio: 10.0
|
50 |
+
slu_beam_size: 80
|
51 |
+
eos_threshold: 1.5
|
52 |
+
temperature: 1.25
|
53 |
+
|
54 |
+
dataloader_opts:
|
55 |
+
batch_size: 8
|
56 |
+
shuffle: true
|
57 |
+
|
58 |
+
epoch_counter: &id013 !new:speechbrain.utils.epoch_loop.EpochCounter
|
59 |
+
|
60 |
+
limit: 4
|
61 |
+
|
62 |
+
# Models
|
63 |
+
asr_model: !apply:speechbrain.pretrained.EncoderDecoderASR.from_hparams
|
64 |
+
source: speechbrain/asr-crdnn-rnnlm-librispeech
|
65 |
+
run_opts: {device: cuda:0}
|
66 |
+
|
67 |
+
slu_enc: &id006 !new:speechbrain.nnet.containers.Sequential
|
68 |
+
input_shape: [null, null, 512]
|
69 |
+
lstm: !new:speechbrain.nnet.RNN.LSTM
|
70 |
+
input_size: 512
|
71 |
+
bidirectional: true
|
72 |
+
hidden_size: 256
|
73 |
+
num_layers: 2
|
74 |
+
linear: !new:speechbrain.nnet.linear.Linear
|
75 |
+
input_size: 512
|
76 |
+
n_neurons: 256
|
77 |
+
|
78 |
+
output_emb: &id007 !new:speechbrain.nnet.embedding.Embedding
|
79 |
+
num_embeddings: 51
|
80 |
+
embedding_dim: 128
|
81 |
+
|
82 |
+
dec: &id008 !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
|
83 |
+
enc_dim: 256
|
84 |
+
input_size: 128
|
85 |
+
rnn_type: gru
|
86 |
+
attn_type: keyvalue
|
87 |
+
hidden_size: 512
|
88 |
+
attn_dim: 512
|
89 |
+
num_layers: 3
|
90 |
+
scaling: 1.0
|
91 |
+
dropout: 0.0
|
92 |
+
|
93 |
+
seq_lin: &id009 !new:speechbrain.nnet.linear.Linear
|
94 |
+
|
95 |
+
input_size: 512
|
96 |
+
n_neurons: 51
|
97 |
+
|
98 |
+
augment_wavedrop: &id001 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
|
99 |
+
sample_rate: 16000
|
100 |
+
speeds: [100]
|
101 |
+
|
102 |
+
augment_speed: &id002 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
|
103 |
+
sample_rate: 16000
|
104 |
+
speeds: [95, 100, 105]
|
105 |
+
|
106 |
+
add_rev: &id003 !new:speechbrain.lobes.augment.EnvCorrupt
|
107 |
+
openrir_folder: /content/fluent_speech_commands_dataset
|
108 |
+
openrir_max_noise_len: 3.0 # seconds
|
109 |
+
reverb_prob: 1.0
|
110 |
+
noise_prob: 0.0
|
111 |
+
noise_snr_low: 0
|
112 |
+
noise_snr_high: 15
|
113 |
+
rir_scale_factor: 1.0
|
114 |
+
|
115 |
+
add_noise: &id004 !new:speechbrain.lobes.augment.EnvCorrupt
|
116 |
+
openrir_folder: /content/fluent_speech_commands_dataset
|
117 |
+
openrir_max_noise_len: 3.0 # seconds
|
118 |
+
reverb_prob: 0.0
|
119 |
+
noise_prob: 1.0
|
120 |
+
noise_snr_low: 0
|
121 |
+
noise_snr_high: 15
|
122 |
+
rir_scale_factor: 1.0
|
123 |
+
|
124 |
+
add_rev_noise: &id005 !new:speechbrain.lobes.augment.EnvCorrupt
|
125 |
+
openrir_folder: /content/fluent_speech_commands_dataset
|
126 |
+
openrir_max_noise_len: 3.0 # seconds
|
127 |
+
reverb_prob: 1.0
|
128 |
+
noise_prob: 1.0
|
129 |
+
noise_snr_low: 0
|
130 |
+
noise_snr_high: 15
|
131 |
+
rir_scale_factor: 1.0
|
132 |
+
|
133 |
+
|
134 |
+
augment_pipeline: [*id001, *id002, *id003, *id004, *id005]
|
135 |
+
|
136 |
+
|
137 |
+
modules:
|
138 |
+
augment_wavedrop: *id001
|
139 |
+
augment_speed: *id002
|
140 |
+
add_rev: *id003
|
141 |
+
add_noise: *id004
|
142 |
+
add_rev_noise: *id005
|
143 |
+
slu_enc: *id006
|
144 |
+
output_emb: *id007
|
145 |
+
dec: *id008
|
146 |
+
seq_lin: *id009
|
147 |
+
model: &id011 !new:torch.nn.ModuleList
|
148 |
+
- [*id006, *id007, *id008, *id009]
|
149 |
+
tokenizer: &id010 !new:sentencepiece.SentencePieceProcessor
|
150 |
+
|
151 |
+
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
152 |
+
collect_in: results/BPE51/112011/save/FSC_tokenizer
|
153 |
+
loadables:
|
154 |
+
tokenizer: *id010
|
155 |
+
paths:
|
156 |
+
tokenizer: https://www.dropbox.com/s/hvf2huofnq0sjbn/51_unigram.model?dl=1
|
157 |
+
|
158 |
+
beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
|
159 |
+
embedding: *id007
|
160 |
+
decoder: *id008
|
161 |
+
linear: *id009
|
162 |
+
bos_index: 0
|
163 |
+
eos_index: 0
|
164 |
+
min_decode_ratio: 0.0
|
165 |
+
max_decode_ratio: 10.0
|
166 |
+
beam_size: 80
|
167 |
+
eos_threshold: 1.5
|
168 |
+
temperature: 1.25
|
169 |
+
using_max_attn_shift: false
|
170 |
+
max_attn_shift: 30
|
171 |
+
coverage_penalty: 0.
|
172 |
+
|
173 |
+
opt_class: !name:torch.optim.Adam
|
174 |
+
lr: 0.0003
|
175 |
+
|
176 |
+
lr_annealing: &id012 !new:speechbrain.nnet.schedulers.NewBobScheduler
|
177 |
+
initial_value: 0.0003
|
178 |
+
improvement_threshold: 0.0025
|
179 |
+
annealing_factor: 0.8
|
180 |
+
patient: 0
|
181 |
+
|
182 |
+
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
|
183 |
+
checkpoints_dir: results/BPE51/112011/save
|
184 |
+
recoverables:
|
185 |
+
model: *id011
|
186 |
+
scheduler: *id012
|
187 |
+
counter: *id013
|
188 |
+
log_softmax: !new:speechbrain.nnet.activations.Softmax
|
189 |
+
apply_log: true
|
190 |
+
|
191 |
+
seq_cost: !name:speechbrain.nnet.losses.nll_loss
|
192 |
+
label_smoothing: 0.1
|
193 |
+
|
194 |
+
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
|
195 |
+
save_file: results/BPE51/112011/train_log.txt
|
196 |
+
|
197 |
+
error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
|
198 |
+
|
199 |
+
cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
|
200 |
+
split_tokens: true
|
fluent-speech-commands/direct/results/BPE51/112011/log.txt
ADDED
@@ -0,0 +1,454 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2022-02-15 02:15:13,142 - speechbrain.core - INFO - Beginning experiment!
|
2 |
+
2022-02-15 02:15:13,142 - speechbrain.core - INFO - Experiment folder: results/BPE51/112011
|
3 |
+
2022-02-15 02:15:14,668 - speechbrain.utils.superpowers - DEBUG - absl-py==1.0.0
|
4 |
+
aiohttp==3.8.1
|
5 |
+
aiosignal==1.2.0
|
6 |
+
alabaster==0.7.12
|
7 |
+
albumentations==0.1.12
|
8 |
+
altair==4.2.0
|
9 |
+
appdirs==1.4.4
|
10 |
+
argon2-cffi==21.3.0
|
11 |
+
argon2-cffi-bindings==21.2.0
|
12 |
+
arviz==0.11.4
|
13 |
+
astor==0.8.1
|
14 |
+
astropy==4.3.1
|
15 |
+
astunparse==1.6.3
|
16 |
+
async-timeout==4.0.2
|
17 |
+
asynctest==0.13.0
|
18 |
+
atari-py==0.2.9
|
19 |
+
atomicwrites==1.4.0
|
20 |
+
attrs==21.4.0
|
21 |
+
audioread==2.1.9
|
22 |
+
autograd==1.3
|
23 |
+
Babel==2.9.1
|
24 |
+
backcall==0.2.0
|
25 |
+
beautifulsoup4==4.6.3
|
26 |
+
black==19.10b0
|
27 |
+
bleach==4.1.0
|
28 |
+
blis==0.4.1
|
29 |
+
bokeh==2.3.3
|
30 |
+
Bottleneck==1.3.2
|
31 |
+
branca==0.4.2
|
32 |
+
bs4==0.0.1
|
33 |
+
CacheControl==0.12.10
|
34 |
+
cached-property==1.5.2
|
35 |
+
cachetools==4.2.4
|
36 |
+
catalogue==1.0.0
|
37 |
+
certifi==2021.10.8
|
38 |
+
cffi==1.15.0
|
39 |
+
cfgv==3.3.1
|
40 |
+
cftime==1.5.2
|
41 |
+
chardet==3.0.4
|
42 |
+
charset-normalizer==2.0.11
|
43 |
+
click==7.1.2
|
44 |
+
cloudpickle==1.3.0
|
45 |
+
cmake==3.12.0
|
46 |
+
cmdstanpy==0.9.5
|
47 |
+
colorcet==3.0.0
|
48 |
+
colorlover==0.3.0
|
49 |
+
community==1.0.0b1
|
50 |
+
contextlib2==0.5.5
|
51 |
+
convertdate==2.4.0
|
52 |
+
coverage==3.7.1
|
53 |
+
coveralls==0.5
|
54 |
+
crcmod==1.7
|
55 |
+
cufflinks==0.17.3
|
56 |
+
cupy-cuda111==9.4.0
|
57 |
+
cvxopt==1.2.7
|
58 |
+
cvxpy==1.0.31
|
59 |
+
cycler==0.11.0
|
60 |
+
cymem==2.0.6
|
61 |
+
Cython==0.29.27
|
62 |
+
daft==0.0.4
|
63 |
+
dask==2.12.0
|
64 |
+
datascience==0.10.6
|
65 |
+
datasets==1.18.3
|
66 |
+
debugpy==1.0.0
|
67 |
+
decorator==4.4.2
|
68 |
+
defusedxml==0.7.1
|
69 |
+
descartes==1.1.0
|
70 |
+
dill==0.3.4
|
71 |
+
distlib==0.3.4
|
72 |
+
distributed==1.25.3
|
73 |
+
dlib @ file:///dlib-19.18.0-cp37-cp37m-linux_x86_64.whl
|
74 |
+
dm-tree==0.1.6
|
75 |
+
docopt==0.6.2
|
76 |
+
docutils==0.17.1
|
77 |
+
dopamine-rl==1.0.5
|
78 |
+
earthengine-api==0.1.297
|
79 |
+
easydict==1.9
|
80 |
+
ecos==2.0.10
|
81 |
+
editdistance==0.5.3
|
82 |
+
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz
|
83 |
+
entrypoints==0.3
|
84 |
+
ephem==4.1.3
|
85 |
+
et-xmlfile==1.1.0
|
86 |
+
fa2==0.3.5
|
87 |
+
fastai==1.0.61
|
88 |
+
fastdtw==0.3.4
|
89 |
+
fastprogress==1.0.0
|
90 |
+
fastrlock==0.8
|
91 |
+
fbprophet==0.7.1
|
92 |
+
feather-format==0.4.1
|
93 |
+
filelock==3.4.2
|
94 |
+
firebase-admin==4.4.0
|
95 |
+
fix-yahoo-finance==0.0.22
|
96 |
+
flake8==3.7.9
|
97 |
+
Flask==1.1.4
|
98 |
+
flatbuffers==2.0
|
99 |
+
folium==0.8.3
|
100 |
+
frozenlist==1.3.0
|
101 |
+
fsspec==2022.1.0
|
102 |
+
future==0.16.0
|
103 |
+
gast==0.4.0
|
104 |
+
GDAL==2.2.2
|
105 |
+
gdown==4.2.1
|
106 |
+
gensim==3.6.0
|
107 |
+
geographiclib==1.52
|
108 |
+
geopy==1.17.0
|
109 |
+
gin-config==0.5.0
|
110 |
+
glob2==0.7
|
111 |
+
google==2.0.3
|
112 |
+
google-api-core==1.26.3
|
113 |
+
google-api-python-client==1.12.10
|
114 |
+
google-auth==1.35.0
|
115 |
+
google-auth-httplib2==0.0.4
|
116 |
+
google-auth-oauthlib==0.4.6
|
117 |
+
google-cloud-bigquery==1.21.0
|
118 |
+
google-cloud-bigquery-storage==1.1.0
|
119 |
+
google-cloud-core==1.0.3
|
120 |
+
google-cloud-datastore==1.8.0
|
121 |
+
google-cloud-firestore==1.7.0
|
122 |
+
google-cloud-language==1.2.0
|
123 |
+
google-cloud-storage==1.18.1
|
124 |
+
google-cloud-translate==1.5.0
|
125 |
+
google-colab @ file:///colabtools/dist/google-colab-1.0.0.tar.gz
|
126 |
+
google-pasta==0.2.0
|
127 |
+
google-resumable-media==0.4.1
|
128 |
+
googleapis-common-protos==1.54.0
|
129 |
+
googledrivedownloader==0.4
|
130 |
+
graphviz==0.10.1
|
131 |
+
greenlet==1.1.2
|
132 |
+
grpcio==1.43.0
|
133 |
+
gspread==3.4.2
|
134 |
+
gspread-dataframe==3.0.8
|
135 |
+
gym==0.17.3
|
136 |
+
h5py==3.1.0
|
137 |
+
HeapDict==1.0.1
|
138 |
+
hijri-converter==2.2.2
|
139 |
+
holidays==0.10.5.2
|
140 |
+
holoviews==1.14.7
|
141 |
+
html5lib==1.0.1
|
142 |
+
httpimport==0.5.18
|
143 |
+
httplib2==0.17.4
|
144 |
+
httplib2shim==0.0.3
|
145 |
+
huggingface-hub==0.4.0
|
146 |
+
humanize==0.5.1
|
147 |
+
hyperopt==0.1.2
|
148 |
+
HyperPyYAML==1.0.0
|
149 |
+
ideep4py==2.0.0.post3
|
150 |
+
identify==2.4.10
|
151 |
+
idna==2.10
|
152 |
+
imageio==2.4.1
|
153 |
+
imagesize==1.3.0
|
154 |
+
imbalanced-learn==0.8.1
|
155 |
+
imblearn==0.0
|
156 |
+
imgaug==0.2.9
|
157 |
+
importlib-metadata==4.10.1
|
158 |
+
importlib-resources==5.4.0
|
159 |
+
imutils==0.5.4
|
160 |
+
inflect==2.1.0
|
161 |
+
iniconfig==1.1.1
|
162 |
+
intel-openmp==2022.0.2
|
163 |
+
intervaltree==2.1.0
|
164 |
+
ipykernel==4.10.1
|
165 |
+
ipython==5.5.0
|
166 |
+
ipython-genutils==0.2.0
|
167 |
+
ipython-sql==0.3.9
|
168 |
+
ipywidgets==7.6.5
|
169 |
+
itsdangerous==1.1.0
|
170 |
+
jax==0.2.25
|
171 |
+
jaxlib @ https://storage.googleapis.com/jax-releases/cuda111/jaxlib-0.1.71+cuda111-cp37-none-manylinux2010_x86_64.whl
|
172 |
+
jedi==0.18.1
|
173 |
+
jieba==0.42.1
|
174 |
+
Jinja2==2.11.3
|
175 |
+
joblib==1.1.0
|
176 |
+
jpeg4py==0.1.4
|
177 |
+
jsonschema==4.3.3
|
178 |
+
jupyter==1.0.0
|
179 |
+
jupyter-client==5.3.5
|
180 |
+
jupyter-console==5.2.0
|
181 |
+
jupyter-core==4.9.1
|
182 |
+
jupyterlab-pygments==0.1.2
|
183 |
+
jupyterlab-widgets==1.0.2
|
184 |
+
kaggle==1.5.12
|
185 |
+
kapre==0.3.7
|
186 |
+
keras==2.7.0
|
187 |
+
Keras-Preprocessing==1.1.2
|
188 |
+
keras-vis==0.4.1
|
189 |
+
kiwisolver==1.3.2
|
190 |
+
korean-lunar-calendar==0.2.1
|
191 |
+
libclang==13.0.0
|
192 |
+
librosa==0.9.0
|
193 |
+
lightgbm==2.2.3
|
194 |
+
llvmlite==0.34.0
|
195 |
+
lmdb==0.99
|
196 |
+
LunarCalendar==0.0.9
|
197 |
+
lxml==4.2.6
|
198 |
+
Markdown==3.3.6
|
199 |
+
MarkupSafe==2.0.1
|
200 |
+
matplotlib==3.2.2
|
201 |
+
matplotlib-inline==0.1.3
|
202 |
+
matplotlib-venn==0.11.6
|
203 |
+
mccabe==0.6.1
|
204 |
+
missingno==0.5.0
|
205 |
+
mistune==0.8.4
|
206 |
+
mizani==0.6.0
|
207 |
+
mkl==2019.0
|
208 |
+
mlxtend==0.14.0
|
209 |
+
more-itertools==8.12.0
|
210 |
+
moviepy==0.2.3.5
|
211 |
+
mpmath==1.2.1
|
212 |
+
msgpack==1.0.3
|
213 |
+
multidict==6.0.2
|
214 |
+
multiprocess==0.70.12.2
|
215 |
+
multitasking==0.0.10
|
216 |
+
murmurhash==1.0.6
|
217 |
+
music21==5.5.0
|
218 |
+
natsort==5.5.0
|
219 |
+
nbclient==0.5.10
|
220 |
+
nbconvert==5.6.1
|
221 |
+
nbformat==5.1.3
|
222 |
+
nest-asyncio==1.5.4
|
223 |
+
netCDF4==1.5.8
|
224 |
+
networkx==2.6.3
|
225 |
+
nibabel==3.0.2
|
226 |
+
nltk==3.2.5
|
227 |
+
nodeenv==1.6.0
|
228 |
+
notebook==5.3.1
|
229 |
+
numba==0.51.2
|
230 |
+
numexpr==2.8.1
|
231 |
+
numpy==1.19.5
|
232 |
+
nvidia-ml-py3==7.352.0
|
233 |
+
oauth2client==4.1.3
|
234 |
+
oauthlib==3.2.0
|
235 |
+
okgrade==0.4.3
|
236 |
+
opencv-contrib-python==4.1.2.30
|
237 |
+
opencv-python==4.1.2.30
|
238 |
+
openpyxl==3.0.9
|
239 |
+
opt-einsum==3.3.0
|
240 |
+
osqp==0.6.2.post0
|
241 |
+
packaging==21.3
|
242 |
+
palettable==3.3.0
|
243 |
+
pandas==1.3.5
|
244 |
+
pandas-datareader==0.9.0
|
245 |
+
pandas-gbq==0.13.3
|
246 |
+
pandas-profiling==1.4.1
|
247 |
+
pandocfilters==1.5.0
|
248 |
+
panel==0.12.1
|
249 |
+
param==1.12.0
|
250 |
+
parso==0.8.3
|
251 |
+
pathlib==1.0.1
|
252 |
+
pathspec==0.9.0
|
253 |
+
patsy==0.5.2
|
254 |
+
pep517==0.12.0
|
255 |
+
pexpect==4.8.0
|
256 |
+
pickleshare==0.7.5
|
257 |
+
Pillow==7.1.2
|
258 |
+
pip-tools==6.2.0
|
259 |
+
plac==1.1.3
|
260 |
+
platformdirs==2.5.0
|
261 |
+
plotly==5.5.0
|
262 |
+
plotnine==0.6.0
|
263 |
+
pluggy==0.13.1
|
264 |
+
pooch==1.6.0
|
265 |
+
portpicker==1.3.9
|
266 |
+
pre-commit==2.17.0
|
267 |
+
prefetch-generator==1.0.1
|
268 |
+
preshed==3.0.6
|
269 |
+
prettytable==3.0.0
|
270 |
+
progressbar2==3.38.0
|
271 |
+
prometheus-client==0.13.1
|
272 |
+
promise==2.3
|
273 |
+
prompt-toolkit==1.0.18
|
274 |
+
protobuf==3.17.3
|
275 |
+
psutil==5.4.8
|
276 |
+
psycopg2==2.7.6.1
|
277 |
+
ptyprocess==0.7.0
|
278 |
+
py==1.11.0
|
279 |
+
pyarrow==6.0.1
|
280 |
+
pyasn1==0.4.8
|
281 |
+
pyasn1-modules==0.2.8
|
282 |
+
pycocotools==2.0.4
|
283 |
+
pycodestyle==2.5.0
|
284 |
+
pycparser==2.21
|
285 |
+
pyct==0.4.8
|
286 |
+
pydata-google-auth==1.3.0
|
287 |
+
pydot==1.3.0
|
288 |
+
pydot-ng==2.0.0
|
289 |
+
pydotplus==2.0.2
|
290 |
+
PyDrive==1.3.1
|
291 |
+
pyemd==0.5.1
|
292 |
+
pyerfa==2.0.0.1
|
293 |
+
pyflakes==2.1.1
|
294 |
+
pyglet==1.5.0
|
295 |
+
Pygments==2.6.1
|
296 |
+
pygobject==3.26.1
|
297 |
+
pymc3==3.11.4
|
298 |
+
PyMeeus==0.5.11
|
299 |
+
pymongo==4.0.1
|
300 |
+
pymystem3==0.2.0
|
301 |
+
PyOpenGL==3.1.5
|
302 |
+
pyparsing==3.0.7
|
303 |
+
pyrsistent==0.18.1
|
304 |
+
pysndfile==1.3.8
|
305 |
+
PySocks==1.7.1
|
306 |
+
pystan==2.19.1.1
|
307 |
+
pytest==5.4.1
|
308 |
+
python-apt==0.0.0
|
309 |
+
python-chess==0.23.11
|
310 |
+
python-dateutil==2.8.2
|
311 |
+
python-louvain==0.16
|
312 |
+
python-slugify==5.0.2
|
313 |
+
python-utils==3.1.0
|
314 |
+
pytz==2018.9
|
315 |
+
pyviz-comms==2.1.0
|
316 |
+
PyWavelets==1.2.0
|
317 |
+
PyYAML==6.0
|
318 |
+
pyzmq==22.3.0
|
319 |
+
qdldl==0.1.5.post0
|
320 |
+
qtconsole==5.2.2
|
321 |
+
QtPy==2.0.1
|
322 |
+
regex==2019.12.20
|
323 |
+
requests==2.23.0
|
324 |
+
requests-oauthlib==1.3.1
|
325 |
+
resampy==0.2.2
|
326 |
+
rpy2==3.4.5
|
327 |
+
rsa==4.8
|
328 |
+
ruamel.yaml==0.17.21
|
329 |
+
ruamel.yaml.clib==0.2.6
|
330 |
+
sacremoses==0.0.47
|
331 |
+
scikit-image==0.18.3
|
332 |
+
scikit-learn==1.0.2
|
333 |
+
scipy==1.4.1
|
334 |
+
screen-resolution-extra==0.0.0
|
335 |
+
scs==3.1.0
|
336 |
+
seaborn==0.11.2
|
337 |
+
semver==2.13.0
|
338 |
+
Send2Trash==1.8.0
|
339 |
+
sentencepiece==0.1.96
|
340 |
+
setuptools-git==1.2
|
341 |
+
Shapely==1.8.0
|
342 |
+
simplegeneric==0.8.1
|
343 |
+
six==1.15.0
|
344 |
+
sklearn==0.0
|
345 |
+
sklearn-pandas==1.8.0
|
346 |
+
smart-open==5.2.1
|
347 |
+
snowballstemmer==2.2.0
|
348 |
+
sortedcontainers==2.4.0
|
349 |
+
SoundFile==0.10.3.post1
|
350 |
+
spacy==2.2.4
|
351 |
+
speechbrain==0.5.11
|
352 |
+
Sphinx==1.8.6
|
353 |
+
sphinxcontrib-serializinghtml==1.1.5
|
354 |
+
sphinxcontrib-websupport==1.2.4
|
355 |
+
SQLAlchemy==1.4.31
|
356 |
+
sqlparse==0.4.2
|
357 |
+
srsly==1.0.5
|
358 |
+
statsmodels==0.10.2
|
359 |
+
sympy==1.7.1
|
360 |
+
tables==3.7.0
|
361 |
+
tabulate==0.8.9
|
362 |
+
tblib==1.7.0
|
363 |
+
tenacity==8.0.1
|
364 |
+
tensorboard==2.7.0
|
365 |
+
tensorboard-data-server==0.6.1
|
366 |
+
tensorboard-plugin-wit==1.8.1
|
367 |
+
tensorflow @ file:///tensorflow-2.7.0-cp37-cp37m-linux_x86_64.whl
|
368 |
+
tensorflow-datasets==4.0.1
|
369 |
+
tensorflow-estimator==2.7.0
|
370 |
+
tensorflow-gcs-config==2.7.0
|
371 |
+
tensorflow-hub==0.12.0
|
372 |
+
tensorflow-io-gcs-filesystem==0.24.0
|
373 |
+
tensorflow-metadata==1.6.0
|
374 |
+
tensorflow-probability==0.15.0
|
375 |
+
termcolor==1.1.0
|
376 |
+
terminado==0.13.1
|
377 |
+
testpath==0.5.0
|
378 |
+
text-unidecode==1.3
|
379 |
+
textblob==0.15.3
|
380 |
+
Theano-PyMC==1.1.2
|
381 |
+
thinc==7.4.0
|
382 |
+
threadpoolctl==3.1.0
|
383 |
+
tifffile==2021.11.2
|
384 |
+
tokenizers==0.11.4
|
385 |
+
toml==0.10.2
|
386 |
+
tomli==2.0.0
|
387 |
+
toolz==0.11.2
|
388 |
+
torch @ https://download.pytorch.org/whl/cu111/torch-1.10.0%2Bcu111-cp37-cp37m-linux_x86_64.whl
|
389 |
+
torchaudio @ https://download.pytorch.org/whl/cu111/torchaudio-0.10.0%2Bcu111-cp37-cp37m-linux_x86_64.whl
|
390 |
+
torchsummary==1.5.1
|
391 |
+
torchtext==0.11.0
|
392 |
+
torchvision @ https://download.pytorch.org/whl/cu111/torchvision-0.11.1%2Bcu111-cp37-cp37m-linux_x86_64.whl
|
393 |
+
tornado==5.1.1
|
394 |
+
tqdm==4.62.3
|
395 |
+
traitlets==5.1.1
|
396 |
+
transformers==4.16.2
|
397 |
+
tweepy==3.10.0
|
398 |
+
typed-ast==1.5.2
|
399 |
+
typeguard==2.7.1
|
400 |
+
typing-extensions==3.10.0.2
|
401 |
+
tzlocal==1.5.1
|
402 |
+
uritemplate==3.0.1
|
403 |
+
urllib3==1.24.3
|
404 |
+
vega-datasets==0.9.0
|
405 |
+
virtualenv==20.13.1
|
406 |
+
wasabi==0.9.0
|
407 |
+
wcwidth==0.2.5
|
408 |
+
webencodings==0.5.1
|
409 |
+
Werkzeug==1.0.1
|
410 |
+
widgetsnbextension==3.5.2
|
411 |
+
wordcloud==1.5.0
|
412 |
+
wrapt==1.13.3
|
413 |
+
xarray==0.18.2
|
414 |
+
xgboost==0.90
|
415 |
+
xkit==0.0.0
|
416 |
+
xlrd==1.1.0
|
417 |
+
xlwt==1.3.0
|
418 |
+
xxhash==2.0.2
|
419 |
+
yamllint==1.23.0
|
420 |
+
yarl==1.7.2
|
421 |
+
yellowbrick==1.3.post1
|
422 |
+
zict==2.0.0
|
423 |
+
zipp==3.7.0
|
424 |
+
|
425 |
+
|
426 |
+
2022-02-15 02:15:14,791 - speechbrain.utils.superpowers - DEBUG - 9d56d508
|
427 |
+
|
428 |
+
|
429 |
+
2022-02-15 02:15:14,794 - prepare - INFO - Preparing results/BPE51/112011/train.csv...
|
430 |
+
2022-02-15 02:15:51,789 - prepare - INFO - Preparing results/BPE51/112011/valid.csv...
|
431 |
+
2022-02-15 02:15:54,371 - prepare - INFO - Preparing results/BPE51/112011/test.csv...
|
432 |
+
2022-02-15 02:15:57,696 - speechbrain.utils.parameter_transfer - DEBUG - Collecting files (or symlinks) for pretraining in results/BPE51/112011/save/FSC_tokenizer.
|
433 |
+
2022-02-15 02:15:57,696 - speechbrain.pretrained.fetching - INFO - Fetch 51_unigram.model?dl=1: Downloading from normal URL https://www.dropbox.com/s/hvf2huofnq0sjbn/51_unigram.model?dl=1.
|
434 |
+
2022-02-15 02:15:58,662 - speechbrain.utils.parameter_transfer - INFO - Loading pretrained files for: tokenizer
|
435 |
+
2022-02-15 02:15:58,689 - speechbrain.core - INFO - 9.3M trainable parameters in SLU
|
436 |
+
2022-02-15 02:15:58,690 - speechbrain.utils.checkpoints - INFO - Would load a checkpoint here, but none found yet.
|
437 |
+
2022-02-15 02:15:58,690 - speechbrain.utils.epoch_loop - INFO - Going into epoch 1
|
438 |
+
2022-02-15 02:55:37,944 - speechbrain.utils.train_logger - INFO - epoch: 1, lr: 3.00e-04 - train loss: 7.70e-01 - valid loss: 7.10e-01, valid CER: 3.82e-01, valid WER: 1.01
|
439 |
+
2022-02-15 02:55:38,203 - speechbrain.utils.checkpoints - INFO - Saved an end-of-epoch checkpoint in results/BPE51/112011/save/CKPT+2022-02-15+02-55-37+00
|
440 |
+
2022-02-15 02:55:38,208 - speechbrain.utils.epoch_loop - INFO - Going into epoch 2
|
441 |
+
2022-02-15 03:34:48,102 - speechbrain.utils.train_logger - INFO - epoch: 2, lr: 3.00e-04 - train loss: 7.10e-01 - valid loss: 7.08e-01, valid CER: 3.45e-01, valid WER: 8.11e-01
|
442 |
+
2022-02-15 03:34:48,368 - speechbrain.utils.checkpoints - INFO - Saved an end-of-epoch checkpoint in results/BPE51/112011/save/CKPT+2022-02-15+03-34-48+00
|
443 |
+
2022-02-15 03:34:48,400 - speechbrain.utils.checkpoints - INFO - Deleted checkpoint in results/BPE51/112011/save/CKPT+2022-02-15+02-55-37+00
|
444 |
+
2022-02-15 03:34:48,401 - speechbrain.utils.epoch_loop - INFO - Going into epoch 3
|
445 |
+
2022-02-15 04:14:03,140 - speechbrain.utils.train_logger - INFO - epoch: 3, lr: 3.00e-04 - train loss: 7.08e-01 - valid loss: 7.07e-01, valid CER: 2.30e-01, valid WER: 5.79e-01
|
446 |
+
2022-02-15 04:14:03,373 - speechbrain.utils.checkpoints - INFO - Saved an end-of-epoch checkpoint in results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00
|
447 |
+
2022-02-15 04:14:03,429 - speechbrain.utils.checkpoints - INFO - Deleted checkpoint in results/BPE51/112011/save/CKPT+2022-02-15+03-34-48+00
|
448 |
+
2022-02-15 04:14:03,430 - speechbrain.utils.epoch_loop - INFO - Going into epoch 4
|
449 |
+
2022-02-15 04:53:03,184 - speechbrain.nnet.schedulers - INFO - Changing lr from 0.0003 to 0.00024
|
450 |
+
2022-02-15 04:53:03,184 - speechbrain.utils.train_logger - INFO - epoch: 4, lr: 3.00e-04 - train loss: 7.08e-01 - valid loss: 7.07e-01, valid CER: 3.17e-01, valid WER: 7.26e-01
|
451 |
+
2022-02-15 04:53:03,440 - speechbrain.utils.checkpoints - INFO - Saved an end-of-epoch checkpoint in results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00
|
452 |
+
2022-02-15 04:53:03,473 - speechbrain.utils.checkpoints - INFO - Loading a checkpoint from results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00
|
453 |
+
2022-02-15 04:53:03,520 - root - DEBUG - SaveableDataLoader was requested to load a checkpoint, but the DataLoader has already been iterated. The DataLoader file will be ignored. This is normal in evaluation, when a checkpoint is loaded just to retrieve the best model.
|
454 |
+
2022-02-15 04:56:32,698 - speechbrain.utils.train_logger - INFO - Epoch loaded: 4 - test loss: 7.05e-01, test CER: 2.37e-02, test WER: 6.91e-02
|
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/CKPT.yaml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# yamllint disable
|
2 |
+
WER: 0.5789473684210527
|
3 |
+
end-of-epoch: true
|
4 |
+
unixtime: 1644898443.140418
|
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/brain.ckpt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
avg_train_loss: 0.0
|
2 |
+
step: 0
|
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/counter.ckpt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
3
|
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/dataloader-TRAIN.ckpt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
2892
|
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/model.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:66f0875b693545510c3de685a7b12825be269f311f47dff8e4dc09167aae8943
|
3 |
+
size 37181975
|
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/optimizer.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c6a1d3d05b4a82113bfece0e7dfb5a3fe11e38cd02fcc01c9e2a367cd1b0f444
|
3 |
+
size 74367087
|
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-14-03+00/scheduler.ckpt
ADDED
Binary file (495 Bytes). View file
|
|
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/CKPT.yaml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# yamllint disable
|
2 |
+
WER: 0.7263157894736842
|
3 |
+
end-of-epoch: true
|
4 |
+
unixtime: 1644900783.1849935
|
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/brain.ckpt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
avg_train_loss: 0.0
|
2 |
+
step: 0
|
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/counter.ckpt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
4
|
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/dataloader-TRAIN.ckpt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
2892
|
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/model.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:91b0d3ed61e45b003d62e451327473a9cfc56f64c15368ea8caafd09f3f1b948
|
3 |
+
size 37181975
|
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/optimizer.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ce37a00b1a444e7e8b1076ec10f430e63b8c12c77406cb3aa88353b95ab1a17f
|
3 |
+
size 74367087
|
fluent-speech-commands/direct/results/BPE51/112011/save/CKPT+2022-02-15+04-53-03+00/scheduler.ckpt
ADDED
Binary file (495 Bytes). View file
|
|
fluent-speech-commands/direct/results/BPE51/112011/save/FSC_tokenizer/tokenizer.ckpt
ADDED
Binary file (238 kB). View file
|
|
fluent-speech-commands/direct/results/BPE51/112011/test.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
fluent-speech-commands/direct/results/BPE51/112011/train.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
fluent-speech-commands/direct/results/BPE51/112011/train.py
ADDED
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Recipe for "direct" (speech -> semantics) SLU with ASR-based transfer learning.
|
4 |
+
|
5 |
+
We encode input waveforms into features using a model trained on LibriSpeech,
|
6 |
+
then feed the features into a seq2seq model to map them to semantics.
|
7 |
+
|
8 |
+
(Adapted from the LibriSpeech seq2seq ASR recipe written by Ju-Chieh Chou, Mirco Ravanelli, Abdel Heba, and Peter Plantinga.)
|
9 |
+
|
10 |
+
Run using:
|
11 |
+
> python train.py hparams/train.yaml
|
12 |
+
|
13 |
+
Authors
|
14 |
+
* Loren Lugosch 2020
|
15 |
+
* Mirco Ravanelli 2020
|
16 |
+
"""
|
17 |
+
|
18 |
+
import sys
|
19 |
+
import torch
|
20 |
+
import speechbrain as sb
|
21 |
+
import logging
|
22 |
+
from hyperpyyaml import load_hyperpyyaml
|
23 |
+
from speechbrain.utils.distributed import run_on_main
|
24 |
+
|
25 |
+
logger = logging.getLogger(__name__)
|
26 |
+
|
27 |
+
# Define training procedure
|
28 |
+
|
29 |
+
|
30 |
+
class SLU(sb.Brain):
    """Brain subclass for the "direct" (speech -> semantics) recipe.

    Waveforms are encoded with the pretrained ASR model found in the
    hyperparameters (run without gradients), passed through the SLU encoder,
    and decoded by an attentional decoder trained with a sequence NLL loss.

    Attributes set outside ``__init__``
    -----------------------------------
    tokenizer
        Attached by the launching script; used to turn predicted token ids
        back into text.
    batch_count : int
        Number of training batches seen in the current stage.
    n_augment : int
        Number of stacked waveform copies (original + augmented) produced by
        the last training forward pass.
    """

    def _decodes_this_batch(self, stage):
        """Return True when beam-search hypotheses are produced for this batch.

        Decoding always happens outside training. During training it only
        happens when ``self.batch_count`` is a multiple of
        ``show_results_every``, a module-level setting assigned in the
        ``__main__`` section of this script.
        """
        return (
            stage != sb.Stage.TRAIN
            or self.batch_count % show_results_every == 0
        )

    def compute_forward(self, batch, stage):
        """Forward computations from the waveform batches to the output probabilities.

        Arguments
        ---------
        batch : PaddedBatch
            Must provide ``sig`` and ``tokens_bos``.
        stage : sb.Stage
            Current stage (TRAIN, VALID or TEST).

        Returns
        -------
        tuple
            ``(p_seq, wav_lens)`` on training batches that are not decoded,
            otherwise ``(p_seq, wav_lens, p_tokens)`` where ``p_tokens`` are
            the beam-search hypotheses.
        """
        batch = batch.to(self.device)
        wavs, wav_lens = batch.sig
        tokens_bos, _ = batch.tokens_bos

        # During training, stack the clean batch with one augmented copy per
        # entry of the augmentation pipeline.
        if stage == sb.Stage.TRAIN:
            wavs_aug_tot = [wavs]
            for augment in self.hparams.augment_pipeline:
                wavs_aug = augment(wavs, wav_lens)

                # Augmentation may change the signal length (e.g. speed
                # change): crop or zero-pad back to the original length so
                # that all copies can be concatenated along the batch axis.
                if wavs_aug.shape[1] > wavs.shape[1]:
                    wavs_aug = wavs_aug[:, 0 : wavs.shape[1]]
                else:
                    zero_sig = torch.zeros_like(wavs)
                    zero_sig[:, 0 : wavs_aug.shape[1]] = wavs_aug
                    wavs_aug = zero_sig

                wavs_aug_tot.append(wavs_aug)

            wavs = torch.cat(wavs_aug_tot, dim=0)
            # Remembered so compute_objectives can replicate the targets the
            # same number of times.
            self.n_augment = len(wavs_aug_tot)
            wav_lens = torch.cat([wav_lens] * self.n_augment)
            tokens_bos = torch.cat([tokens_bos] * self.n_augment)

        # ASR encoder forward pass (no gradients flow into the ASR model)
        with torch.no_grad():
            ASR_encoder_out = self.hparams.asr_model.encode_batch(
                wavs.detach(), wav_lens
            )

        # SLU forward pass
        encoder_out = self.hparams.slu_enc(ASR_encoder_out)
        e_in = self.hparams.output_emb(tokens_bos)
        h, _ = self.hparams.dec(e_in, encoder_out, wav_lens)

        # Output layer for seq2seq log-probabilities
        logits = self.hparams.seq_lin(h)
        p_seq = self.hparams.log_softmax(logits)

        # Beam search is skipped on most training batches
        if not self._decodes_this_batch(stage):
            return p_seq, wav_lens

        p_tokens, _ = self.hparams.beam_searcher(encoder_out, wav_lens)
        return p_seq, wav_lens, p_tokens

    def compute_objectives(self, predictions, batch, stage):
        """Computes the loss (NLL) given predictions and targets.

        Arguments
        ---------
        predictions : tuple
            Output of ``compute_forward`` for the same batch and stage.
        batch : PaddedBatch
            Must provide ``id``, ``tokens_eos`` and ``semantics``.
        stage : sb.Stage
            Current stage (TRAIN, VALID or TEST).

        Returns
        -------
        The sequence loss returned by ``hparams.seq_cost``.
        """
        decoded = self._decodes_this_batch(stage)
        p_seq = predictions[0]

        ids = batch.id
        tokens_eos, tokens_eos_lens = batch.tokens_eos

        # Replicate the targets exactly as compute_forward replicated the
        # inputs. (A former extra duplication guarded by "env_corrupt" was
        # removed: compute_forward never applies env_corrupt, so it could
        # only produce a target/prediction batch-size mismatch.)
        if stage == sb.Stage.TRAIN:
            tokens_eos = torch.cat([tokens_eos] * self.n_augment, dim=0)
            tokens_eos_lens = torch.cat(
                [tokens_eos_lens] * self.n_augment, dim=0
            )

        # (No ctc loss)
        loss = self.hparams.seq_cost(
            p_seq, tokens_eos, length=tokens_eos_lens
        )

        if decoded:
            predicted_tokens = predictions[2]

            # Decode token terms to words, using the tokenizer attached to
            # this object by the launching script.
            predicted_semantics = [
                self.tokenizer.decode_ids(utt_seq).split(" ")
                for utt_seq in predicted_tokens
            ]
            target_semantics = [wrd.split(" ") for wrd in batch.semantics]

            # Show hypothesis / reference pairs. In training the hypotheses
            # also cover the augmented copies; zip stops after the
            # un-augmented part, which lines up with the references.
            for hyp, ref in zip(predicted_semantics, target_semantics):
                print(" ".join(hyp).replace("|", ","))
                print(" ".join(ref).replace("|", ","))
                print("")

            if stage != sb.Stage.TRAIN:
                self.wer_metric.append(
                    ids, predicted_semantics, target_semantics
                )
                self.cer_metric.append(
                    ids, predicted_semantics, target_semantics
                )

        return loss

    def fit_batch(self, batch):
        """Train the parameters given a single batch in input.

        Returns the detached loss for this batch.
        """
        predictions = self.compute_forward(batch, sb.Stage.TRAIN)
        loss = self.compute_objectives(predictions, batch, sb.Stage.TRAIN)
        loss.backward()
        # Only step when the gradient check passes; gradients are cleared
        # either way.
        if self.check_gradients(loss):
            self.optimizer.step()
        self.optimizer.zero_grad()
        self.batch_count += 1
        return loss.detach()

    def evaluate_batch(self, batch, stage):
        """Computations needed for validation/test batches.

        Returns the detached loss for this batch.
        """
        predictions = self.compute_forward(batch, stage=stage)
        loss = self.compute_objectives(predictions, batch, stage=stage)
        return loss.detach()

    def on_stage_start(self, stage, epoch):
        """Gets called at the beginning of each stage.

        Resets the batch counter and, outside training, creates fresh
        CER/WER metric objects.
        """
        self.batch_count = 0

        if stage != sb.Stage.TRAIN:
            self.cer_metric = self.hparams.cer_computer()
            self.wer_metric = self.hparams.error_rate_computer()

    def on_stage_end(self, stage, stage_loss, epoch):
        """Gets called at the end of each stage.

        Arguments
        ---------
        stage : sb.Stage
            Stage that just finished.
        stage_loss : float
            Average loss over the stage.
        epoch : int
            Current epoch number.
        """
        # Compute/store important stats
        stage_stats = {"loss": stage_loss}
        if stage == sb.Stage.TRAIN:
            self.train_stats = stage_stats
        else:
            stage_stats["CER"] = self.cer_metric.summarize("error_rate")
            stage_stats["WER"] = self.wer_metric.summarize("error_rate")

        # Perform end-of-iteration things, like annealing, logging, etc.
        if stage == sb.Stage.VALID:
            old_lr, new_lr = self.hparams.lr_annealing(stage_stats["WER"])
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)
            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch, "lr": old_lr},
                train_stats=self.train_stats,
                valid_stats=stage_stats,
            )
            # Checkpoints are ranked by validation WER (lower is better)
            self.checkpointer.save_and_keep_only(
                meta={"WER": stage_stats["WER"]}, min_keys=["WER"],
            )
        elif stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stage_stats,
            )
            with open(self.hparams.wer_file, "w") as w:
                self.wer_metric.write_stats(w)
|
202 |
+
|
203 |
+
|
204 |
+
def dataio_prepare(hparams):
    """Create the train/valid/test datasets and attach their data pipelines.

    Arguments
    ---------
    hparams : dict
        Loaded hyperparameters. Keys read here: "data_folder", "csv_train",
        "csv_valid", "csv_test", "sorting", "tokenizer", "bos_index",
        "eos_index" and, when the training set is sorted, "dataloader_opts"
        (its "shuffle" entry is set to False in place).

    Returns
    -------
    tuple
        ``(train_data, valid_data, test_data, tokenizer)``.

    Raises
    ------
    NotImplementedError
        If "sorting" is not "random", "ascending" or "descending".
    """
    root = hparams["data_folder"]
    from_csv = sb.dataio.dataset.DynamicItemDataset.from_csv

    # 1. Load the CSV manifests.
    train_data = from_csv(
        csv_path=hparams["csv_train"], replacements={"data_root": root},
    )

    order = hparams["sorting"]
    if order not in ("random", "ascending", "descending"):
        raise NotImplementedError(
            "sorting must be random, ascending or descending"
        )
    if order != "random":
        sort_opts = {"sort_key": "duration"}
        if order == "descending":
            sort_opts["reverse"] = True
        train_data = train_data.filtered_sorted(**sort_opts)
        # Shuffling in the dataloader would undo the sorting.
        hparams["dataloader_opts"]["shuffle"] = False

    def load_sorted_by_duration(csv_key):
        # Validation and test sets are always sorted by ascending duration.
        data = from_csv(
            csv_path=hparams[csv_key], replacements={"data_root": root},
        )
        return data.filtered_sorted(sort_key="duration")

    valid_data = load_sorted_by_duration("csv_valid")
    test_data = load_sorted_by_duration("csv_test")

    datasets = [train_data, valid_data, test_data]
    tokenizer = hparams["tokenizer"]

    # 2. Audio pipeline: file path -> waveform.
    @sb.utils.data_pipeline.takes("wav")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav):
        return sb.dataio.dataio.read_audio(wav)

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Text pipeline: semantics string -> token ids, with and without the
    #    BOS / EOS markers. Items are yielded in the order declared in
    #    ``provides``.
    @sb.utils.data_pipeline.takes("semantics")
    @sb.utils.data_pipeline.provides(
        "semantics", "token_list", "tokens_bos", "tokens_eos", "tokens"
    )
    def text_pipeline(semantics):
        yield semantics
        ids = tokenizer.encode_as_ids(semantics)
        yield ids
        yield torch.LongTensor([hparams["bos_index"]] + (ids))
        yield torch.LongTensor(ids + [hparams["eos_index"]])
        yield torch.LongTensor(ids)

    sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline)

    # 4. Select which items the datasets output.
    sb.dataio.dataset.set_output_keys(
        datasets,
        ["id", "sig", "semantics", "tokens_bos", "tokens_eos", "tokens"],
    )
    return train_data, valid_data, test_data, tokenizer
|
282 |
+
|
283 |
+
|
284 |
+
if __name__ == "__main__":

    # Load hyperparameters file with command-line overrides
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Module-level setting read by the SLU class: during training, predicted
    # and target semantics are printed every N batches.
    show_results_every = 100

    # If distributed_launch=True then
    # create ddp_group with the right communication protocol
    sb.utils.distributed.ddp_init_group(run_opts)

    # Create experiment directory
    sb.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # Dataset prep (imported here rather than at the top of the file)
    from prepare import prepare_FSC  # noqa

    # Multi-GPU (DDP) safe data preparation: executed through run_on_main
    run_on_main(
        prepare_FSC,
        kwargs={
            "data_folder": hparams["data_folder"],
            "save_folder": hparams["output_folder"],
            "skip_prep": hparams["skip_prep"],
        },
    )

    # Create the dataset objects and their tokenization/encoding pipelines
    (train_set, valid_set, test_set, tokenizer,) = dataio_prepare(hparams)

    # Collect the files declared in the hparams "pretrainer" (through
    # run_on_main), then load them on the requested device.
    # NOTE(review): which objects the pretrainer restores is defined in the
    # YAML file, not here - confirm there.
    run_on_main(hparams["pretrainer"].collect_files)
    hparams["pretrainer"].load_collected(device=run_opts["device"])

    # Brain class initialization
    slu_brain = SLU(
        modules=hparams["modules"],
        opt_class=hparams["opt_class"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    # Attach the tokenizer to the trainer object
    slu_brain.tokenizer = tokenizer

    # Training (the same dataloader options are used for train and valid)
    slu_brain.fit(
        slu_brain.hparams.epoch_counter,
        train_set,
        valid_set,
        train_loader_kwargs=hparams["dataloader_opts"],
        valid_loader_kwargs=hparams["dataloader_opts"],
    )

    # Test: on_stage_end writes the WER statistics to this file
    slu_brain.hparams.wer_file = hparams["output_folder"] + "/wer_test.txt"
    slu_brain.evaluate(test_set, test_loader_kwargs=hparams["dataloader_opts"])
|
fluent-speech-commands/direct/results/BPE51/112011/train_log.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
epoch: 1, lr: 3.00e-04 - train loss: 7.70e-01 - valid loss: 7.10e-01, valid CER: 3.82e-01, valid WER: 1.01
|
2 |
+
epoch: 2, lr: 3.00e-04 - train loss: 7.10e-01 - valid loss: 7.08e-01, valid CER: 3.45e-01, valid WER: 8.11e-01
|
3 |
+
epoch: 3, lr: 3.00e-04 - train loss: 7.08e-01 - valid loss: 7.07e-01, valid CER: 2.30e-01, valid WER: 5.79e-01
|
4 |
+
epoch: 4, lr: 3.00e-04 - train loss: 7.08e-01 - valid loss: 7.07e-01, valid CER: 3.17e-01, valid WER: 7.26e-01
|
5 |
+
Epoch loaded: 4 - test loss: 7.05e-01, test CER: 2.37e-02, test WER: 6.91e-02
|
fluent-speech-commands/direct/results/BPE51/112011/valid.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
fluent-speech-commands/direct/results/BPE51/112011/wer_test.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
fluent-speech-commands/direct/train.py
ADDED
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Recipe for "direct" (speech -> semantics) SLU with ASR-based transfer learning.
|
4 |
+
|
5 |
+
We encode input waveforms into features using a model trained on LibriSpeech,
|
6 |
+
then feed the features into a seq2seq model to map them to semantics.
|
7 |
+
|
8 |
+
(Adapted from the LibriSpeech seq2seq ASR recipe written by Ju-Chieh Chou, Mirco Ravanelli, Abdel Heba, and Peter Plantinga.)
|
9 |
+
|
10 |
+
Run using:
|
11 |
+
> python train.py hparams/train.yaml
|
12 |
+
|
13 |
+
Authors
|
14 |
+
* Loren Lugosch 2020
|
15 |
+
* Mirco Ravanelli 2020
|
16 |
+
"""
|
17 |
+
|
18 |
+
import sys
|
19 |
+
import torch
|
20 |
+
import speechbrain as sb
|
21 |
+
import logging
|
22 |
+
from hyperpyyaml import load_hyperpyyaml
|
23 |
+
from speechbrain.utils.distributed import run_on_main
|
24 |
+
|
25 |
+
logger = logging.getLogger(__name__)
|
26 |
+
|
27 |
+
# Define training procedure
|
28 |
+
|
29 |
+
|
30 |
+
class SLU(sb.Brain):
    """Brain subclass for the "direct" (speech -> semantics) recipe.

    Waveforms are encoded with the pretrained ASR model found in the
    hyperparameters (run without gradients), passed through the SLU encoder,
    and decoded by an attentional decoder trained with a sequence NLL loss.

    Attributes set outside ``__init__``
    -----------------------------------
    tokenizer
        Attached by the launching script; used to turn predicted token ids
        back into text.
    batch_count : int
        Number of training batches seen in the current stage.
    n_augment : int
        Number of stacked waveform copies (original + augmented) produced by
        the last training forward pass.
    """

    def _decodes_this_batch(self, stage):
        """Return True when beam-search hypotheses are produced for this batch.

        Decoding always happens outside training. During training it only
        happens when ``self.batch_count`` is a multiple of
        ``show_results_every``, a module-level setting assigned in the
        ``__main__`` section of this script.
        """
        return (
            stage != sb.Stage.TRAIN
            or self.batch_count % show_results_every == 0
        )

    def compute_forward(self, batch, stage):
        """Forward computations from the waveform batches to the output probabilities.

        Arguments
        ---------
        batch : PaddedBatch
            Must provide ``sig`` and ``tokens_bos``.
        stage : sb.Stage
            Current stage (TRAIN, VALID or TEST).

        Returns
        -------
        tuple
            ``(p_seq, wav_lens)`` on training batches that are not decoded,
            otherwise ``(p_seq, wav_lens, p_tokens)`` where ``p_tokens`` are
            the beam-search hypotheses.
        """
        batch = batch.to(self.device)
        wavs, wav_lens = batch.sig
        tokens_bos, _ = batch.tokens_bos

        # During training, stack the clean batch with one augmented copy per
        # entry of the augmentation pipeline.
        if stage == sb.Stage.TRAIN:
            wavs_aug_tot = [wavs]
            for augment in self.hparams.augment_pipeline:
                wavs_aug = augment(wavs, wav_lens)

                # Augmentation may change the signal length (e.g. speed
                # change): crop or zero-pad back to the original length so
                # that all copies can be concatenated along the batch axis.
                if wavs_aug.shape[1] > wavs.shape[1]:
                    wavs_aug = wavs_aug[:, 0 : wavs.shape[1]]
                else:
                    zero_sig = torch.zeros_like(wavs)
                    zero_sig[:, 0 : wavs_aug.shape[1]] = wavs_aug
                    wavs_aug = zero_sig

                wavs_aug_tot.append(wavs_aug)

            wavs = torch.cat(wavs_aug_tot, dim=0)
            # Remembered so compute_objectives can replicate the targets the
            # same number of times.
            self.n_augment = len(wavs_aug_tot)
            wav_lens = torch.cat([wav_lens] * self.n_augment)
            tokens_bos = torch.cat([tokens_bos] * self.n_augment)

        # ASR encoder forward pass (no gradients flow into the ASR model)
        with torch.no_grad():
            ASR_encoder_out = self.hparams.asr_model.encode_batch(
                wavs.detach(), wav_lens
            )

        # SLU forward pass
        encoder_out = self.hparams.slu_enc(ASR_encoder_out)
        e_in = self.hparams.output_emb(tokens_bos)
        h, _ = self.hparams.dec(e_in, encoder_out, wav_lens)

        # Output layer for seq2seq log-probabilities
        logits = self.hparams.seq_lin(h)
        p_seq = self.hparams.log_softmax(logits)

        # Beam search is skipped on most training batches
        if not self._decodes_this_batch(stage):
            return p_seq, wav_lens

        p_tokens, _ = self.hparams.beam_searcher(encoder_out, wav_lens)
        return p_seq, wav_lens, p_tokens

    def compute_objectives(self, predictions, batch, stage):
        """Computes the loss (NLL) given predictions and targets.

        Arguments
        ---------
        predictions : tuple
            Output of ``compute_forward`` for the same batch and stage.
        batch : PaddedBatch
            Must provide ``id``, ``tokens_eos`` and ``semantics``.
        stage : sb.Stage
            Current stage (TRAIN, VALID or TEST).

        Returns
        -------
        The sequence loss returned by ``hparams.seq_cost``.
        """
        decoded = self._decodes_this_batch(stage)
        p_seq = predictions[0]

        ids = batch.id
        tokens_eos, tokens_eos_lens = batch.tokens_eos

        # Replicate the targets exactly as compute_forward replicated the
        # inputs. (A former extra duplication guarded by "env_corrupt" was
        # removed: compute_forward never applies env_corrupt, so it could
        # only produce a target/prediction batch-size mismatch.)
        if stage == sb.Stage.TRAIN:
            tokens_eos = torch.cat([tokens_eos] * self.n_augment, dim=0)
            tokens_eos_lens = torch.cat(
                [tokens_eos_lens] * self.n_augment, dim=0
            )

        # (No ctc loss)
        loss = self.hparams.seq_cost(
            p_seq, tokens_eos, length=tokens_eos_lens
        )

        if decoded:
            predicted_tokens = predictions[2]

            # Decode token terms to words, using the tokenizer attached to
            # this object by the launching script.
            predicted_semantics = [
                self.tokenizer.decode_ids(utt_seq).split(" ")
                for utt_seq in predicted_tokens
            ]
            target_semantics = [wrd.split(" ") for wrd in batch.semantics]

            # Show hypothesis / reference pairs. In training the hypotheses
            # also cover the augmented copies; zip stops after the
            # un-augmented part, which lines up with the references.
            for hyp, ref in zip(predicted_semantics, target_semantics):
                print(" ".join(hyp).replace("|", ","))
                print(" ".join(ref).replace("|", ","))
                print("")

            if stage != sb.Stage.TRAIN:
                self.wer_metric.append(
                    ids, predicted_semantics, target_semantics
                )
                self.cer_metric.append(
                    ids, predicted_semantics, target_semantics
                )

        return loss

    def fit_batch(self, batch):
        """Train the parameters given a single batch in input.

        Returns the detached loss for this batch.
        """
        predictions = self.compute_forward(batch, sb.Stage.TRAIN)
        loss = self.compute_objectives(predictions, batch, sb.Stage.TRAIN)
        loss.backward()
        # Only step when the gradient check passes; gradients are cleared
        # either way.
        if self.check_gradients(loss):
            self.optimizer.step()
        self.optimizer.zero_grad()
        self.batch_count += 1
        return loss.detach()

    def evaluate_batch(self, batch, stage):
        """Computations needed for validation/test batches.

        Returns the detached loss for this batch.
        """
        predictions = self.compute_forward(batch, stage=stage)
        loss = self.compute_objectives(predictions, batch, stage=stage)
        return loss.detach()

    def on_stage_start(self, stage, epoch):
        """Gets called at the beginning of each stage.

        Resets the batch counter and, outside training, creates fresh
        CER/WER metric objects.
        """
        self.batch_count = 0

        if stage != sb.Stage.TRAIN:
            self.cer_metric = self.hparams.cer_computer()
            self.wer_metric = self.hparams.error_rate_computer()

    def on_stage_end(self, stage, stage_loss, epoch):
        """Gets called at the end of each stage.

        Arguments
        ---------
        stage : sb.Stage
            Stage that just finished.
        stage_loss : float
            Average loss over the stage.
        epoch : int
            Current epoch number.
        """
        # Compute/store important stats
        stage_stats = {"loss": stage_loss}
        if stage == sb.Stage.TRAIN:
            self.train_stats = stage_stats
        else:
            stage_stats["CER"] = self.cer_metric.summarize("error_rate")
            stage_stats["WER"] = self.wer_metric.summarize("error_rate")

        # Perform end-of-iteration things, like annealing, logging, etc.
        if stage == sb.Stage.VALID:
            old_lr, new_lr = self.hparams.lr_annealing(stage_stats["WER"])
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)
            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch, "lr": old_lr},
                train_stats=self.train_stats,
                valid_stats=stage_stats,
            )
            # Checkpoints are ranked by validation WER (lower is better)
            self.checkpointer.save_and_keep_only(
                meta={"WER": stage_stats["WER"]}, min_keys=["WER"],
            )
        elif stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stage_stats,
            )
            with open(self.hparams.wer_file, "w") as w:
                self.wer_metric.write_stats(w)
|
202 |
+
|
203 |
+
|
204 |
+
def dataio_prepare(hparams):
    """Create the train/valid/test datasets and attach their data pipelines.

    Arguments
    ---------
    hparams : dict
        Loaded hyperparameters. Keys read here: "data_folder", "csv_train",
        "csv_valid", "csv_test", "sorting", "tokenizer", "bos_index",
        "eos_index" and, when the training set is sorted, "dataloader_opts"
        (its "shuffle" entry is set to False in place).

    Returns
    -------
    tuple
        ``(train_data, valid_data, test_data, tokenizer)``.

    Raises
    ------
    NotImplementedError
        If "sorting" is not "random", "ascending" or "descending".
    """
    root = hparams["data_folder"]
    from_csv = sb.dataio.dataset.DynamicItemDataset.from_csv

    # 1. Load the CSV manifests.
    train_data = from_csv(
        csv_path=hparams["csv_train"], replacements={"data_root": root},
    )

    order = hparams["sorting"]
    if order not in ("random", "ascending", "descending"):
        raise NotImplementedError(
            "sorting must be random, ascending or descending"
        )
    if order != "random":
        sort_opts = {"sort_key": "duration"}
        if order == "descending":
            sort_opts["reverse"] = True
        train_data = train_data.filtered_sorted(**sort_opts)
        # Shuffling in the dataloader would undo the sorting.
        hparams["dataloader_opts"]["shuffle"] = False

    def load_sorted_by_duration(csv_key):
        # Validation and test sets are always sorted by ascending duration.
        data = from_csv(
            csv_path=hparams[csv_key], replacements={"data_root": root},
        )
        return data.filtered_sorted(sort_key="duration")

    valid_data = load_sorted_by_duration("csv_valid")
    test_data = load_sorted_by_duration("csv_test")

    datasets = [train_data, valid_data, test_data]
    tokenizer = hparams["tokenizer"]

    # 2. Audio pipeline: file path -> waveform.
    @sb.utils.data_pipeline.takes("wav")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav):
        return sb.dataio.dataio.read_audio(wav)

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Text pipeline: semantics string -> token ids, with and without the
    #    BOS / EOS markers. Items are yielded in the order declared in
    #    ``provides``.
    @sb.utils.data_pipeline.takes("semantics")
    @sb.utils.data_pipeline.provides(
        "semantics", "token_list", "tokens_bos", "tokens_eos", "tokens"
    )
    def text_pipeline(semantics):
        yield semantics
        ids = tokenizer.encode_as_ids(semantics)
        yield ids
        yield torch.LongTensor([hparams["bos_index"]] + (ids))
        yield torch.LongTensor(ids + [hparams["eos_index"]])
        yield torch.LongTensor(ids)

    sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline)

    # 4. Select which items the datasets output.
    sb.dataio.dataset.set_output_keys(
        datasets,
        ["id", "sig", "semantics", "tokens_bos", "tokens_eos", "tokens"],
    )
    return train_data, valid_data, test_data, tokenizer
|
282 |
+
|
283 |
+
|
284 |
+
if __name__ == "__main__":

    # Load hyperparameters file with command-line overrides
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Module-level setting read by the SLU class: during training, predicted
    # and target semantics are printed every N batches.
    show_results_every = 100

    # If distributed_launch=True then
    # create ddp_group with the right communication protocol
    sb.utils.distributed.ddp_init_group(run_opts)

    # Create experiment directory
    sb.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # Dataset prep (imported here rather than at the top of the file)
    from prepare import prepare_FSC  # noqa

    # Multi-GPU (DDP) safe data preparation: executed through run_on_main
    run_on_main(
        prepare_FSC,
        kwargs={
            "data_folder": hparams["data_folder"],
            "save_folder": hparams["output_folder"],
            "skip_prep": hparams["skip_prep"],
        },
    )

    # Create the dataset objects and their tokenization/encoding pipelines
    (train_set, valid_set, test_set, tokenizer,) = dataio_prepare(hparams)

    # Collect the files declared in the hparams "pretrainer" (through
    # run_on_main), then load them on the requested device.
    # NOTE(review): which objects the pretrainer restores is defined in the
    # YAML file, not here - confirm there.
    run_on_main(hparams["pretrainer"].collect_files)
    hparams["pretrainer"].load_collected(device=run_opts["device"])

    # Brain class initialization
    slu_brain = SLU(
        modules=hparams["modules"],
        opt_class=hparams["opt_class"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    # Attach the tokenizer to the trainer object
    slu_brain.tokenizer = tokenizer

    # Training (the same dataloader options are used for train and valid)
    slu_brain.fit(
        slu_brain.hparams.epoch_counter,
        train_set,
        valid_set,
        train_loader_kwargs=hparams["dataloader_opts"],
        valid_loader_kwargs=hparams["dataloader_opts"],
    )

    # Test: on_stage_end writes the WER statistics to this file
    slu_brain.hparams.wer_file = hparams["output_folder"] + "/wer_test.txt"
    slu_brain.evaluate(test_set, test_loader_kwargs=hparams["dataloader_opts"])
|
fluent-speech-commands/extra_requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
pandas
|
fluent-speech-commands/prepare.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import logging
|
3 |
+
from speechbrain.dataio.dataio import read_audio
|
4 |
+
|
5 |
+
try:
|
6 |
+
import pandas as pd
|
7 |
+
except ImportError:
|
8 |
+
err_msg = (
|
9 |
+
"The optional dependency pandas must be installed to run this recipe.\n"
|
10 |
+
)
|
11 |
+
err_msg += "Install using `pip install pandas`.\n"
|
12 |
+
raise ImportError(err_msg)
|
13 |
+
|
14 |
+
logger = logging.getLogger(__name__)
|
15 |
+
|
16 |
+
|
17 |
+
def prepare_FSC(data_folder, save_folder, skip_prep=False, sample_rate=16000):
    """
    This function prepares the Fluent Speech Commands dataset.

    For each of the "train", "valid" and "test" splits it reads
    `<data_folder>/data/<split>_data.csv`, loads every referenced audio file
    to measure its duration, and writes a manifest `<save_folder>/<split>.csv`
    with the columns ID, duration, wav, spk_id, semantics and transcript.
    A split whose manifest already exists is not regenerated.

    data_folder : path to dataset.
    save_folder: folder where the manifest files will be stored.
    skip_prep: If True, skip data preparation
    sample_rate: number of samples per second used to convert the signal
        length into a duration in seconds (defaults to 16000).

    """
    if skip_prep:
        return

    splits = ["train", "valid", "test"]
    ID_start = 0  # needed to have a unique ID for each audio
    for split in splits:
        new_filename = os.path.join(save_folder, split) + ".csv"
        if os.path.exists(new_filename):
            # Keep the running offset in step with the manifests already on
            # disk; otherwise the splits generated after this one would
            # reuse IDs that the existing manifest already contains.
            ID_start += len(pd.read_csv(new_filename))
            continue
        logger.info("Preparing %s...", new_filename)

        df = pd.read_csv(os.path.join(data_folder, "data", split) + "_data.csv")

        ID = []
        duration = []
        wav = []
        spk_id = []
        semantics = []
        transcript = []

        rows = zip(
            df["path"],
            df["speakerId"],
            df["transcription"],
            df["action"],
            df["object"],
            df["location"],
        )
        for offset, (rel_path, speaker, text, action, obj, location) in enumerate(
            rows
        ):
            wav_path = os.path.join(data_folder, rel_path)
            # The audio is loaded only to measure its length in samples.
            signal = read_audio(wav_path)

            ID.append(ID_start + offset)
            duration.append(signal.shape[0] / sample_rate)
            wav.append(wav_path)
            spk_id.append(speaker)
            transcript.append(text)

            # Fields are separated with "|" rather than ",".
            # NOTE(review): the opening `"action:" "` looks like a misplaced
            # colon, but this string is written verbatim to the manifest, so
            # it is kept byte-for-byte -- confirm with the consumers of the
            # `semantics` column before changing it.
            semantics.append(
                '{"action:" "'
                + action
                + '"| "object": "'
                + obj
                + '"| "location": "'
                + location
                + '"}'
            )

        new_df = pd.DataFrame(
            {
                "ID": ID,
                "duration": duration,
                "wav": wav,
                "spk_id": spk_id,
                "semantics": semantics,
                "transcript": transcript,
            }
        )
        new_df.to_csv(new_filename, index=False)
        ID_start += len(df)
|
pretrained_models/EncoderDecoderASR--5348169877143464308/asr.ckpt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/root/.cache/huggingface/hub/83e944252a91fe1d0883daa1e87077df4d64c35fffb45e22fff924faace4a59c.7fdf4aabd8400c69a6228ccc17c83b7a8ebf34c5d76f23497b7cf0d7a1baaea3
|
pretrained_models/EncoderDecoderASR--5348169877143464308/hyperparams.yaml
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/root/.cache/huggingface/hub/7aac72d39109ee19b4004d94239c2924caf33de6d85b0aff9296d844982210cb.d14310ea63844fb38520a592ea3a92e4f131b5f4683f8fa08e27b1e403c92293
|
pretrained_models/EncoderDecoderASR--5348169877143464308/lm.ckpt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/root/.cache/huggingface/hub/651df066b5d0b2efef7208f51df93d3a0a65bedc3a3a2500cd7b8faf064e631e.b438b9af3f549a23c4458bb066c11cd51dc1cfe9bfef30d3eb66b472e93b1e8c
|
pretrained_models/EncoderDecoderASR--5348169877143464308/normalizer.ckpt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/root/.cache/huggingface/hub/e733854cce680bcb58ce4b86bacb3cab5222880933b7b85ab17758aa5b10e9da.587fb748e80e719ed5721d5e0098c5feb2a901017135271ce2b2c6baea7e9f6e
|
pretrained_models/EncoderDecoderASR--5348169877143464308/tokenizer.ckpt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/root/.cache/huggingface/hub/f39208eba495042a59a8404b5703ca08a39a85e4d2bf707e197b90a3323f92ab.cd7af7ea8cfcfbf0f6dd61514c361972eb82b3b76f12b0e9ee0b371f36fdc078
|
pretrained_models/EndToEndSLU-7990244956535603082/hyperparams.yaml
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/root/.cache/huggingface/hub/a095f802a6283ecd636ffd0c0ec2d2dc335dcccfb395f5bc8d48fdb0ed34ca62.ca16cf2255d592246550b1dcfb9ac24800ec38cb8589cfd07e9db7558562037f
|
pretrained_models/EndToEndSLU-7990244956535603082/model.ckpt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/root/.cache/huggingface/hub/f01892eb014043257a527de1a0ebf610a17895a2b4c13d7e7e719c37231d08e5.d625fbcb8a2387e5d81fe6ff0d868125c7dcbc1b2245206ea152cccfb98a44fe
|
pretrained_models/EndToEndSLU-7990244956535603082/tokenizer.ckpt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/root/.cache/huggingface/hub/2e5567fd31be3518b2a174a53d89d98df57247924ea50e69bbdb39cc4f8a76e5.8d38059f23fb577abadc9e131f1b67dd9662567eb032fdb8837e33a90feb47d4
|