zrjin committed on
Commit
a0160d8
1 Parent(s): b6048cb

Upload 36 files

Browse files
whamr_only/51epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5acd362a3cb79f46930aee1277b508d21c3827dbac6439018690f6c03676e3e4
3
+ size 172367337
whamr_only/52epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d2bb10a024f9bf6b8ce278ec89504f31077831f8131552a050863d3e41a2c12
3
+ size 172367337
whamr_only/53epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22bb11d3c7e6407b83bab483bab65d33f726e90ec3054f72e42d9ca5e6d1d05b
3
+ size 172367337
whamr_only/54epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55751dfbafe484a89b4932998a8a562f9a38652f98df6ea1d1ad4ff8f10a4d92
3
+ size 172367337
whamr_only/55epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80ff99937fd1827a3d4477f8a3416ccbe8956f6ef8e1e694c68d1f2f7402bd0d
3
+ size 172367337
whamr_only/56epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2193e1c399a35bba42860e80432ca78426092d4ca9a3312098db7193dd730252
3
+ size 172367337
whamr_only/57epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1175ff22c0a9f3523af30e2dbe35ccfeefa07a709170089059898c6389b083ff
3
+ size 172367337
whamr_only/58epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d528953b7b2b1c8791279d243744450414be15e61123fd2b9216083c2bbb8213
3
+ size 172367337
whamr_only/59epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fcfc4f27cac05ed4176327b86ae49d5c65096332481a8fc18fee7051a1d5858
3
+ size 172367337
whamr_only/60epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4f732edcb7d3085b74db6df2cc49208ca2424933e4c06f5f2823b737fbf4964
3
+ size 172367337
whamr_only/RESULTS.md ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_asr_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Mon Mar 4 20:54:29 CST 2024`
5
+ - python version: `3.9.18 (main, Sep 11 2023, 13:41:44) [GCC 11.2.0]`
6
+ - espnet version: `espnet 202308`
7
+ - pytorch version: `pytorch 1.12.1+cu116`
8
+ - Git hash: `884659f9ee95374811015381c976fa3b4f6e01db`
9
+ - Commit date: `Thu Nov 23 00:23:29 2023 +0800`
10
+
11
+ ## exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr_only
12
+ ### WER
13
+
14
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
15
+ |---|---|---|---|---|---|---|---|---|
16
+ |decode_sot_asr_model_valid.acc.best/dev_2spk|3315|226216|14.7|50.1|35.2|3.9|89.2|100.0|
17
+ |decode_sot_asr_model_valid.acc.best/dev_3spk|2059|209679|11.3|38.3|50.4|1.2|89.8|100.0|
18
+ |decode_sot_asr_model_valid.acc.best/dev_4spk|1467|200029|9.7|30.3|60.0|0.6|90.9|100.0|
19
+ |decode_sot_asr_model_valid.acc.best/test-clean_2spk|4570|301042|13.6|51.0|35.4|3.8|90.2|100.0|
20
+ |decode_sot_asr_model_valid.acc.best/test-clean_3spk|2072|212871|10.7|38.7|50.6|1.1|90.4|100.0|
21
+ |decode_sot_asr_model_valid.acc.best/test-clean_4spk|1326|185394|9.3|29.4|61.3|0.6|91.3|100.0|
22
+ |decode_sot_asr_model_valid.acc.best/test-other_2spk|4663|336490|12.4|50.7|36.9|3.0|90.5|100.0|
23
+ |decode_sot_asr_model_valid.acc.best/test-other_3spk|2453|266074|9.6|38.5|51.8|0.9|91.2|100.0|
24
+ |decode_sot_asr_model_valid.acc.best/test-other_4spk|1795|259138|8.4|29.5|62.1|0.5|92.1|100.0|
25
+ |decode_sot_asr_model_valid.acc.best/test_clean_2spk_kaldi_fmt|2180|178761|13.8|44.8|41.4|2.3|88.5|100.0|
26
+ |decode_sot_asr_model_valid.acc.best/test_other_2spk_kaldi_fmt|2363|205496|12.6|44.9|42.5|1.9|89.3|100.0|
27
+ |decode_sot_asr_model_valid.acc.best/tt_mix_clean_reverb_max_16k|3000|3000|0.0|100.0|0.0|3139.9|3239.9|100.0|
28
+
29
+ ### CER
30
+
31
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
32
+ |---|---|---|---|---|---|---|---|---|
33
+ |decode_sot_asr_model_valid.acc.best/dev_2spk|3315|1230801|38.3|22.6|39.0|7.4|69.1|100.0|
34
+ |decode_sot_asr_model_valid.acc.best/dev_3spk|2059|1140428|31.0|16.9|52.1|2.9|71.8|100.0|
35
+ |decode_sot_asr_model_valid.acc.best/dev_4spk|1467|1087409|26.8|12.5|60.7|1.5|74.7|100.0|
36
+ |decode_sot_asr_model_valid.acc.best/test-clean_2spk|4570|1550429|38.0|23.4|38.6|7.8|69.8|100.0|
37
+ |decode_sot_asr_model_valid.acc.best/test-clean_3spk|2072|1084475|31.3|17.8|50.9|3.2|71.9|100.0|
38
+ |decode_sot_asr_model_valid.acc.best/test-clean_4spk|1326|938467|26.8|12.7|60.5|1.5|74.7|100.0|
39
+ |decode_sot_asr_model_valid.acc.best/test-other_2spk|4663|1742136|37.1|23.4|39.5|7.0|69.9|100.0|
40
+ |decode_sot_asr_model_valid.acc.best/test-other_3spk|2453|1381987|30.2|17.3|52.4|2.8|72.5|100.0|
41
+ |decode_sot_asr_model_valid.acc.best/test-other_4spk|1795|1346646|25.9|12.4|61.7|1.3|75.5|100.0|
42
+ |decode_sot_asr_model_valid.acc.best/test_clean_2spk_kaldi_fmt|2180|921344|36.4|20.0|43.7|5.1|68.7|100.0|
43
+ |decode_sot_asr_model_valid.acc.best/test_other_2spk_kaldi_fmt|2363|1064868|35.5|20.1|44.3|4.6|69.1|100.0|
44
+ |decode_sot_asr_model_valid.acc.best/tt_mix_clean_reverb_max_16k|3000|143026|16.2|83.6|0.1|294.7|378.4|100.0|
45
+
46
+ ### TER
47
+
48
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
49
+ |---|---|---|---|---|---|---|---|---|
50
+ ## exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr_only/decode_sot_asr_model_valid.acc.best
51
+ ### WER
52
+
53
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
54
+ |---|---|---|---|---|---|---|---|---|
55
+ |org/dev_2spk_kaldi_fmt|1606|135101|15.0|43.3|41.8|2.5|87.5|100.0|
56
+
57
+ ### CER
58
+
59
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
60
+ |---|---|---|---|---|---|---|---|---|
61
+ |org/dev_2spk_kaldi_fmt|1606|735694|36.4|19.0|44.5|4.7|68.2|100.0|
62
+
63
+ ### TER
64
+
65
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
66
+ |---|---|---|---|---|---|---|---|---|
whamr_only/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:164e2e87550cbb3ed02f9b49dcd7ff27a0c5af8590e73476b1bd69587e13c487
3
+ size 516972510
whamr_only/config.yaml ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_sot_asr_conformer.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr_only
9
+ ngpu: 1
10
+ seed: 0
11
+ num_workers: 16
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: 2
16
+ dist_rank: 0
17
+ local_rank: 0
18
+ dist_master_addr: localhost
19
+ dist_master_port: 54011
20
+ dist_launcher: null
21
+ multiprocessing_distributed: true
22
+ unused_parameters: false
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: true
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 60
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - valid
40
+ - acc
41
+ - max
42
+ keep_nbest_models: 10
43
+ nbest_averaging_interval: 0
44
+ grad_clip: 5.0
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 4
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: null
53
+ use_matplotlib: true
54
+ use_tensorboard: true
55
+ create_graph_in_tensorboard: false
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ pretrain_path: null
64
+ init_param:
65
+ - /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth
66
+ ignore_init_mismatch: false
67
+ freeze_param: []
68
+ num_iters_per_epoch: null
69
+ batch_size: 20
70
+ valid_batch_size: null
71
+ batch_bins: 10000000
72
+ valid_batch_bins: null
73
+ train_shape_file:
74
+ - exp/asr_stats_raw_en_char/train/speech_shape
75
+ - exp/asr_stats_raw_en_char/train/text_shape.char
76
+ valid_shape_file:
77
+ - exp/asr_stats_raw_en_char/valid/speech_shape
78
+ - exp/asr_stats_raw_en_char/valid/text_shape.char
79
+ batch_type: numel
80
+ valid_batch_type: null
81
+ fold_length:
82
+ - 80000
83
+ - 150
84
+ sort_in_batch: descending
85
+ shuffle_within_batch: false
86
+ sort_batch: descending
87
+ multiple_iterator: false
88
+ chunk_length: 500
89
+ chunk_shift_ratio: 0.5
90
+ num_cache_chunks: 1024
91
+ chunk_excluded_key_prefixes: []
92
+ train_data_path_and_name_and_type:
93
+ - - dump/raw/tr_mix_clean_reverb_max_16k_sp/wav.scp
94
+ - speech
95
+ - kaldi_ark
96
+ - - dump/raw/tr_mix_clean_reverb_max_16k_sp/text
97
+ - text
98
+ - text
99
+ valid_data_path_and_name_and_type:
100
+ - - dump/raw/cv_mix_clean_reverb_max_16k/wav.scp
101
+ - speech
102
+ - kaldi_ark
103
+ - - dump/raw/cv_mix_clean_reverb_max_16k/text
104
+ - text
105
+ - text
106
+ allow_variable_data_keys: false
107
+ max_cache_size: 0.0
108
+ max_cache_fd: 32
109
+ valid_max_cache_size: null
110
+ exclude_weight_decay: false
111
+ exclude_weight_decay_conf: {}
112
+ optim: adam
113
+ optim_conf:
114
+ lr: 0.002
115
+ weight_decay: 1.0e-06
116
+ scheduler: warmuplr
117
+ scheduler_conf:
118
+ warmup_steps: 20000
119
+ token_list:
120
+ - <blank>
121
+ - <unk>
122
+ - <sc>
123
+ - <space>
124
+ - E
125
+ - T
126
+ - A
127
+ - O
128
+ - N
129
+ - I
130
+ - H
131
+ - S
132
+ - R
133
+ - D
134
+ - L
135
+ - U
136
+ - M
137
+ - C
138
+ - W
139
+ - F
140
+ - G
141
+ - Y
142
+ - P
143
+ - B
144
+ - V
145
+ - K
146
+ - ''''
147
+ - X
148
+ - J
149
+ - Q
150
+ - Z
151
+ - <sos/eos>
152
+ init: null
153
+ input_size: null
154
+ ctc_conf:
155
+ dropout_rate: 0.0
156
+ ctc_type: builtin
157
+ reduce: true
158
+ ignore_nan_grad: null
159
+ zero_infinity: true
160
+ joint_net_conf: null
161
+ use_preprocessor: true
162
+ token_type: char
163
+ bpemodel: null
164
+ non_linguistic_symbols: null
165
+ cleaner: null
166
+ g2p: null
167
+ speech_volume_normalize: null
168
+ rir_scp: null
169
+ rir_apply_prob: 1.0
170
+ noise_scp: null
171
+ noise_apply_prob: 1.0
172
+ noise_db_range: '13_15'
173
+ short_noise_thres: 0.5
174
+ aux_ctc_tasks: []
175
+ frontend: default
176
+ frontend_conf:
177
+ fs: 16k
178
+ specaug: null
179
+ specaug_conf: {}
180
+ normalize: global_mvn
181
+ normalize_conf:
182
+ stats_file: exp/asr_stats_raw_en_char/train/feats_stats.npz
183
+ model: espnet
184
+ model_conf:
185
+ ctc_weight: 0.0
186
+ lsm_weight: 0.1
187
+ length_normalized_loss: false
188
+ preencoder: null
189
+ preencoder_conf: {}
190
+ encoder: conformer
191
+ encoder_conf:
192
+ output_size: 256
193
+ attention_heads: 4
194
+ linear_units: 2048
195
+ num_blocks: 12
196
+ dropout_rate: 0.1
197
+ positional_dropout_rate: 0.1
198
+ attention_dropout_rate: 0.1
199
+ input_layer: conv2d
200
+ normalize_before: true
201
+ macaron_style: true
202
+ rel_pos_type: latest
203
+ pos_enc_layer_type: rel_pos
204
+ selfattention_layer_type: rel_selfattn
205
+ activation_type: swish
206
+ use_cnn_module: true
207
+ cnn_module_kernel: 31
208
+ postencoder: null
209
+ postencoder_conf: {}
210
+ decoder: transformer
211
+ decoder_conf:
212
+ attention_heads: 4
213
+ linear_units: 2048
214
+ num_blocks: 6
215
+ dropout_rate: 0.1
216
+ positional_dropout_rate: 0.1
217
+ self_attention_dropout_rate: 0.1
218
+ src_attention_dropout_rate: 0.1
219
+ preprocessor: multi
220
+ preprocessor_conf:
221
+ speaker_change_symbol:
222
+ - <sc>
223
+ required:
224
+ - output_dir
225
+ - token_list
226
+ version: '202308'
227
+ distributed: true
whamr_only/images/acc.png ADDED
whamr_only/images/backward_time.png ADDED
whamr_only/images/cer.png ADDED
whamr_only/images/clip.png ADDED
whamr_only/images/forward_time.png ADDED
whamr_only/images/gpu_max_cached_mem_GB.png ADDED
whamr_only/images/grad_norm.png ADDED
whamr_only/images/iter_time.png ADDED
whamr_only/images/loss.png ADDED
whamr_only/images/loss_att.png ADDED
whamr_only/images/loss_scale.png ADDED
whamr_only/images/optim0_lr0.png ADDED
whamr_only/images/optim_step_time.png ADDED
whamr_only/images/train_time.png ADDED
whamr_only/images/wer.png ADDED
whamr_only/latest.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4f732edcb7d3085b74db6df2cc49208ca2424933e4c06f5f2823b737fbf4964
3
+ size 172367337
whamr_only/run.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./asr.sh --lang en --audio_format flac.ark --stage 6 --stop_stage 15 --speed_perturb_factors '0.9 1.0 1.1' --feats_type raw --token_type char --sot_asr true --max_wav_duration 50 --speed_perturb_factors '' --feats_normalize global_mvn --use_lm false --pretrained_model /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --asr_config conf/tuning/train_sot_asr_conformer.yaml --lm_config conf/tuning/train_lm_transformer.yaml --inference_config conf/tuning/decode_sot.yaml --train_set tr_mix_clean_reverb_max_16k --valid_set cv_mix_clean_reverb_max_16k --test_sets tt_mix_clean_reverb_max_16k --ngpu 2 --asr_tag train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr_only --lm_train_text data/local/other_text/text --bpe_train_text data/tr_mix_clean_reverb_max_16k/text --stage 11 "$@"; exit $?
whamr_only/tensorboard/train/events.out.tfevents.1708415307.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.946846.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc726117ceea4f047b43a54ccb4085a699b3c7c06e08da30e5e4c718f8b77098
3
+ size 810259841
whamr_only/tensorboard/valid/events.out.tfevents.1708415308.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.946846.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36576199cf0a9a358be6ff8cbeb97275d530ce3896f3bb091ea7398f1faa14b9
3
+ size 16918
whamr_only/train.log ADDED
The diff for this file is too large to render. See raw diff
 
whamr_only/valid.acc.ave.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b28c5f6d1c3ff5b939be7b4169b60eeb2f33e37c2ed4190b9c632f2d75e300cf
3
+ size 172358249
whamr_only/valid.acc.ave_10best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b28c5f6d1c3ff5b939be7b4169b60eeb2f33e37c2ed4190b9c632f2d75e300cf
3
+ size 172358249
whamr_only/valid.acc.best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4f732edcb7d3085b74db6df2cc49208ca2424933e4c06f5f2823b737fbf4964
3
+ size 172367337