ssiidd committed
Commit 7b487d7 • 1 Parent(s): 01c26e3

Add FSC code

Files changed (5)
  1. README.md +1 -1
  2. app.py +17 -78
  3. audio_fsc.wav +0 -0
  4. fsc/config.yaml +355 -0
  5. fsc/valid.acc.ave_5best.pth +3 -0
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: ESPnet2 TTS
+title: ESPnet2 SLU
 emoji: 📈
 colorFrom: green
 colorTo: green
app.py CHANGED
@@ -10,95 +10,34 @@ from espnet2.bin.asr_inference import Speech2Text
 # tagen = 'kan-bayashi/ljspeech_vits'
 # vocoder_tagen = "none"
 
-speech2text = Speech2Text.from_pretrained(
+speech2text_slurp = Speech2Text.from_pretrained(
     asr_train_config="slurp/config.yaml",
     asr_model_file="slurp/valid.acc.ave_10best.pth",
     # Decoding parameters are not included in the model file
     nbest=1
 )
-# Confirm the sampling rate is equal to that of the training corpus.
-# If not, you need to resample the audio data before inputting to speech2text
-# speech, rate = soundfile.read("audio--1504190171-headset.flac")
-# nbests = speech2text(speech)
 
-# text, *_ = nbests[0]
-# print(text)
-# exit()
-
-# text2speechen = Text2Speech.from_pretrained(
-#     model_tag=str_or_none(tagen),
-#     vocoder_tag=str_or_none(vocoder_tagen),
-#     device="cpu",
-#     # Only for Tacotron 2 & Transformer
-#     threshold=0.5,
-#     # Only for Tacotron 2
-#     minlenratio=0.0,
-#     maxlenratio=10.0,
-#     use_att_constraint=False,
-#     backward_window=1,
-#     forward_window=3,
-#     # Only for FastSpeech & FastSpeech2 & VITS
-#     speed_control_alpha=1.0,
-#     # Only for VITS
-#     noise_scale=0.333,
-#     noise_scale_dur=0.333,
-# )
-
-
-# tagjp = 'kan-bayashi/jsut_full_band_vits_prosody'
-# vocoder_tagjp = 'none'
-
-# text2speechjp = Text2Speech.from_pretrained(
-#     model_tag=str_or_none(tagjp),
-#     vocoder_tag=str_or_none(vocoder_tagjp),
-#     device="cpu",
-#     # Only for Tacotron 2 & Transformer
-#     threshold=0.5,
-#     # Only for Tacotron 2
-#     minlenratio=0.0,
-#     maxlenratio=10.0,
-#     use_att_constraint=False,
-#     backward_window=1,
-#     forward_window=3,
-#     # Only for FastSpeech & FastSpeech2 & VITS
-#     speed_control_alpha=1.0,
-#     # Only for VITS
-#     noise_scale=0.333,
-#     noise_scale_dur=0.333,
-# )
-
-# tagch = 'kan-bayashi/csmsc_full_band_vits'
-# vocoder_tagch = "none"
-
-# text2speechch = Text2Speech.from_pretrained(
-#     model_tag=str_or_none(tagch),
-#     vocoder_tag=str_or_none(vocoder_tagch),
-#     device="cpu",
-#     # Only for Tacotron 2 & Transformer
-#     threshold=0.5,
-#     # Only for Tacotron 2
-#     minlenratio=0.0,
-#     maxlenratio=10.0,
-#     use_att_constraint=False,
-#     backward_window=1,
-#     forward_window=3,
-#     # Only for FastSpeech & FastSpeech2 & VITS
-#     speed_control_alpha=1.0,
-#     # Only for VITS
-#     noise_scale=0.333,
-#     noise_scale_dur=0.333,
-# )
+speech2text_fsc = Speech2Text.from_pretrained(
+    asr_train_config="fsc/config.yaml",
+    asr_model_file="fsc/valid.acc.ave_5best.pth",
+    # Decoding parameters are not included in the model file
+    nbest=1
+)
 
-def inference(wav,lang):
+def inference(wav,data):
     with torch.no_grad():
-        if lang == "english":
+        if data == "english_slurp":
             speech, rate = soundfile.read(wav.name)
-            nbests = speech2text(speech)
+            nbests = speech2text_slurp(speech)
             text, *_ = nbests[0]
             intent=text.split(" ")[0]
             scenario=intent.split("_")[0]
             action=intent.split("_")[1]
             text="{scenario: "+scenario+", action: "+action+"}"
+        elif data == "english_fsc":
+            speech, rate = soundfile.read(wav.name)
+            nbests = speech2text_fsc(speech)
+            text, *_ = nbests[0]
         # if lang == "chinese":
         #     wav = text2speechch(text)["wav"]
        #     scipy.io.wavfile.write("out.wav",text2speechch.fs , wav.view(-1).cpu().numpy())
@@ -107,15 +46,15 @@ def inference(wav,lang):
         #     scipy.io.wavfile.write("out.wav",text2speechjp.fs , wav.view(-1).cpu().numpy())
     return text
 title = "ESPnet2-SLU"
-description = "Gradio demo for ESPnet2-SLU: Advancing Spoken Language Understanding through ESPnet. To use it, simply record your audio. Read more at the links below."
+description = "Gradio demo for ESPnet2-SLU: Advancing Spoken Language Understanding through ESPnet. To use it, simply record your audio or click one of the examples to load them. Read more at the links below."
 article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"
 
-examples=[['audio_slurp.flac',"english"]]
+examples=[['audio_slurp.flac',"english_slurp"],['audio_fsc.wav',"english_fsc"]]
 
 # gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
 gr.Interface(
     inference,
-    [gr.inputs.Audio(label="input audio",source = "microphone", type="file"),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")],
+    [gr.inputs.Audio(label="input audio",source = "microphone", type="file"),gr.inputs.Radio(choices=["english_slurp","english_fsc"], type="value", default="english_slurp", label="Dataset")],
     gr.outputs.Textbox(type="str", label="Output"),
     title=title,
     description=description,
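The removed inline comments above warned that input audio must match the model's training sampling rate. Here is a minimal local sketch of the new FSC decoding path, assuming torch, soundfile, and espnet are installed and the fsc/ files from this commit are present; audio_fsc.wav is the example clip added alongside:

import soundfile
import torch
from espnet2.bin.asr_inference import Speech2Text

speech2text_fsc = Speech2Text.from_pretrained(
    asr_train_config="fsc/config.yaml",
    asr_model_file="fsc/valid.acc.ave_5best.pth",
    nbest=1,
)

with torch.no_grad():
    # soundfile.read returns the waveform and the file's sample rate
    speech, rate = soundfile.read("audio_fsc.wav")
    assert rate == 16000, "resample first; the config trains on fs: 16k audio"
    # each n-best hypothesis is a (text, tokens, token_ids, hyp) tuple
    text, *_ = speech2text_fsc(speech)[0]
    print(text)

Because the FSC model decodes word-level tokens in which the intent label is itself a vocabulary entry (e.g. increase_volume_none), the new english_fsc branch can return the decoded text directly, without the scenario/action string parsing used for SLURP.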
audio_fsc.wav ADDED
Binary file (41 kB).
fsc/config.yaml ADDED
@@ -0,0 +1,355 @@
+config: conf/tuning/train_asr_hubert_transformer_adam_specaug.yaml
+print_config: false
+log_level: INFO
+dry_run: false
+iterator_type: sequence
+output_dir: exp/asr_train_asr_hubert_transformer_adam_specaug_raw_en_word
+ngpu: 1
+seed: 0
+num_workers: 1
+num_att_plot: 3
+dist_backend: nccl
+dist_init_method: env://
+dist_world_size: null
+dist_rank: null
+local_rank: 0
+dist_master_addr: null
+dist_master_port: null
+dist_launcher: null
+multiprocessing_distributed: false
+unused_parameters: false
+sharded_ddp: false
+cudnn_enabled: true
+cudnn_benchmark: false
+cudnn_deterministic: true
+collect_stats: false
+write_collected_feats: false
+max_epoch: 200
+patience: null
+val_scheduler_criterion:
+- valid
+- loss
+early_stopping_criterion:
+- valid
+- loss
+- min
+best_model_criterion:
+-   - train
+    - loss
+    - min
+-   - valid
+    - loss
+    - min
+-   - train
+    - acc
+    - max
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 5
+grad_clip: 5.0
+grad_clip_type: 2.0
+grad_noise: false
+accum_grad: 1
+no_forward_run: false
+resume: true
+train_dtype: float32
+use_amp: false
+log_interval: null
+use_tensorboard: true
+use_wandb: false
+wandb_project: null
+wandb_id: null
+wandb_entity: null
+wandb_name: null
+wandb_model_log_interval: -1
+detect_anomaly: false
+pretrain_path: null
+init_param: []
+ignore_init_mismatch: false
+freeze_param:
+- frontend.upstream
+num_iters_per_epoch: null
+batch_size: 20
+valid_batch_size: null
+batch_bins: 1000000
+valid_batch_bins: null
+train_shape_file:
+- exp/asr_stats_raw_en_word/train/speech_shape
+- exp/asr_stats_raw_en_word/train/text_shape.word
+valid_shape_file:
+- exp/asr_stats_raw_en_word/valid/speech_shape
+- exp/asr_stats_raw_en_word/valid/text_shape.word
+batch_type: folded
+valid_batch_type: null
+fold_length:
+- 80000
+- 150
+sort_in_batch: descending
+sort_batch: descending
+multiple_iterator: false
+chunk_length: 500
+chunk_shift_ratio: 0.5
+num_cache_chunks: 1024
+train_data_path_and_name_and_type:
+-   - dump/raw/train/wav.scp
+    - speech
+    - sound
+-   - dump/raw/train/text
+    - text
+    - text
+valid_data_path_and_name_and_type:
+-   - dump/raw/valid/wav.scp
+    - speech
+    - sound
+-   - dump/raw/valid/text
+    - text
+    - text
+allow_variable_data_keys: false
+max_cache_size: 0.0
+max_cache_fd: 32
+valid_max_cache_size: null
+optim: adam
+optim_conf:
+    lr: 0.0002
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 25000
+token_list:
+- <blank>
+- <unk>
+- the
+- Turn
+- in
+- lights
+- 'on'
+- up
+- down
+- temperature
+- heat
+- 'off'
+- Switch
+- increase_volume_none
+- kitchen
+- language
+- decrease_volume_none
+- bedroom
+- washroom
+- volume
+- my
+- to
+- bathroom
+- Decrease
+- increase_heat_washroom
+- decrease_heat_washroom
+- Increase
+- music
+- heating
+- Bring
+- increase_heat_none
+- decrease_heat_none
+- me
+- change_language_none_none
+- activate_lights_washroom
+- Set
+- Lights
+- activate_lights_kitchen
+- I
+- activate_music_none
+- too
+- it
+- increase_heat_bedroom
+- decrease_heat_bedroom
+- sound
+- increase_heat_kitchen
+- decrease_heat_kitchen
+- deactivate_music_none
+- lamp
+- Make
+- deactivate_lights_bedroom
+- deactivate_lights_kitchen
+- bring_newspaper_none
+- newspaper
+- activate_lights_bedroom
+- bring_socks_none
+- socks
+- bring_shoes_none
+- shoes
+- need
+- Volume
+- activate_lights_none
+- deactivate_lights_none
+- bring_juice_none
+- juice
+- deactivate_lights_washroom
+- change_language_Chinese_none
+- deactivate_lamp_none
+- activate_lamp_none
+- Kitchen
+- turn
+- some
+- Could
+- you
+- Bedroom
+- Go
+- get
+- Washroom
+- Chinese
+- phone's
+- change_language_English_none
+- Get
+- change_language_Korean_none
+- OK
+- now
+- switch
+- main
+- change_language_German_none
+- practice
+- Louder
+- Stop
+- loud
+- increase
+- Play
+- hear
+- Change
+- quiet
+- Bathroom
+- Fetch
+- Korean
+- English
+- German
+- Pause
+- Lamp
+- Resume
+- louder
+- Heat
+- audio
+- Its
+- loud,
+- heating?
+- Far
+- a
+- different
+- please?
+- decrease
+- Too
+- settings
+- Put
+- Start
+- Quieter
+- please
+- Thats
+- softer
+- max
+- mute
+- lower
+- phone
+- couldn't
+- anything,
+- Reduce
+- this,
+- More
+- That's
+- Lower
+- levels
+- Use
+- hotter
+- languages
+- Allow
+- can't
+- that
+- Less
+- system
+- cooler
+- This
+- video
+- is
+- low,
+- device
+- Chinese.
+- quieter
+- English.
+- Language
+- Open
+- German.
+- Korean.
+- <sos/eos>
+init: null
+input_size: null
+ctc_conf:
+    dropout_rate: 0.0
+    ctc_type: builtin
+    reduce: true
+    ignore_nan_grad: true
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1
+    length_normalized_loss: false
+    extract_feats_in_collect_stats: false
+use_preprocessor: true
+token_type: word
+bpemodel: null
+non_linguistic_symbols: null
+cleaner: null
+g2p: null
+speech_volume_normalize: null
+rir_scp: null
+rir_apply_prob: 1.0
+noise_scp: null
+noise_apply_prob: 1.0
+noise_db_range: '13_15'
+frontend: s3prl
+frontend_conf:
+    frontend_conf:
+        upstream: hubert_large_ll60k
+    download_dir: ./hub
+    multilayer_feature: true
+    fs: 16k
+specaug: specaug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 40
+    num_time_mask: 2
+normalize: utterance_mvn
+normalize_conf: {}
+preencoder: linear
+preencoder_conf:
+    input_size: 1024
+    output_size: 80
+encoder: transformer
+encoder_conf:
+    output_size: 256
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 12
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d
+    normalize_before: true
+postencoder: null
+postencoder_conf: {}
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+required:
+- output_dir
+- token_list
+version: 0.10.3a2
+distributed: false
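The added config records a frozen HuBERT-large s3prl frontend (freeze_param: frontend.upstream), a linear preencoder that projects the 1024-dim frontend features down to 80, a 12-block transformer encoder with a 6-block decoder, and word-level tokens, so the FSC intent labels live directly in the vocabulary. A quick sketch, assuming PyYAML is installed, for inspecting the shipped file:

import yaml

# parse the training config added in this commit
with open("fsc/config.yaml") as f:
    config = yaml.safe_load(f)

print(config["frontend_conf"]["frontend_conf"]["upstream"])  # hubert_large_ll60k
print(config["token_type"], len(config["token_list"]))       # word 158
# crude heuristic: FSC intent tokens look like action_object_location
intents = [t for t in config["token_list"] if t.count("_") >= 2]
print(intents[:3])  # starts with increase_volume_none, decrease_volume_none, ...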
fsc/valid.acc.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2adc40ef8aaace766e9aa307cc49e78419a516a75fa58b3eabecc9e857fc5a4
+size 1375946815
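The checkpoint itself lives in Git LFS; the repository stores only this pointer, whose size field says the object is 1375946815 bytes (about 1.4 GB). A small sketch to verify that a clone actually fetched the weights rather than just the pointer text:

import os

EXPECTED = 1375946815  # "size" field from the LFS pointer above
path = "fsc/valid.acc.ave_5best.pth"
# an unfetched LFS pointer file is only a few hundred bytes long
if os.path.getsize(path) != EXPECTED:
    raise RuntimeError("LFS object not fetched; run `git lfs pull` first")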