ssiidd committed on
Commit 1da587e
1 Parent(s): 0ae8a5d

Complete demo

app.py CHANGED
@@ -14,74 +14,306 @@ from espnet_model_zoo.downloader import ModelDownloader
  # tagen = 'kan-bayashi/ljspeech_vits'
  # vocoder_tagen = "none"

- speech2text_slurp = Speech2Text.from_pretrained(
-     asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-     asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-     # Decoding parameters are not included in the model file
-     lang_prompt_token="<|en|> <|ner|> <|SLURP|>",
-     prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-     nbest=1
- )

- speech2text_fsc = Speech2Text.from_pretrained(
-     asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-     asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-     # Decoding parameters are not included in the model file
-     lang_prompt_token="<|en|> <|ic|> <|fsc|>",
-     prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-     nbest=1
- )

- speech2text_grabo = Speech2Text.from_pretrained(
-     asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-     asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-     # Decoding parameters are not included in the model file
-     lang_prompt_token="<|nl|> <|scr|> <|grabo_scr|>",
-     prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-     nbest=1
- )

  def inference(wav,data):
      with torch.no_grad():
          if data == "english_slurp":
-             speech, rate = soundfile.read(wav.name)
-             nbests = speech2text_slurp(speech)
              text, *_ = nbests[0]
-             # intent=text.split(" ")[0]
-             # scenario=intent.split("_")[0]
-             # action=intent.split("_")[1]
-             # text="{scenario: "+scenario+", action: "+action+"}"
          elif data == "english_fsc":
-             print(wav.name)
-             speech, rate = soundfile.read(wav.name)
-             print(speech.shape)
-             if len(speech.shape)==2:
-                 speech=speech[:,0]
-             # soundfile.write("store_file.wav", speech, rate, subtype='FLOAT')
-             print(speech.shape)
-             nbests = speech2text_fsc(speech)
              text, *_ = nbests[0]
-             # intent=text.split(" ")[0]
-             # action=intent.split("_")[0]
-             # objects=intent.split("_")[1]
-             # location=intent.split("_")[2]
-             # text="{action: "+action+", object: "+objects+", location: "+location+"}"
-         # elif data == "english_snips":
-         #     print(wav.name)
-         #     speech, rate = soundfile.read(wav.name)
-         #     nbests = speech2text_snips(speech)
-         #     text, *_ = nbests[0]
-         elif data == "dutch":
-             print(wav.name)
-             speech, rate = soundfile.read(wav.name)
-             nbests = speech2text_grabo(speech)
              text, *_ = nbests[0]
-             # intent=text.split(" ")[0]
-             # action=intent.split("_")[0]
-             # objects=intent.split("_")[1]
-             # location=intent.split("_")[2]
-             # text="{action: "+action+", object: "+objects+", location: "+location+"}"
-
          # if lang == "chinese":
          #     wav = text2speechch(text)["wav"]
          #     scipy.io.wavfile.write("out.wav",text2speechch.fs , wav.view(-1).cpu().numpy())
@@ -91,19 +323,18 @@ def inference(wav,data):
      return text

  title = "UniverSLU"
- description = "Gradio demo for UniverSLU: Universal Spoken Language Understanding for Diverse Tasks with Natural Language Instructions. To use it, simply record your audio or click one of the examples to load them. Read more at the links below."
  article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"

- examples=[['audio_slurp.flac',"english_slurp"],['audio_fsc.wav',"english_fsc"],['audio_grabo.wav',"dutch"]]

  # gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
  gr.Interface(
      inference,
-     [gr.Audio(label="input audio",source = "microphone", type="file"),gr.Radio(choices=["english_slurp","english_fsc","dutch_scd"], type="value", default="english_fsc", label="Task")],
-     gr.Textbox(type="str", label="Output"),
      title=title,
      description=description,
      article=article,
-     enable_queue=True,
      examples=examples
- ).launch(debug=True)
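
The removed code above built one global Speech2Text per task at import time; the added code below constructs a model inside each inference() call instead. A minimal sketch (not part of this commit) of a cached loader that would keep that per-call pattern cheap, assuming the Speech2Text class and the UniverSLU-specific lang_prompt_token/prompt_token_file keyword arguments used throughout this file, with the import path as in stock ESPnet:

    from functools import lru_cache

    # Assumed import path; app.py already imports Speech2Text near the top.
    from espnet2.bin.asr_inference import Speech2Text

    EXP = "UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual"

    @lru_cache(maxsize=None)
    def cached_speech2text(lang_prompt_token, beam_size=1, penalty=0.0):
        # lru_cache keys on the argument tuple, so each distinct task prompt
        # constructs its model once and reuses it on later calls.
        return Speech2Text.from_pretrained(
            asr_train_config=EXP + "/config.yaml",
            asr_model_file=EXP + "/valid.acc.ave_10best.pth",
            lang_prompt_token=lang_prompt_token,
            prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
            ctc_weight=0.0,
            beam_size=beam_size,
            penalty=penalty,
            nbest=1,
        )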
 
  # tagen = 'kan-bayashi/ljspeech_vits'
  # vocoder_tagen = "none"

+ audio_class_str='0."dog", 1."rooster", 2."pig", 3."cow", 4."frog", 5."cat", 6."hen", 7."insects", 8."sheep", 9."crow", 10."rain", 11."sea waves", 12."crackling fire", 13."crickets", 14."chirping birds", 15."water drops", 16."wind", 17."pouring water", 18."toilet flush", 19."thunderstorm", 20."crying baby", 21."sneezing", 22."clapping", 23."breathing", 24."coughing", 25."footsteps", 26."laughing", 27."brushing teeth", 28."snoring", 29."drinking sipping", 30."door wood knock", 31."mouse click", 32."keyboard typing", 33."door wood creaks", 34."can opening", 35."washing machine", 36."vacuum cleaner", 37."clock alarm", 38."clock tick", 39."glass breaking", 40."helicopter", 41."chainsaw", 42."siren", 43."car horn", 44."engine", 45."train", 46."church bells", 47."airplane", 48."fireworks", 49."hand saw".'
+ audio_class_arr=audio_class_str.split(", ")
+ audio_class_arr=[k.split('"')[1] for k in audio_class_arr]

  def inference(wav,data):
+     # import pdb;pdb.set_trace()
      with torch.no_grad():
+         speech, rate = soundfile.read(wav)
+         if len(speech.shape)==2:
+             speech=speech[:,0]
          if data == "english_slurp":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lang_prompt_token="<|en|> <|ner|> <|SLURP|>",
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 beam_size=20,
+                 ctc_weight=0.0,
+                 penalty=0.1,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
              text, *_ = nbests[0]
+             text=text.split("|>")[-1]
+             intent=text.split(" ")[0].replace("in:","")
+             scenario=intent.split("_")[0]
+             action=intent.split("_")[1]
+             ner_text=text.split(" SEP ")[1:-1]
+             text="INTENT: {scenario: "+scenario+", action: "+action+"}\n"
+             text=text+"NAMED ENTITIES: {"
+             for k in ner_text:
+                 slot_name=k.split(" FILL ")[0].replace("sl:","")
+                 slot_val=k.split(" FILL ")[1]
+                 text=text+" "+slot_name+" : "+slot_val+","
+             text=text+"}"
          elif data == "english_fsc":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lang_prompt_token="<|en|> <|ic|> <|fsc|>",
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
              text, *_ = nbests[0]
+             text=text.split("|>")[-1]
+             intent=text.split(" ")[0].replace("in:","")
+             action=intent.split("_")[0]
+             objects=intent.split("_")[1]
+             location=intent.split("_")[2]
+             text="INTENT: {action: "+action+", object: "+objects+", location: "+location+"}"
+         elif data == "english_snips":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lang_prompt_token="<|en|> <|ic|> <|SNIPS|>",
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
              text, *_ = nbests[0]
+             text=text.split("|>")[-1]
+             intent=text.split(" ")[0].replace("in:","")
+             text="INTENT: "+intent
+         elif data == "dutch_scr":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lang_prompt_token="<|nl|> <|scr|> <|grabo_scr|>",
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 beam_size=20,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
+             text, *_ = nbests[0]
+             text=text.split("|>")[-1]
+             intent=text.split(" ")[0]
+             text="SPEECH COMMAND: "+intent
+         elif data == "english_scr":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lang_prompt_token="<|en|> <|scr|> <|google_scr|>",
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 beam_size=1,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
+             text, *_ = nbests[0]
+             text=text.split("|>")[-1]
+             intent=text.split(" ")[0].replace("command:","")
+             text="SPEECH COMMAND: "+intent
+         elif data == "lithuanian_scr":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lang_prompt_token="<|lt|> <|scr|> <|lt_scr|>",
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 beam_size=1,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
+             text, *_ = nbests[0]
+             text=text.split("|>")[-1]
+             intent=text
+             text="SPEECH COMMAND: "+intent
+         elif data == "arabic_scr":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lang_prompt_token="<|ar|> <|scr|> <|ar_scr|>",
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 beam_size=1,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
+             text, *_ = nbests[0]
+             text=text.split("|>")[-1]
+             intent=text.split(" ")[0].replace("command:","")
+             text="SPEECH COMMAND: "+intent
+         elif data == "lid_voxforge":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lid_prompt=True,
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 beam_size=1,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
+             # import pdb;pdb.set_trace()
+             lang=speech2text.converter.tokenizer.tokenizer.convert_ids_to_tokens(nbests[0][2][0]).replace("|>","").replace("<|","")
+             text="LANG: "+lang
+         elif data == "fake_speech_detection_asvspoof":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lang_prompt_token="<|en|> <|fsd|> <|asvspoof|>",
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 beam_size=1,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
+             text, *_ = nbests[0]
+             text=text.split("|>")[-1]
+             intent=text.split(" ")[0].replace("class:","")
+             text="SPEECH CLASS: "+intent
+         elif data == "emotion_rec_iemocap":
+             replace_dict={}
+             replace_dict["em:neu"]="Neutral"
+             replace_dict["em:ang"]="Angry"
+             replace_dict["em:sad"]="Sad"
+             replace_dict["em:hap"]="Happy"
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lang_prompt_token="<|en|> <|er|> <|iemocap|>",
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 beam_size=1,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
+             text, *_ = nbests[0]
+             text=text.split("|>")[-1]
+             intent=replace_dict[text.split(" ")[0]]
+             text="EMOTION: "+intent
+         elif data == "accent_classify_accentdb":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lang_prompt_token="<|en|> <|accent_rec|> <|accentdb|>",
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 beam_size=1,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
+             text, *_ = nbests[0]
+             text=text.split("|>")[-1]
+             intent=text.split(" ")[0].replace("accent:","")
+             text="ACCENT: "+intent
+         elif data == "sarcasm_mustard":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lang_prompt_token="<|en|> <|scd|> <|mustard|>",
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 beam_size=1,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
+             text, *_ = nbests[0]
+             text=text.split("|>")[-1]
+             intent=text.split(" ")[0].replace("class:","")
+             text="SARCASM CLASS: "+intent
+         elif data == "sarcasm_mustard_plus":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lang_prompt_token="<|en|> <|scd|> <|mustard_plus_plus|>",
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 beam_size=1,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
+             text, *_ = nbests[0]
+             text=text.split("|>")[-1]
+             intent=text.split(" ")[0].replace("class:","")
+             text="SARCASM CLASS: "+intent
+         elif data == "gender_voxceleb1":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lang_prompt_token="<|en|> <|gid|> <|voxceleb|>",
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 beam_size=1,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
+             text, *_ = nbests[0]
+             text=text.split("|>")[-1]
+             intent=text.split(" ")[0].replace("gender:f","female").replace("gender:m","male")
+             text="GENDER: "+intent
+         elif data == "audio_classification_esc50":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lang_prompt_token="<|audio|> <|auc|> <|esc50|>",
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 beam_size=1,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
+             text, *_ = nbests[0]
+             text=text.split("|>")[-1]
+             intent=text.split(" ")[0].replace("audio_class:","")
+             text="AUDIO EVENT CLASS: "+audio_class_arr[int(intent)]
+         elif data == "semantic_parsing_stop":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lang_prompt_token="<|en|> <|sp|> <|STOP|>",
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 beam_size=20,
+                 penalty=0.1,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
+             text, *_ = nbests[0]
+             text=text.split("|>")[-1].replace("_STOP","")
+             text="SEMANTIC PARSE SEQUENCE: "+text
+         elif data == "vad_freesound":
+             speech2text = Speech2Text.from_pretrained(
+                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
+                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+                 # Decoding parameters are not included in the model file
+                 lid_prompt=True,
+                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
+                 ctc_weight=0.0,
+                 beam_size=1,
+                 nbest=1
+             )
+             nbests = speech2text(speech)
+             lang=speech2text.converter.tokenizer.tokenizer.convert_ids_to_tokens(nbests[0][2][0])
+             if lang=="<|nospeech|>":
+                 text="VAD: no speech"
+             else:
+                 text="VAD: speech"
          # if lang == "chinese":
          #     wav = text2speechch(text)["wav"]
          #     scipy.io.wavfile.write("out.wav",text2speechch.fs , wav.view(-1).cpu().numpy())
 
      return text

  title = "UniverSLU"
+ description = "Gradio demo for UniverSLU Task Specifier (https://huggingface.co/espnet/UniverSLU-17-Task-Specifier). UniverSLU-17 Task Specifier is a multi-task Spoken Language Understanding model from CMU WAVLab. It adapts Whisper to additional tasks using single-token task specifiers. To use it, simply record your audio or click one of the examples to load it. More details about the SLU tasks the model is trained on, and its performance on these tasks, can be found in our paper: https://aclanthology.org/2024.naacl-long.151/"
  article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"

+ examples=[['audio_slurp_ner.flac',"english_slurp"],['audio_fsc.wav',"english_fsc"],['audio_grabo.wav',"dutch_scr"],['audio_english_scr.wav',"english_scr"],['audio_lt_scr.wav',"lithuanian_scr"],['audio_ar_scr.wav',"arabic_scr"],['audio_snips.wav',"english_snips"],['audio_lid.wav',"lid_voxforge"],['audio_fsd.wav',"fake_speech_detection_asvspoof"],['audio_er.wav',"emotion_rec_iemocap"],['audio_acc.wav',"accent_classify_accentdb"],['audio_mustard.wav',"sarcasm_mustard"],['audio_mustard_plus.wav',"sarcasm_mustard_plus"],['audio_voxceleb1.wav',"gender_voxceleb1"],['audio_esc50.wav',"audio_classification_esc50"],['audio_stop.wav',"semantic_parsing_stop"],['audio_freesound.wav',"vad_freesound"]]

  # gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
  gr.Interface(
      inference,
+     [gr.Audio(label="input audio",sources=["microphone"],type="filepath"),gr.Radio(choices=["english_slurp","english_fsc","dutch_scr","english_scr","lithuanian_scr","arabic_scr","english_snips","lid_voxforge","fake_speech_detection_asvspoof","emotion_rec_iemocap","accent_classify_accentdb","sarcasm_mustard","sarcasm_mustard_plus","gender_voxceleb1","audio_classification_esc50","semantic_parsing_stop","vad_freesound"], type="value", label="Task")],
+     gr.Textbox(type="text", label="Output"),
      title=title,
      description=description,
      article=article,
      examples=examples
+ ).launch(debug=True)
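
To make the new SLURP post-processing above concrete, here is a worked example on a hypothetical decoded hypothesis. The string's shape (prompt tokens ending in "|>", an "in:<scenario>_<action>" intent, "SEP"-delimited "sl:<slot> FILL <value>" entity chunks, and a trailing chunk that the [1:-1] slice discards) is inferred from the parsing code, not taken from the model's actual output:

    # Hypothetical hypothesis string; only its shape is inferred from the code above.
    text = '<|en|><|ner|><|SLURP|>in:calendar_set SEP sl:event_name FILL team meeting SEP sl:date FILL tomorrow SEP set a team meeting for tomorrow'

    text = text.split("|>")[-1]                     # strip prompt tokens -> "in:calendar_set SEP ..."
    intent = text.split(" ")[0].replace("in:", "")  # "calendar_set"
    scenario = intent.split("_")[0]                 # "calendar"
    action = intent.split("_")[1]                   # "set"
    ner_text = text.split(" SEP ")[1:-1]            # the two entity chunks
    out = "INTENT: {scenario: " + scenario + ", action: " + action + "}\n"
    out += "NAMED ENTITIES: {"
    for k in ner_text:
        slot_name = k.split(" FILL ")[0].replace("sl:", "")
        slot_val = k.split(" FILL ")[1]
        out += " " + slot_name + " : " + slot_val + ","
    out += "}"
    print(out)
    # INTENT: {scenario: calendar, action: set}
    # NAMED ENTITIES: { event_name : team meeting, date : tomorrow,}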
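Likewise for the audio_classification_esc50 branch: the label table built near the top of the new file turns a decoded "audio_class:<n>" index into a readable name. A shortened sketch (only the first six of the fifty classes) showing how that lookup behaves:

    # Truncated copy of the label string from the code above.
    audio_class_str = '0."dog", 1."rooster", 2."pig", 3."cow", 4."frog", 5."cat".'
    audio_class_arr = audio_class_str.split(", ")
    audio_class_arr = [k.split('"')[1] for k in audio_class_arr]
    print(audio_class_arr)   # ['dog', 'rooster', 'pig', 'cow', 'frog', 'cat']

    intent = "4"             # what text.split(" ")[0].replace("audio_class:","") might yield
    print("AUDIO EVENT CLASS: " + audio_class_arr[int(intent)])   # AUDIO EVENT CLASS: frog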
audio_acc.wav ADDED
Binary file (159 kB).

audio_ar_scr.wav ADDED
Binary file (68.5 kB).

audio_english_scr.wav ADDED
Binary file (32 kB).

audio_er.wav ADDED
Binary file (193 kB).

audio_esc50.wav ADDED
Binary file (441 kB).

audio_freesound.wav ADDED
Binary file (30.3 kB).

audio_fsd.wav ADDED
Binary file (40 kB).

audio_lid.wav ADDED
Binary file (320 kB).

audio_lt_scr.wav ADDED
Binary file (32 kB).

audio_mustard.wav ADDED
Binary file (225 kB).

audio_mustard_plus.wav ADDED
Binary file (201 kB).

audio_slurp_ner.flac ADDED
Binary file (59.7 kB).

audio_snips.wav ADDED
Binary file (112 kB).

audio_stop.wav ADDED
Binary file (132 kB).

audio_voxceleb1.wav ADDED
Binary file (141 kB).