File size: 9,039 Bytes
a7c2f52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59df896
1da587e
a7c2f52
1da587e
 
 
59df896
 
 
 
 
 
 
 
 
 
 
 
 
 
675f494
a7c2f52
 
 
 
 
 
 
 
59df896
 
a7c2f52
 
59df896
 
a7c2f52
 
 
 
59df896
1da587e
a7c2f52
8878265
a7c2f52
 
 
1da587e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import gradio as gr
import soundfile
import time
import torch
import scipy.io.wavfile
from espnet2.utils.types import str_or_none
from espnet2.bin.asr_inference import Speech2Text
from subprocess import call
import os
from espnet_model_zoo.downloader import ModelDownloader
# print(a1)
# exit()
# exit()
# tagen = 'kan-bayashi/ljspeech_vits' 
# vocoder_tagen = "none" 


def _strip_prompt_echo(text, instruction):
    """Remove the echoed prompt and special markers from a decoded hypothesis.

    Args:
        text: Raw hypothesis string produced by the recognizer.
        instruction: The full instruction given to the model; only the part
            before the first " <|" (i.e. before the special tokens) is
            removed from ``text``.

    Returns:
        ``text`` with the instruction prefix, "<|audio|>" and "_STOP"
        removed, truncated to whatever follows the last "." in the result.
    """
    prompt = instruction.split(" <|")[0]
    for fragment in (prompt, "<|audio|>", "_STOP"):
        text = text.replace(fragment, "")
    # Keep only the segment after the final period.
    return text.split(".")[-1]


def inference(wav,instruction):
    """Decode one audio clip with UniverSLU under the given instruction.

    Args:
        wav: Audio file handed straight to ``soundfile.read``; ``None`` when
            the caller supplied no audio.
        instruction: Task prompt, forwarded to the recognizer as
            ``nlp_prompt_prev_token`` and later stripped from the output.

    Returns:
        The cleaned-up top hypothesis (see ``_strip_prompt_echo``).

    Raises:
        gr.Error: If ``wav`` is ``None``, so the UI shows a readable message
            instead of an opaque failure from ``soundfile.read``.
    """
    if wav is None:
        raise gr.Error(
            "No audio received. Please record a clip or select one of the examples."
        )

    model_root = "UniverSLU-17-Natural-Phrase"
    exp_dir = (
        model_root
        + "/exp/asr_train_asr_whisper_full_correct_specaug_target_raw_en_whisper_multilingual"
    )

    with torch.no_grad():
        # NOTE(review): the sample rate is read but never checked or used to
        # resample; presumably the model expects a fixed rate -- confirm.
        speech, _rate = soundfile.read(wav)
        if speech.ndim == 2:
            # Multi-channel input: keep the first channel only.
            speech = speech[:, 0]

        # The recognizer is rebuilt on every call because the instruction is
        # a constructor argument (nlp_prompt_prev_token).
        speech2text = Speech2Text.from_pretrained(
            asr_train_config=exp_dir + "/config.yaml",
            asr_model_file=exp_dir + "/valid.acc.ave_10best.pth",
            # Decoding parameters are not included in the model file
            nlp_prompt_prev_token=instruction,
            prompt_token_file=model_root + "/add_tokens-Copy1.txt",
            ctc_weight=0.0,
            beam_size=1,
            nbest=1,
        )
        nbests = speech2text(speech)

    # Each n-best entry is a tuple whose first element is the text.
    text, *_ = nbests[0]
    return _strip_prompt_echo(text, instruction)

# Static text shown on the demo page: heading, intro paragraph and footer HTML.
title = "UniverSLU Natural Phrase"
description = (
    "Gradio demo for UniverSLU Natural Phrase "
    "(https://huggingface.co/espnet/UniverSLU-17-Natural-Phrase). "
    "UniverSLU-17 Natural-Phrase is a Multi-task Spoken Language Understanding "
    "model from CMU WAVLab. It adapts Whisper to additional tasks through "
    "instruction tuning, i.e., finetuning by describing the task using natural "
    "language instructions followed by the list of label options. To use it, "
    "simply record your audio or click one of the examples to load them. More "
    # "its" (possessive), not "it's".
    "details about the SLU tasks that the model is trained on and its "
    "performance on these tasks can be found in our paper: "
    "https://aclanthology.org/2024.naacl-long.151/"
)
article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"

# examples=[['audio_slurp_ner.flac',"english_slurp"],['audio_fsc.wav',"english_fsc"],['audio_grabo.wav',"dutch_scr"],['audio_english_scr.wav',"english_scr"],['audio_lt_scr.wav',"lithuanian_scr"],['audio_ar_scr.wav',"arabic_scr"],['audio_snips.wav',"english_snips"],['audio_lid.wav',"lid_voxforge"],['audio_fsd.wav',"fake_speech_detection_asvspoof"],['audio_er.wav',"emotion_rec_iemocap"],['audio_acc.wav',"accent_classify_accentdb"],['audio_mustard.wav',"sarcasm_mustard"],['audio_mustard_plus.wav',"sarcasm_mustard_plus"],['audio_voxceleb1.wav',"gender_voxceleb1"],['audio_esc50.wav',"audio_classification_esc50"],['audio_stop.wav',"semantic_parsing_stop"]]
# Clickable demo examples. Each entry is [audio file, instruction]; the
# instruction is the natural-language task description (optionally followed by
# the numbered label options) and ends with the special start/language tokens.
#
# NOTE(review): in the reviewed copy the grabo, snips and esc50 instructions
# were split across physical lines right after their first sentence, which is
# not valid inside a single-quoted literal. They are rejoined here with the
# single space the other prompts use -- confirm against the prompt format the
# model was trained with.
examples = [
    [
        'audio_slurp_ner.flac',
        'Identify the named entities in the spoken words. <|startoftranscript|> <|en|>',
    ],
    [
        'audio_fsc.wav',
        'Intent classification of spoken utterance. The options are 0."increase heat washroom", 1."deactivate lights", 2."deactivate lights bedroom", 3."decrease heat", 4."deactivate lights kitchen", 5."change language", 6."activate music", 7."change language English", 8."activate lights", 9."deactivate lights washroom", 10."change language German", 11."decrease heat kitchen", 12."increase volume", 13."decrease heat bedroom", 14."deactivate music", 15."decrease volume", 16."change language Chinese", 17."decrease heat washroom", 18."change language Korean", 19."increase heat", 20."bring newspaper", 21."activate lamp", 22."deactivate lamp", 23."bring juice", 24."activate lights kitchen", 25."increase heat kitchen", 26."bring socks", 27."activate lights bedroom", 28."increase heat bedroom", 29."activate lights washroom", 30."bring shoes". <|startoftranscript|> <|en|>',
    ],
    [
        'audio_grabo.wav',
        'Recognize speech command. The options are 0."lift position up", 1."pointer state on", 2."turn relative slow south", 3."turn absolute south", 4."move relative slow alot forward", 5."turn relative fast south", 6."turn relative fast west", 7."turn relative slow west", 8."move relative slow alot backward", 9."move absolute slow right down", 10."move relative fast alot backward", 11."pointer state off", 12."grab grabber open", 13."move relative slow normal backward", 14."move absolute fast centerx centery", 15."approach slow", 16."turn absolute west", 17."move relative slow normal forward", 18."move absolute fast left up", 19."turn relative slow east", 20."move relative fast alot forward", 21."lift position down", 22."turn relative fast east", 23."move relative fast little forward", 24."move relative fast little backward", 25."move relative fast normal backward", 26."approach fast", 27."move absolute fast right down", 28."grab grabber close", 29."move absolute slow centerx centery", 30."turn absolute east", 31."move relative slow little forward", 32."turn absolute north", 33."move relative slow little backward", 34."move absolute slow left up", 35."move relative fast normal forward". <|startoftranscript|> <|nl|>',
    ],
    [
        'audio_english_scr.wav',
        'Recognize speech command. The options are 0."yes", 1."down", 2."no", 3."stop", 4."go", 5."on", 6."left", 7."right", 8."unknown", 9."silence", 10."off", 11."up". <|startoftranscript|> <|en|>',
    ],
    [
        'audio_lt_scr.wav',
        'Recognize speech command. The options are 0."ačiū", 1."iki", 2."išjunk", 3."labas", 4."ne", 5."pauzė", 6."startas", 7."stop", 8."unknown", 9."į_apačią", 10."į_dešinę", 11."į_kairę", 12."į_viršų", 13."įjunk". <|startoftranscript|> <|lt|>',
    ],
    [
        'audio_ar_scr.wav',
        'Recognize speech command. The options are 0."A", 1."B", 2."C", 3."D", 4."E", 5."F", 6."0", 7."1", 8."2", 9."3", 10."4", 11."5", 12."6", 13."7", 14."8", 15."9". <|startoftranscript|> <|ar|>',
    ],
    [
        'audio_snips.wav',
        'Intent classification of spoken utterance. The options are 0."Increase brightness", 1."Set light color", 2."Set light brightness", 3."Switch light on", 4."Decrease brightness", 5."Switch light off". <|startoftranscript|> <|en|>',
    ],
    [
        'audio_lid.wav',
        'Determining the language in spoken speech. The options are 0."<|ru|>", 1."<|es|>", 2."<|it|>", 3."<|en|>", 4."<|fr|>", 5."<|de|>". <|startoftranscript|>',
    ],
    [
        'audio_fsd.wav',
        'Distinguish between synthesized and converted speech from actual speech. The options are 0."spoof", 1."bonafide". <|startoftranscript|> <|en|>',
    ],
    [
        'audio_er.wav',
        'Emotion recognition of spoken utterance. The options are 0."angry", 1."neutral", 2."sad", 3."happy", 4."other". <|startoftranscript|> <|en|>',
    ],
    [
        'audio_acc.wav',
        'Accent classification in speech. The options are 0."american", 1."australian", 2."bangla", 3."british", 4."indian", 5."malayalam", 6."odiya", 7."telugu", 8."welsh". <|startoftranscript|> <|en|>',
    ],
    [
        'audio_mustard.wav',
        'Determine if the speech is sarcastic. The options are 0."sarcasm", 1."not sarcasm". <|startoftranscript|> <|en|>',
    ],
    [
        'audio_voxceleb1.wav',
        'Recognize the gender of the speaker. The options are 0."female", 1."male". <|startoftranscript|> <|en|>',
    ],
    [
        'audio_esc50.wav',
        'Categorize the background noise in the audio. The options are 0."dog", 1."rooster", 2."pig", 3."cow", 4."frog", 5."cat", 6."hen", 7."insects", 8."sheep", 9."crow", 10."rain", 11."sea waves", 12."crackling fire", 13."crickets", 14."chirping birds", 15."water drops", 16."wind", 17."pouring water", 18."toilet flush", 19."thunderstorm", 20."crying baby", 21."sneezing", 22."clapping", 23."breathing", 24."coughing", 25."footsteps", 26."laughing", 27."brushing teeth", 28."snoring", 29."drinking sipping", 30."door wood knock", 31."mouse click", 32."keyboard typing", 33."door wood creaks", 34."can opening", 35."washing machine", 36."vacuum cleaner", 37."clock alarm", 38."clock tick", 39."glass breaking", 40."helicopter", 41."chainsaw", 42."siren", 43."car horn", 44."engine", 45."train", 46."church bells", 47."airplane", 48."fireworks", 49."hand saw". <|startoftranscript|> <|audio|>',
    ],
    [
        'audio_stop.wav',
        'Develop the semantic parse of the spoken content. <|startoftranscript|> <|en|>',
    ],
    [
        'audio_freesound.wav',
        'Identify if there is speech in the provided audio. The options are 0."no speech",1."speech". <|startoftranscript|>',
    ],
]

# gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
# Assemble the demo page: a microphone audio input and a free-text instruction
# box feed `inference`; its return value is shown in a single text box.
demo = gr.Interface(
    fn=inference,
    inputs=[
        gr.Audio(label="input audio", sources=["microphone"], type="filepath"),
        gr.Textbox(type="text", label="Instruction"),
    ],
    outputs=gr.Textbox(type="text", label="Output"),
    title=title,
    description=description,
    article=article,
    examples=examples,
    # Examples are not pre-computed; they only pre-fill the inputs.
    cache_examples=False,
)

# Start the web server with debug output enabled.
demo.launch(debug=True)