import os
import sys
from subprocess import call

import gradio as gr
import soundfile
import torch
from espnet2.bin.asr_inference import Speech2Text
from espnet_model_zoo.downloader import ModelDownloader

# Pre-download the multilingual ASR model so its files are cached locally
# before the demo starts serving requests.
d = ModelDownloader()
tag = "ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best"
d.download_and_unpack(tag)

# Run the shell script that sets up s3prl, which provides the
# self-supervised speech frontends some of the models below rely on.
with open("s3prl.sh", "rb") as file:
    script = file.read()
call(script, shell=True)

# Make the freshly installed s3prl importable here and in child processes.
sys.path.append(os.getcwd() + "/s3prl")
os.environ["PYTHONPATH"] = os.getcwd() + "/s3prl"

# Sanity-check that fairseq (required by the s3prl frontends) is importable.
import fairseq

print(fairseq.__version__)

# One pretrained SLU model per dataset/language offered in the demo.

# SLURP: English intent classification; intents are "{scenario}_{action}".
speech2text_slurp = Speech2Text.from_pretrained(
    asr_train_config="slurp/config.yaml",
    asr_model_file="slurp/valid.acc.ave_10best.pth",
    # Decoding parameters are not included in the model file
    nbest=1,
)

# FSC (Fluent Speech Commands): English intents as "{action}_{object}_{location}".
speech2text_fsc = Speech2Text.from_pretrained(
    asr_train_config="fsc/config.yaml",
    asr_model_file="fsc/valid.acc.ave_5best.pth",
    # Decoding parameters are not included in the model file
    nbest=1,
)

# SNIPS: loaded here but not currently exposed as a choice in the demo UI.
speech2text_snips = Speech2Text.from_pretrained(
    asr_train_config="espnet-slu-snips/config.yaml",
    asr_model_file="espnet-slu-snips/valid.acc.ave_10best.pth",
    # Decoding parameters are not included in the model file
    nbest=1,
)

# CATSLU: Chinese spoken language understanding.
speech2text_catslu = Speech2Text.from_pretrained(
    asr_train_config="catslu/config.yaml",
    asr_model_file="catslu/valid.acc.ave_5best.pth",
    # Decoding parameters are not included in the model file
    nbest=1,
)

# Grabo: Dutch commands; decode with attention only (no CTC).
speech2text_grabo = Speech2Text.from_pretrained(
    asr_train_config="grabo/config.yaml",
    asr_model_file="grabo/valid.acc.ave_10best.pth",
    ctc_weight=0.0,
    # Decoding parameters are not included in the model file
    nbest=1,
)
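
# Each Speech2Text instance maps a 1-D float waveform to an n-best list of
# (text, tokens, token_ids, hypothesis) tuples; with nbest=1 the demo only
# ever reads the top hypothesis, i.e.:
#
#     nbests = speech2text_slurp(speech)
#     text, *_ = nbests[0]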

def inference(wav, data):
    """Run SLU inference on the recorded audio for the selected dataset."""
    with torch.no_grad():
        speech, rate = soundfile.read(wav.name)
        # Downmix stereo recordings to mono: the models expect a 1-D waveform.
        if len(speech.shape) == 2:
            speech = speech[:, 0]
        if data == "english_slurp":
            nbests = speech2text_slurp(speech)
            text, *_ = nbests[0]
            # The first token of the hypothesis is the intent label,
            # e.g. "calendar_set".
            intent = text.split(" ")[0]
            scenario, action = intent.split("_", 1)
            text = "{scenario: " + scenario + ", action: " + action + "}"
        elif data == "english_fsc":
            nbests = speech2text_fsc(speech)
            text, *_ = nbests[0]
            # FSC intents look like "change_language_none".
            intent = text.split(" ")[0]
            action, objects, location = intent.split("_")
            text = (
                "{action: " + action + ", object: " + objects
                + ", location: " + location + "}"
            )
        elif data == "chinese":
            nbests = speech2text_catslu(speech)
            text, *_ = nbests[0]
            text = text.split(" ")[0]
        elif data == "dutch":
            # Grabo: return the decoded hypothesis as-is.
            nbests = speech2text_grabo(speech)
            text, *_ = nbests[0]
        else:
            raise ValueError("Unknown dataset: " + data)
    return text
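
# A minimal sanity check outside the Gradio UI (a sketch, not part of the
# demo): Gradio's "file"-typed Audio input passes an object exposing a
# ``.name`` path, so we can mimic it with SimpleNamespace. This assumes the
# bundled example file "audio_slurp.flac" is present.
if False:  # flip to True for a quick local test
    from types import SimpleNamespace

    print(inference(SimpleNamespace(name="audio_slurp.flac"), "english_slurp"))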

title = "ESPnet2-SLU"
description = "Gradio demo for ESPnet2-SLU: Advancing Spoken Language Understanding through ESPnet. To use it, simply record your audio or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"

examples = [
    ["audio_slurp.flac", "english_slurp"],
    ["audio_fsc.wav", "english_fsc"],
    ["audio_grabo.wav", "dutch"],
    ["audio_catslu.wav", "chinese"],
]

gr.Interface(
    inference,
    [
        gr.inputs.Audio(label="input audio", source="microphone", type="file"),
        gr.inputs.Radio(
            choices=["english_slurp", "english_fsc", "dutch", "chinese"],
            type="value",
            default="english_slurp",
            label="Dataset",
        ),
    ],
    gr.outputs.Textbox(type="str", label="Output"),
    title=title,
    description=description,
    article=article,
    enable_queue=True,
    examples=examples,
).launch(debug=True)