import os
import sys
from subprocess import call

import gradio as gr
import soundfile
import torch
from espnet2.bin.asr_inference import Speech2Text
from espnet_model_zoo.downloader import ModelDownloader
d = ModelDownloader()
tag = "ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best"
# Fetch the pretrained model from the ESPnet model zoo. (The returned paths
# are not used below; the SLU models are loaded from local checkpoints.)
d.download_and_unpack(tag)
# Run the setup script (expected to fetch/install s3prl and its dependencies).
with open('s3prl.sh', 'rb') as file:
    script = file.read()
rc = call(script, shell=True)
sys.path.append(os.path.join(os.getcwd(), "s3prl"))
os.environ["PYTHONPATH"] = os.path.join(os.getcwd(), "s3prl")
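# Note: sys.path covers this process, while PYTHONPATH covers any
# subprocesses; the SLU checkpoints rely on s3prl/fairseq SSL front-ends,
# so both need to resolve the s3prl checkout (assumption based on the
# setup script above).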
import fairseq
# Sanity check that fairseq was installed by the setup script.
print(fairseq.__version__)
speech2text_slurp = Speech2Text.from_pretrained(
    asr_train_config="slurp/config.yaml",
    asr_model_file="slurp/valid.acc.ave_10best.pth",
    # Decoding parameters are not included in the model file
    nbest=1,
)
speech2text_fsc = Speech2Text.from_pretrained(
    asr_train_config="fsc/config.yaml",
    asr_model_file="fsc/valid.acc.ave_5best.pth",
    # Decoding parameters are not included in the model file
    nbest=1,
)
# Loaded but not currently exposed in the UI (see the commented-out
# "english_snips" branch in inference below).
speech2text_snips = Speech2Text.from_pretrained(
    asr_train_config="espnet-slu-snips/config.yaml",
    asr_model_file="espnet-slu-snips/valid.acc.ave_10best.pth",
    # Decoding parameters are not included in the model file
    nbest=1,
)
speech2text_catslu = Speech2Text.from_pretrained(
    asr_train_config="catslu/config.yaml",
    asr_model_file="catslu/valid.acc.ave_5best.pth",
    # Decoding parameters are not included in the model file
    nbest=1,
)
speech2text_grabo = Speech2Text.from_pretrained(
    asr_train_config="grabo/config.yaml",
    asr_model_file="grabo/valid.acc.ave_10best.pth",
    ctc_weight=0.0,
    # Decoding parameters are not included in the model file
    nbest=1,
)
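# A minimal standalone decoding sketch (illustrative; "sample.wav" is a
# hypothetical 16 kHz mono recording, not a file from this repo):
#
#   speech, rate = soundfile.read("sample.wav")
#   nbests = speech2text_slurp(speech)
#   hypothesis, *_ = nbests[0]
#
# Each n-best entry is a tuple whose first element is the decoded text;
# for these SLU models the first whitespace-separated token carries the
# intent label, which inference() parses below.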
def inference(wav, data):
    with torch.no_grad():
        if data == "english_slurp":
            speech, rate = soundfile.read(wav.name)
            nbests = speech2text_slurp(speech)
            text, *_ = nbests[0]
            # The first token encodes the intent as "scenario_action".
            intent = text.split(" ")[0]
            scenario, action = intent.split("_", 1)
            text = "{scenario: " + scenario + ", action: " + action + "}"
        elif data == "english_fsc":
            speech, rate = soundfile.read(wav.name)
            # Down-mix stereo input to a single channel.
            if len(speech.shape) == 2:
                speech = speech[:, 0]
            nbests = speech2text_fsc(speech)
            text, *_ = nbests[0]
            # The first token encodes the intent as "action_object_location".
            action, objects, location = text.split(" ")[0].split("_")
            text = "{action: " + action + ", object: " + objects + ", location: " + location + "}"
        elif data == "chinese":
            speech, rate = soundfile.read(wav.name)
            if len(speech.shape) == 2:
                speech = speech[:, 0]
            nbests = speech2text_catslu(speech)
            text, *_ = nbests[0]
            # Only the intent token is reported for CATSLU.
            text = text.split(" ")[0]
        # elif data == "english_snips":
        #     speech, rate = soundfile.read(wav.name)
        #     nbests = speech2text_snips(speech)
        #     text, *_ = nbests[0]
        elif data == "dutch":
            speech, rate = soundfile.read(wav.name)
            nbests = speech2text_grabo(speech)
            text, *_ = nbests[0]
        else:
            # Guard against an unknown dataset choice so `text` is always defined.
            text = "Unsupported dataset: " + data
    return text
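# Illustrative example (values are hypothetical, not from the source): a
# SLURP decode whose first token is "alarm_set" would be rendered by the
# branch above as "{scenario: alarm, action: set}".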
title = "ESPnet2-SLU"
description = "Gradio demo for ESPnet2-SLU: Advancing Spoken Language Understanding through ESPnet. To use it, simply record your audio or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"
examples = [
    ['audio_slurp.flac', "english_slurp"],
    ['audio_fsc.wav', "english_fsc"],
    ['audio_grabo.wav', "dutch"],
    ['audio_catslu.wav', "chinese"],
]
gr.Interface(
    inference,
    [
        gr.inputs.Audio(label="input audio", source="microphone", type="file"),
        gr.inputs.Radio(
            choices=["english_slurp", "english_fsc", "dutch", "chinese"],
            type="value",
            default="english_slurp",
            label="Dataset",
        ),
    ],
    gr.outputs.Textbox(type="str", label="Output"),
    title=title,
    description=description,
    article=article,
    enable_queue=True,
    examples=examples,
).launch(debug=True)
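# To run this demo locally (assuming the checkpoints under slurp/, fsc/,
# espnet-slu-snips/, catslu/, and grabo/ are present alongside this file):
#   python app.py
# Gradio prints a local URL once the interface is up; share=True can be
# passed to launch() for a temporary public link.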