Spaces:
Runtime error
Runtime error
Change app.py
Browse files- .DS_Store +0 -0
- app.py +81 -79
- audio--1504190171-headset.flac → audio_slurp.flac +0 -0
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
|
|
app.py
CHANGED
@@ -17,99 +17,101 @@ speech2text = Speech2Text.from_pretrained(
|
|
17 |
)
|
18 |
# Confirm the sampling rate is equal to that of the training corpus.
|
19 |
# If not, you need to resample the audio data before inputting to speech2text
|
20 |
-
speech, rate = soundfile.read("audio--1504190171-headset.flac")
|
21 |
-
nbests = speech2text(speech)
|
22 |
|
23 |
-
text, *_ = nbests[0]
|
24 |
-
print(text)
|
25 |
-
exit()
|
26 |
|
27 |
-
text2speechen = Text2Speech.from_pretrained(
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
)
|
45 |
|
46 |
|
47 |
-
tagjp = 'kan-bayashi/jsut_full_band_vits_prosody'
|
48 |
-
vocoder_tagjp = 'none'
|
49 |
|
50 |
-
text2speechjp = Text2Speech.from_pretrained(
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
)
|
68 |
|
69 |
-
tagch = 'kan-bayashi/csmsc_full_band_vits'
|
70 |
-
vocoder_tagch = "none"
|
71 |
|
72 |
-
text2speechch = Text2Speech.from_pretrained(
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
)
|
90 |
|
91 |
-
def inference(
|
92 |
with torch.no_grad():
|
93 |
if lang == "english":
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
|
|
106 |
|
107 |
-
examples=[['
|
108 |
|
|
|
109 |
gr.Interface(
|
110 |
inference,
|
111 |
-
[gr.inputs.
|
112 |
-
gr.outputs.
|
113 |
title=title,
|
114 |
description=description,
|
115 |
article=article,
|
|
|
17 |
)
|
18 |
# Confirm the sampling rate is equal to that of the training corpus.
|
19 |
# If not, you need to resample the audio data before inputting to speech2text
|
20 |
+
# speech, rate = soundfile.read("audio--1504190171-headset.flac")
|
21 |
+
# nbests = speech2text(speech)
|
22 |
|
23 |
+
# text, *_ = nbests[0]
|
24 |
+
# print(text)
|
25 |
+
# exit()
|
26 |
|
27 |
+
# text2speechen = Text2Speech.from_pretrained(
|
28 |
+
# model_tag=str_or_none(tagen),
|
29 |
+
# vocoder_tag=str_or_none(vocoder_tagen),
|
30 |
+
# device="cpu",
|
31 |
+
# # Only for Tacotron 2 & Transformer
|
32 |
+
# threshold=0.5,
|
33 |
+
# # Only for Tacotron 2
|
34 |
+
# minlenratio=0.0,
|
35 |
+
# maxlenratio=10.0,
|
36 |
+
# use_att_constraint=False,
|
37 |
+
# backward_window=1,
|
38 |
+
# forward_window=3,
|
39 |
+
# # Only for FastSpeech & FastSpeech2 & VITS
|
40 |
+
# speed_control_alpha=1.0,
|
41 |
+
# # Only for VITS
|
42 |
+
# noise_scale=0.333,
|
43 |
+
# noise_scale_dur=0.333,
|
44 |
+
# )
|
45 |
|
46 |
|
47 |
+
# tagjp = 'kan-bayashi/jsut_full_band_vits_prosody'
|
48 |
+
# vocoder_tagjp = 'none'
|
49 |
|
50 |
+
# text2speechjp = Text2Speech.from_pretrained(
|
51 |
+
# model_tag=str_or_none(tagjp),
|
52 |
+
# vocoder_tag=str_or_none(vocoder_tagjp),
|
53 |
+
# device="cpu",
|
54 |
+
# # Only for Tacotron 2 & Transformer
|
55 |
+
# threshold=0.5,
|
56 |
+
# # Only for Tacotron 2
|
57 |
+
# minlenratio=0.0,
|
58 |
+
# maxlenratio=10.0,
|
59 |
+
# use_att_constraint=False,
|
60 |
+
# backward_window=1,
|
61 |
+
# forward_window=3,
|
62 |
+
# # Only for FastSpeech & FastSpeech2 & VITS
|
63 |
+
# speed_control_alpha=1.0,
|
64 |
+
# # Only for VITS
|
65 |
+
# noise_scale=0.333,
|
66 |
+
# noise_scale_dur=0.333,
|
67 |
+
# )
|
68 |
|
69 |
+
# tagch = 'kan-bayashi/csmsc_full_band_vits'
|
70 |
+
# vocoder_tagch = "none"
|
71 |
|
72 |
+
# text2speechch = Text2Speech.from_pretrained(
|
73 |
+
# model_tag=str_or_none(tagch),
|
74 |
+
# vocoder_tag=str_or_none(vocoder_tagch),
|
75 |
+
# device="cpu",
|
76 |
+
# # Only for Tacotron 2 & Transformer
|
77 |
+
# threshold=0.5,
|
78 |
+
# # Only for Tacotron 2
|
79 |
+
# minlenratio=0.0,
|
80 |
+
# maxlenratio=10.0,
|
81 |
+
# use_att_constraint=False,
|
82 |
+
# backward_window=1,
|
83 |
+
# forward_window=3,
|
84 |
+
# # Only for FastSpeech & FastSpeech2 & VITS
|
85 |
+
# speed_control_alpha=1.0,
|
86 |
+
# # Only for VITS
|
87 |
+
# noise_scale=0.333,
|
88 |
+
# noise_scale_dur=0.333,
|
89 |
+
# )
|
90 |
|
91 |
+
def inference(wav, lang):
    """Transcribe the submitted audio with the pretrained ``speech2text`` model.

    Args:
        wav: Audio handed over by the Gradio audio input. Either a path to an
            audio file, or a ``(sample_rate, samples)`` pair -- presumably
            what the legacy ``gr.inputs.Audio`` component delivers by
            default; TODO confirm against the installed Gradio version.
        lang: Language selected in the UI. Only ``"english"`` is supported.

    Returns:
        The text of the top-ranked hypothesis returned by ``speech2text``.

    Raises:
        ValueError: If ``lang`` is anything other than ``"english"``.
    """
    # Fail with a clear message instead of hitting an unbound ``text`` below.
    if lang != "english":
        raise ValueError(f"Unsupported language: {lang!r}")

    # Use the audio the user actually submitted rather than a hard-coded file
    # name (the previously hard-coded file is renamed in this change).
    if isinstance(wav, str):
        speech, _rate = soundfile.read(wav)
    else:
        _rate, speech = wav
        if speech.dtype.kind == "i":
            # Signed integer PCM -> floats in [-1, 1), the range that
            # soundfile.read() produces for the file-path case.
            full_scale = float(2 ** (8 * speech.dtype.itemsize - 1))
            speech = speech.astype("float32") / full_scale

    # NOTE(review): no resampling is done here; the sampling rate must match
    # the training corpus of the model -- confirm for recorded input.
    with torch.no_grad():
        nbests = speech2text(speech)

    text, *_ = nbests[0]
    return text
|
104 |
+
# Static text for the Gradio interface.
title = "ESPnet2-SLU"
description = "Gradio demo for ESPnet2-SLU: Extending the Edge of SLU Research. To use it, simply record your audio. Read more at the links below."
article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"

# One example row: [audio file, language]. The file name must match the
# bundled sample, which is stored as audio_slurp.flac.
examples = [["audio_slurp.flac", "english"]]
|
109 |
|
110 |
+
# gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
|
111 |
gr.Interface(
|
112 |
inference,
|
113 |
+
[gr.inputs.Audio(label="input audio"),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")],
|
114 |
+
gr.outputs.Textbox(type="str", label="Output"),
|
115 |
title=title,
|
116 |
description=description,
|
117 |
article=article,
|
audio--1504190171-headset.flac → audio_slurp.flac
RENAMED
File without changes
|