ssiidd committed on
Commit d8d2ace
1 Parent(s): 65191fa

Change app.py

.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -17,99 +17,101 @@ speech2text = Speech2Text.from_pretrained(
  )
  # Confirm the sampling rate is equal to that of the training corpus.
  # If not, you need to resample the audio data before inputting to speech2text
- speech, rate = soundfile.read("audio--1504190171-headset.flac")
- nbests = speech2text(speech)

- text, *_ = nbests[0]
- print(text)
- exit()

- text2speechen = Text2Speech.from_pretrained(
-     model_tag=str_or_none(tagen),
-     vocoder_tag=str_or_none(vocoder_tagen),
-     device="cpu",
-     # Only for Tacotron 2 & Transformer
-     threshold=0.5,
-     # Only for Tacotron 2
-     minlenratio=0.0,
-     maxlenratio=10.0,
-     use_att_constraint=False,
-     backward_window=1,
-     forward_window=3,
-     # Only for FastSpeech & FastSpeech2 & VITS
-     speed_control_alpha=1.0,
-     # Only for VITS
-     noise_scale=0.333,
-     noise_scale_dur=0.333,
- )


- tagjp = 'kan-bayashi/jsut_full_band_vits_prosody'
- vocoder_tagjp = 'none'

- text2speechjp = Text2Speech.from_pretrained(
-     model_tag=str_or_none(tagjp),
-     vocoder_tag=str_or_none(vocoder_tagjp),
-     device="cpu",
-     # Only for Tacotron 2 & Transformer
-     threshold=0.5,
-     # Only for Tacotron 2
-     minlenratio=0.0,
-     maxlenratio=10.0,
-     use_att_constraint=False,
-     backward_window=1,
-     forward_window=3,
-     # Only for FastSpeech & FastSpeech2 & VITS
-     speed_control_alpha=1.0,
-     # Only for VITS
-     noise_scale=0.333,
-     noise_scale_dur=0.333,
- )

- tagch = 'kan-bayashi/csmsc_full_band_vits'
- vocoder_tagch = "none"

- text2speechch = Text2Speech.from_pretrained(
-     model_tag=str_or_none(tagch),
-     vocoder_tag=str_or_none(vocoder_tagch),
-     device="cpu",
-     # Only for Tacotron 2 & Transformer
-     threshold=0.5,
-     # Only for Tacotron 2
-     minlenratio=0.0,
-     maxlenratio=10.0,
-     use_att_constraint=False,
-     backward_window=1,
-     forward_window=3,
-     # Only for FastSpeech & FastSpeech2 & VITS
-     speed_control_alpha=1.0,
-     # Only for VITS
-     noise_scale=0.333,
-     noise_scale_dur=0.333,
- )

- def inference(text, lang):
      with torch.no_grad():
          if lang == "english":
-             wav = text2speechen(text)["wav"]
-             scipy.io.wavfile.write("out.wav", text2speechen.fs, wav.view(-1).cpu().numpy())
-         if lang == "chinese":
-             wav = text2speechch(text)["wav"]
-             scipy.io.wavfile.write("out.wav", text2speechch.fs, wav.view(-1).cpu().numpy())
-         if lang == "japanese":
-             wav = text2speechjp(text)["wav"]
-             scipy.io.wavfile.write("out.wav", text2speechjp.fs, wav.view(-1).cpu().numpy())
-     return "out.wav"
- title = "ESPnet2-TTS"
- description = "Gradio demo for ESPnet2-TTS: Extending the Edge of TTS Research. To use it, simply add your audio, or click one of the examples to load them. Read more at the links below."
- article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2110.07840' target='_blank'>ESPnet2-TTS: Extending the Edge of TTS Research</a> | <a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"

- examples=[['This paper describes ESPnet2-TTS, an end-to-end text-to-speech (E2E-TTS) toolkit. ESPnet2-TTS extends our earlier version, ESPnet-TTS, by adding many new features, including: on-the-fly flexible pre-processing, joint training with neural vocoders, and state-of-the-art TTS models with extensions like full-band E2E text-to-waveform modeling, which simplify the training pipeline and further enhance TTS performance. The unified design of our recipes enables users to quickly reproduce state-of-the-art E2E-TTS results',"english"],['レシピの統一された設計により、ユーザーは最先端のE2E-TTSの結果をすばやく再現できます。また、推論用の統合Pythonインターフェースで事前にトレーニングされたモデルを多数提供し、ユーザーがベースラインサンプルを生成してデモを構築するための迅速な手段を提供します。',"japanese"],['对英语和日语语料库的实验评估表明,我们提供的模型合成了与真实情况相当的话语,达到了最先进的水平',"chinese"]]

  gr.Interface(
      inference,
-     [gr.inputs.Textbox(label="input text", lines=10), gr.inputs.Radio(choices=["english", "chinese", "japanese"], type="value", default="english", label="language")],
-     gr.outputs.Audio(type="file", label="Output"),
      title=title,
      description=description,
      article=article,
 
  )
  # Confirm the sampling rate is equal to that of the training corpus.
  # If not, you need to resample the audio data before inputting to speech2text
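+ # Hypothetical resampling sketch (librosa is an assumption, not imported in this
+ # app): if `rate` differs from the model's training rate (commonly 16 kHz for
+ # ASR/SLU corpora), resample before decoding:
+ #   import librosa
+ #   speech = librosa.resample(speech, orig_sr=rate, target_sr=16000)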
+ # speech, rate = soundfile.read("audio--1504190171-headset.flac")
+ # nbests = speech2text(speech)

+ # text, *_ = nbests[0]
+ # print(text)
+ # exit()

+ # text2speechen = Text2Speech.from_pretrained(
+ #     model_tag=str_or_none(tagen),
+ #     vocoder_tag=str_or_none(vocoder_tagen),
+ #     device="cpu",
+ #     # Only for Tacotron 2 & Transformer
+ #     threshold=0.5,
+ #     # Only for Tacotron 2
+ #     minlenratio=0.0,
+ #     maxlenratio=10.0,
+ #     use_att_constraint=False,
+ #     backward_window=1,
+ #     forward_window=3,
+ #     # Only for FastSpeech & FastSpeech2 & VITS
+ #     speed_control_alpha=1.0,
+ #     # Only for VITS
+ #     noise_scale=0.333,
+ #     noise_scale_dur=0.333,
+ # )


+ # tagjp = 'kan-bayashi/jsut_full_band_vits_prosody'
+ # vocoder_tagjp = 'none'

+ # text2speechjp = Text2Speech.from_pretrained(
+ #     model_tag=str_or_none(tagjp),
+ #     vocoder_tag=str_or_none(vocoder_tagjp),
+ #     device="cpu",
+ #     # Only for Tacotron 2 & Transformer
+ #     threshold=0.5,
+ #     # Only for Tacotron 2
+ #     minlenratio=0.0,
+ #     maxlenratio=10.0,
+ #     use_att_constraint=False,
+ #     backward_window=1,
+ #     forward_window=3,
+ #     # Only for FastSpeech & FastSpeech2 & VITS
+ #     speed_control_alpha=1.0,
+ #     # Only for VITS
+ #     noise_scale=0.333,
+ #     noise_scale_dur=0.333,
+ # )

+ # tagch = 'kan-bayashi/csmsc_full_band_vits'
+ # vocoder_tagch = "none"

+ # text2speechch = Text2Speech.from_pretrained(
+ #     model_tag=str_or_none(tagch),
+ #     vocoder_tag=str_or_none(vocoder_tagch),
+ #     device="cpu",
+ #     # Only for Tacotron 2 & Transformer
+ #     threshold=0.5,
+ #     # Only for Tacotron 2
+ #     minlenratio=0.0,
+ #     maxlenratio=10.0,
+ #     use_att_constraint=False,
+ #     backward_window=1,
+ #     forward_window=3,
+ #     # Only for FastSpeech & FastSpeech2 & VITS
+ #     speed_control_alpha=1.0,
+ #     # Only for VITS
+ #     noise_scale=0.333,
+ #     noise_scale_dur=0.333,
+ # )

+ def inference(wav, lang):
      with torch.no_grad():
          if lang == "english":
+             # Read the uploaded/recorded clip passed in by Gradio instead of the
+             # old hard-coded file, which this commit renames to audio_slurp.flac.
+             speech, rate = soundfile.read(wav)
+             nbests = speech2text(speech)
+             text, *_ = nbests[0]
+         # if lang == "chinese":
+         #     wav = text2speechch(text)["wav"]
+         #     scipy.io.wavfile.write("out.wav", text2speechch.fs, wav.view(-1).cpu().numpy())
+         # if lang == "japanese":
+         #     wav = text2speechjp(text)["wav"]
+         #     scipy.io.wavfile.write("out.wav", text2speechjp.fs, wav.view(-1).cpu().numpy())
+     return text
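+ # Note: speech2text returns a list of n-best hypotheses, each unpacking as
+ # (text, tokens, token_ids, hypothesis); nbests[0] above is the 1-best result.
+ # Illustrative sketch only, e.g. to inspect the top three:
+ #   for text, *_ in nbests[:3]:
+ #       print(text)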
+ title = "ESPnet2-SLU"
+ description = "Gradio demo for ESPnet2-SLU: Extending the Edge of SLU Research. To use it, simply record your audio or click the example to load it. Read more at the link below."
+ article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"

+ examples=[['audio_slurp.flac',"english"]]

+ # gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
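+ # Each `examples` row maps positionally to the inputs below: [audio filepath, language].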
  gr.Interface(
      inference,
+     # type="filepath" gives the handler a path it can pass to soundfile.read
+     [gr.inputs.Audio(type="filepath", label="input audio"), gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")],
+     gr.outputs.Textbox(type="str", label="Output"),
      title=title,
      description=description,
      article=article,
audio--1504190171-headset.flac → audio_slurp.flac RENAMED
File without changes