Kevin676 committed on
Commit
6004618
·
1 Parent(s): 4817bcc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -7
app.py CHANGED
@@ -1,5 +1,22 @@
1
-
 
 
 
 
 
 
 
2
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
3
 
4
  import re
5
  import random
@@ -14,6 +31,20 @@ from encoder import inference as encoder
14
  from vocoder.hifigan import inference as gan_vocoder
15
  from synthesizer.inference import Synthesizer
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  class Mandarin:
18
  def __init__(self):
19
  self.encoder_path = "encoder/saved_models/pretrained.pt"
@@ -55,25 +86,84 @@ class Mandarin:
55
 
56
  return wav, sample_rate
57
 
58
- def greet(audio, text, voice=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  if voice is None:
61
  voice = Mandarin()
62
- voice.setVoice(audio.name)
63
  voice.say("加载成功")
64
- wav, sample_rate = voice.say(text)
65
 
66
  output_file = "".join( random.sample(string.ascii_lowercase + string.digits, 11) ) + ".wav"
67
 
68
  write(output_file, sample_rate, wav.astype(np.float32))
69
 
70
- return output_file, voice
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  def main():
73
  gr.Interface(
74
  fn=greet,
75
- inputs=[gr.inputs.Audio(type="file"),"text", "state"],
76
- outputs=[gr.outputs.Audio(type="file"), "state"]
 
 
 
 
 
 
 
 
77
  ).launch()
78
 
79
  if __name__=="__main__":
 
1
+ from TTS.api import TTS
2
+ tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
3
+ import whisper
4
+ model = whisper.load_model("small")
5
+ import os
6
+ os.system('pip install voicefixer --upgrade')
7
+ from voicefixer import VoiceFixer
8
+ voicefixer = VoiceFixer()
9
  import gradio as gr
10
+ import openai
11
+ import torch
12
+ import torchaudio
13
+ from speechbrain.pretrained import SpectralMaskEnhancement
14
+
15
+ enhance_model = SpectralMaskEnhancement.from_hparams(
16
+ source="speechbrain/metricgan-plus-voicebank",
17
+ savedir="pretrained_models/metricgan-plus-voicebank",
18
+ run_opts={"device":"cuda"},
19
+ )
20
 
21
  import re
22
  import random
 
31
  from vocoder.hifigan import inference as gan_vocoder
32
  from synthesizer.inference import Synthesizer
33
 
34
# System-prompt templates for the three selectable chat personas.
# Each is a ChatCompletion `messages` seed: a single system message.

# "TOEFL" persona: an examiner giving spoken-English feedback.
# (Fixed "Englsih" -> "English": this string is sent verbatim to the model.)
mes1 = [
    {"role": "system", "content": "You are a TOEFL examiner. Help me improve my oral English and give me feedback."}
]

# "Therapist" persona.
mes2 = [
    {"role": "system", "content": "You are a mental health therapist. Your name is Tina."}
]

# "Alice" personal-assistant persona.
mes3 = [
    {"role": "system", "content": "You are my personal assistant. Your name is Alice."}
]

# Running log of Whisper transcripts, appended to on every request.
res = []
48
  class Mandarin:
49
  def __init__(self):
50
  self.encoder_path = "encoder/saved_models/pretrained.pt"
 
86
 
87
  return wav, sample_rate
88
 
89
def greet(apikey, upload, audio, choice1, voice=None):
    """Transcribe the user's speech, get a ChatGPT reply, and speak it back
    in the user's uploaded (cloned) voice.

    Parameters
    ----------
    apikey : str
        OpenAI API key for this request.
    upload : str
        Filepath of the reference wav used as the voice-cloning target.
    audio : str
        Filepath of the recorded user audio to transcribe.
    choice1 : str
        Persona selector: "TOEFL", "Therapist" or "Alice".
    voice : Mandarin | None
        Cached synthesizer carried in Gradio state; built on first call.

    Returns
    -------
    list
        [transcript, chatgpt_reply, path_to_enhanced_wav, voice_state]
    """
    openai.api_key = apikey

    # --- speech to text (Whisper) ---
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language (informational only)
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # decode the audio
    result = whisper.decode(model, mel, whisper.DecodingOptions())
    res.append(result.text)  # kept for backward compatibility with module-level log

    # --- ChatGPT ---
    # Copy the persona template instead of aliasing it: appending to the
    # module-level list would permanently mutate it, leaking conversation
    # history across calls (and across concurrent users) and growing the
    # prompt without bound.
    personas = {"TOEFL": mes1, "Therapist": mes2, "Alice": mes3}
    # Fall back to the assistant persona if no radio choice was made,
    # instead of raising NameError on an unbound `messages`.
    messages = list(personas.get(choice1, mes3))
    messages.append({"role": "user", "content": result.text})

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    chat_response = completion.choices[0].message.content
    messages.append({"role": "assistant", "content": chat_response})

    # --- text-to-speech in the cloned voice ---
    if voice is None:
        voice = Mandarin()
        voice.setVoice(upload)
        voice.say("加载成功")
    wav, sample_rate = voice.say(chat_response)

    # Random 11-char filename so concurrent requests don't clobber each other.
    output_file = "".join(random.sample(string.ascii_lowercase + string.digits, 11)) + ".wav"
    write(output_file, sample_rate, wav.astype(np.float32))

    # --- post-processing: artifact restoration, then spectral enhancement ---
    voicefixer.restore(input=output_file,   # input wav file path
                       output="audio1.wav", # output wav file path
                       cuda=True,           # whether to use gpu acceleration
                       mode=0)              # modes 0/1/2 trade off restoration style

    noisy = enhance_model.load_audio(
        "audio1.wav"
    ).unsqueeze(0)

    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

    return [result.text, chat_response, "enhanced.wav", voice]
153
 
154
def main():
    """Build the Gradio interface around greet() and launch it."""
    input_components = [
        gr.Textbox(lines=1, label="请填写您的OpenAI-API-key"),
        gr.Audio(source="upload", label="请上传您喜欢的声音(wav文件)", type="filepath"),
        gr.Audio(source="microphone", label="和您的专属AI聊天吧!", type="filepath"),
        gr.Radio(["TOEFL", "Therapist", "Alice"], label="TOEFL Examiner, Therapist Tina, or Assistant Alice?"),
        gr.State([]),
    ]
    output_components = [
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        gr.Audio(label="Audio with Custom Voice"),
        gr.State([]),
    ]
    demo = gr.Interface(fn=greet, inputs=input_components, outputs=output_components)
    demo.launch()
168
 
169
  if __name__=="__main__":