"""Gradio app: talk to ChatGPT and hear its reply in a cloned voice.

Pipeline: microphone audio -> Whisper (speech-to-text) -> ChatGPT ->
voice-cloning TTS (speaker encoder + synthesizer + HiFi-GAN vocoder) ->
VoiceFixer restoration -> MetricGAN+ enhancement -> enhanced wav in the UI.
"""

from TTS.api import TTS
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)  # YourTTS model (loaded here but not referenced again in this script)

import whisper
model = whisper.load_model("small")  # Whisper ASR model for speech-to-text

import os
os.system('pip install voicefixer --upgrade')  # install/upgrade voicefixer at runtime before importing it
from voicefixer import VoiceFixer
voicefixer = VoiceFixer()

import gradio as gr
import openai
import torch
import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement

# MetricGAN+ speech enhancement model (runs on GPU)
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
    run_opts={"device": "cuda"},
)

import re
import random
import string
import librosa
import numpy as np
from pathlib import Path
from scipy.io.wavfile import write

# Voice-cloning components: speaker encoder, synthesizer, and HiFi-GAN vocoder
from encoder import inference as encoder
from vocoder.hifigan import inference as gan_vocoder
from synthesizer.inference import Synthesizer

# System prompts for the three assistant personas
mes1 = [
    {"role": "system", "content": "You are a TOEFL examiner. Help me improve my oral English and give me feedback."}
]
mes2 = [
    {"role": "system", "content": "You are a mental health therapist. Your name is Tina."}
]
mes3 = [
    {"role": "system", "content": "You are my personal assistant. Your name is Alice."}
]

res = []  # transcripts of the user's spoken input, one per call


class Mandarin:
    """Voice cloning: embed a reference voice, then synthesize speech in that voice."""

    def __init__(self):
        self.encoder_path = "encoder/saved_models/pretrained.pt"
        self.vocoder_path = "vocoder/saved_models/pretrained/g_hifigan.pt"
        self.config_fpath = "vocoder/hifigan/config_16k_.json"
        self.accent = "synthesizer/saved_models/普通话.pt"  # Mandarin ("普通话") synthesizer checkpoint

        synthesizers_cache = {}
        if synthesizers_cache.get(self.accent) is None:
            self.current_synt = Synthesizer(Path(self.accent))
            synthesizers_cache[self.accent] = self.current_synt
        else:
            self.current_synt = synthesizers_cache[self.accent]

        encoder.load_model(Path(self.encoder_path))
        gan_vocoder.load_model(Path(self.vocoder_path), self.config_fpath)

    def setVoice(self, timbre):
        # Embed the reference recording so synthesized speech matches its timbre
        self.timbre = timbre
        wav, sample_rate = librosa.load(self.timbre)
        encoder_wav = encoder.preprocess_wav(wav, sample_rate)
        self.embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    def say(self, text):
        # Split the text at newlines and punctuation so each clause is synthesized separately
        texts = filter(None, text.split("\n"))
        punctuation = "!,。、?!,.?::"
        processed_texts = []
        for line in texts:
            for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', line).split('\n'):
                if processed_text:
                    processed_texts.append(processed_text.strip())
        texts = processed_texts

        # Synthesize mel spectrograms with the cloned-voice embedding, then vocode to a waveform
        embeds = [self.embed] * len(texts)
        specs = self.current_synt.synthesize_spectrograms(texts, embeds)
        spec = np.concatenate(specs, axis=1)
        wav, sample_rate = gan_vocoder.infer_waveform(spec)
        return wav, sample_rate


def greet(apikey, upload, audio, choice1, voice=None):
    openai.api_key = apikey

    # Load the recorded audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Make a log-Mel spectrogram and move it to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # Decode the audio to text
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    res.append(result.text)

    # Pick the persona's message history
    if choice1 == "TOEFL":
        messages = mes1
    elif choice1 == "Therapist":
        messages = mes2
    elif choice1 == "Alice":
        messages = mes3

    # Send the latest transcript to ChatGPT
    content = res[-1]
    messages.append({"role": "user", "content": content})
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )
    chat_response = completion.choices[0].message.content
    messages.append({"role": "assistant", "content": chat_response})

    # On the first call the gr.State value arrives as an empty list, so build the cloned voice now
    if not isinstance(voice, Mandarin):
        voice = Mandarin()
        voice.setVoice(upload)
        voice.say("加载成功")  # warm-up utterance ("loaded successfully")

    # Synthesize the reply in the cloned voice and write it to a randomly named wav file
    wav, sample_rate = voice.say(chat_response)
    output_file = "".join(random.sample(string.ascii_lowercase + string.digits, 11)) + ".wav"
    write(output_file, sample_rate, wav.astype(np.float32))

    # Restore the synthesized audio with VoiceFixer
    voicefixer.restore(input=output_file,    # input wav file path
                       output="audio1.wav",  # output wav file path
                       cuda=True,            # whether to use GPU acceleration
                       mode=0)               # try mode 0, 1, or 2 to find the best result

    # Enhance the restored audio with MetricGAN+ and save the final 16 kHz wav
    noisy = enhance_model.load_audio("audio1.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

    return [result.text, chat_response, "enhanced.wav", voice]


def main():
    gr.Interface(
        fn=greet,
        inputs=[
            gr.Textbox(lines=1, label="请填写您的OpenAI-API-key"),                            # "Please enter your OpenAI API key"
            gr.Audio(source="upload", label="请上传您喜欢的声音(wav文件)", type="filepath"),    # "Please upload a voice you like (wav file)"
            gr.Audio(source="microphone", label="和您的专属AI聊天吧!", type="filepath"),        # "Chat with your personal AI!"
            gr.Radio(["TOEFL", "Therapist", "Alice"], label="TOEFL Examiner, Therapist Tina, or Assistant Alice?"),
            gr.State([]),  # carries the Mandarin voice object between calls
        ],
        outputs=[
            gr.Textbox(label="Speech to Text"),
            gr.Textbox(label="ChatGPT Output"),
            gr.Audio(label="Audio with Custom Voice"),
            gr.State([]),
        ],
    ).launch()


if __name__ == "__main__":
    main()