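# app.py - voice-cloning chat demo for a Hugging Face Space.
# Pipeline: Whisper transcribes the user's microphone audio, ChatGPT generates
# a reply, a MockingBird-style encoder/synthesizer/vocoder stack (assumed from
# the encoder/, synthesizer/, vocoder/ module layout) speaks the reply in the
# uploaded reference voice, then VoiceFixer restores and SpeechBrain
# MetricGAN+ enhances the result.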
from TTS.api import TTS
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)  # loaded but not used below

import whisper
model = whisper.load_model("small")  # multilingual Whisper checkpoint

import os
os.system('pip install voicefixer --upgrade')  # runtime install (Hugging Face Spaces workaround)
from voicefixer import VoiceFixer
voicefixer = VoiceFixer()
import gradio as gr
import openai
import torch
import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
    run_opts={"device": "cuda"},
)
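# Note: MetricGAN+ was trained on 16 kHz VoiceBank-DEMAND, which is why the
# enhancement step below saves its output at 16 kHz.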
import re
import random
import string
import librosa
import numpy as np
from pathlib import Path
from scipy.io.wavfile import write
from encoder import inference as encoder
from vocoder.hifigan import inference as gan_vocoder
from synthesizer.inference import Synthesizer
mes1 = [
    {"role": "system", "content": "You are a TOEFL examiner. Help me improve my oral English and give me feedback."}
]
mes2 = [
    {"role": "system", "content": "You are a mental health therapist. Your name is Tina."}
]
mes3 = [
    {"role": "system", "content": "You are my personal assistant. Your name is Alice."}
]
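# These persona prompts are module-level lists; greet() appends to whichever
# one is selected, so conversation history accumulates across calls (and is
# shared by all users of the Space).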
res = []  # running transcript history, shared across calls
class Mandarin:
    synthesizers_cache = {}  # class-level, so the synthesizer is loaded only once

    def __init__(self):
        self.encoder_path = "encoder/saved_models/pretrained.pt"
        self.vocoder_path = "vocoder/saved_models/pretrained/g_hifigan.pt"
        self.config_fpath = "vocoder/hifigan/config_16k_.json"
        self.accent = "synthesizer/saved_models/普通话.pt"  # "普通话" = Mandarin

        if Mandarin.synthesizers_cache.get(self.accent) is None:
            self.current_synt = Synthesizer(Path(self.accent))
            Mandarin.synthesizers_cache[self.accent] = self.current_synt
        else:
            self.current_synt = Mandarin.synthesizers_cache[self.accent]

        encoder.load_model(Path(self.encoder_path))
        gan_vocoder.load_model(Path(self.vocoder_path), self.config_fpath)
    def setVoice(self, timbre):
        self.timbre = timbre
        wav, sample_rate = librosa.load(self.timbre)
        encoder_wav = encoder.preprocess_wav(wav, sample_rate)
        self.embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
    def say(self, text):
        texts = filter(None, text.split("\n"))
        punctuation = "!,。、?!,.?::"  # split on both CJK and ASCII punctuation
        processed_texts = []
        for text in texts:
            for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
                if processed_text:
                    processed_texts.append(processed_text.strip())
        texts = processed_texts
        embeds = [self.embed] * len(texts)
        specs = self.current_synt.synthesize_spectrograms(texts, embeds)
        spec = np.concatenate(specs, axis=1)
        wav, sample_rate = gan_vocoder.infer_waveform(spec)
        return wav, sample_rate
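# Minimal usage sketch for Mandarin (illustration only; "reference.wav" is a
# hypothetical sample, and the checkpoint paths above must already exist):
#   voice = Mandarin()
#   voice.setVoice("reference.wav")
#   wav, sr = voice.say("你好,世界")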
def greet(apikey, upload, audio, choice1, voice=None):
    openai.api_key = apikey

    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # decode the audio
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    res.append(result.text)
    if choice1 == "TOEFL":
        messages = mes1
    elif choice1 == "Therapist":
        messages = mes2
    else:  # "Alice", and a safe default if no persona was selected
        messages = mes3

    # chatgpt
    content = res[-1]
    messages.append({"role": "user", "content": content})

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )

    chat_response = completion.choices[0].message.content
    messages.append({"role": "assistant", "content": chat_response})
    # gr.State's initial value below is an empty list, not None, so check the
    # type rather than testing for None
    if not isinstance(voice, Mandarin):
        voice = Mandarin()
        voice.setVoice(upload)
        voice.say("加载成功")  # warm-up utterance: "loaded successfully"

    wav, sample_rate = voice.say(chat_response)

    output_file = "".join(random.sample(string.ascii_lowercase + string.digits, 11)) + ".wav"
    write(output_file, sample_rate, wav.astype(np.float32))

    voicefixer.restore(input=output_file,   # input wav file path
                       output="audio1.wav", # output wav file path
                       cuda=True,           # whether to use gpu acceleration
                       mode=0)              # try mode 0, 1, or 2 for the best result

    noisy = enhance_model.load_audio(
        "audio1.wav"
    ).unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

    return [result.text, chat_response, "enhanced.wav", voice]
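# The trailing gr.State in outputs receives the Mandarin instance returned by
# greet(), so the cloned voice persists between calls instead of reloading
# the models on every turn.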
def main():
    gr.Interface(
        fn=greet,
        inputs=[
            gr.Textbox(lines=1, label="Enter your OpenAI API key"),
            gr.Audio(source="upload", label="Upload a voice you like (WAV file)", type="filepath"),
            gr.Audio(source="microphone", label="Chat with your personal AI!", type="filepath"),
            gr.Radio(["TOEFL", "Therapist", "Alice"], label="TOEFL Examiner, Therapist Tina, or Assistant Alice?"),
            gr.State([]),
        ],
        outputs=[
            gr.Textbox(label="Speech to Text"),
            gr.Textbox(label="ChatGPT Output"),
            gr.Audio(label="Audio with Custom Voice"),
            gr.State([]),
        ],
    ).launch()
if __name__ == "__main__":
    main()
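# To run locally (a sketch; assumes the pretrained encoder, synthesizer, and
# vocoder checkpoints referenced above are already in their directories):
#   pip install TTS openai-whisper voicefixer speechbrain gradio openai librosa
#   python app.py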