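"""TalktoAI: voice chat with ChatGPT in a cloned voice.

Pipeline: Whisper transcribes the user's speech, ChatGPT (gpt-3.5-turbo) writes
a reply, a Mandarin voice-cloning stack speaks the reply in the user's chosen
timbre, and VoiceFixer plus a SpeechBrain enhancer clean up the generated audio.
"""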
import whisper
model = whisper.load_model("small")

import os
# Install VoiceFixer at runtime; a common workaround on Hugging Face Spaces
# when the package is not pinned in requirements.txt.
os.system('pip install voicefixer --upgrade')
from voicefixer import VoiceFixer
voicefixer = VoiceFixer()
import gradio as gr
import openai
import torch
import torchaudio
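
# SpeechBrain's MetricGAN+ model (trained on VoiceBank) for speech enhancement.
# Note: run_opts pins the model to CUDA, so this assumes a GPU is available.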
from speechbrain.pretrained import SpectralMaskEnhancement
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
    run_opts={"device": "cuda"},
)
import re
import random
import string
import librosa
import numpy as np
from pathlib import Path
from scipy.io.wavfile import write

# Local voice-cloning modules (speaker encoder, synthesizer, HiFi-GAN vocoder);
# this layout follows the MockingBird real-time voice-cloning project.
from encoder import inference as encoder
from vocoder.hifigan import inference as gan_vocoder
from synthesizer.inference import Synthesizer
# Conversation history seeded with the system prompt, plus a transcript log.
mes = [
    {"role": "system", "content": "You are my personal assistant. Respond to me only in Chinese."}
]
res = []
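
# Mandarin text-to-speech with voice cloning: a speaker encoder extracts an
# embedding of the reference voice, a synthesizer turns text plus embedding
# into mel spectrograms, and a HiFi-GAN vocoder renders the waveform.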
class Mandarin:
    # Class-level cache so repeated Mandarin() instances reuse loaded synthesizers
    # (a per-instance dict would never produce a cache hit).
    synthesizers_cache = {}

    def __init__(self):
        self.encoder_path = "encoder/saved_models/pretrained.pt"
        self.vocoder_path = "vocoder/saved_models/pretrained/g_hifigan.pt"
        self.config_fpath = "vocoder/hifigan/config_16k_.json"
        self.accent = "synthesizer/saved_models/普通话.pt"  # "Mandarin" synthesizer checkpoint

        if self.synthesizers_cache.get(self.accent) is None:
            self.current_synt = Synthesizer(Path(self.accent))
            self.synthesizers_cache[self.accent] = self.current_synt
        else:
            self.current_synt = self.synthesizers_cache[self.accent]

        encoder.load_model(Path(self.encoder_path))
        gan_vocoder.load_model(Path(self.vocoder_path), self.config_fpath)
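
    # Compute a speaker embedding from the reference clip; the embedding
    # conditions the synthesizer on the target timbre.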
    def setVoice(self, timbre):
        self.timbre = timbre
        wav, sample_rate = librosa.load(self.timbre)
        encoder_wav = encoder.preprocess_wav(wav, sample_rate)
        self.embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
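
    # Split the text at punctuation, synthesize a mel spectrogram per segment,
    # then vocode the concatenated spectrogram into one waveform.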
    def say(self, text):
        texts = filter(None, text.split("\n"))
        punctuation = "!,。、?!,.?::"  # punctuation marks used to split the text
        processed_texts = []
        for text in texts:
            for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
                if processed_text:
                    processed_texts.append(processed_text.strip())
        texts = processed_texts

        embeds = [self.embed] * len(texts)
        specs = self.current_synt.synthesize_spectrograms(texts, embeds)
        spec = np.concatenate(specs, axis=1)
        wav, sample_rate = gan_vocoder.infer_waveform(spec)
        return wav, sample_rate
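
# Main handler: transcribe the user's speech with Whisper, get a ChatGPT reply,
# speak it in the cloned voice, then repair and enhance the generated audio.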
def greet(apikey, upload, audio):
    openai.api_key = apikey

    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # decode the audio
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    res.append(result.text)

    # send the transcript to ChatGPT
    messages = mes  # note: aliases the global history, so context accumulates across turns
    content = res[-1]
    messages.append({"role": "user", "content": content})
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )
    chat_response = completion.choices[0].message.content
    messages.append({"role": "assistant", "content": chat_response})

    # clone the uploaded voice and speak the reply
    voice = Mandarin()
    voice.setVoice(upload)
    voice.say("加载成功")  # warm-up utterance: "loaded successfully"
    wav, sample_rate = voice.say(chat_response)

    # write the raw synthesis to a randomly named wav file
    output_file = "".join(random.sample(string.ascii_lowercase + string.digits, 11)) + ".wav"
    write(output_file, sample_rate, wav.astype(np.float32))

    # repair synthesis artifacts with VoiceFixer
    voicefixer.restore(input=output_file,    # input wav file path
                       output="audio1.wav",  # output wav file path
                       cuda=True,            # whether to use GPU acceleration
                       mode=0)               # modes 0, 1 and 2 use different restoration strategies

    # denoise the restored audio with the MetricGAN+ enhancer
    noisy = enhance_model.load_audio("audio1.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

    return [result.text, chat_response, "enhanced.wav"]
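
# Two identical pipelines; they differ only in how the reference voice is
# captured (file upload vs. microphone recording).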
c1 = gr.Interface(
    fn=greet,
    inputs=[
        # Labels (Chinese): "Please enter your OpenAI API key", "Please upload
        # a voice you like (wav file)", "Chat with your personal AI!"
        gr.Textbox(lines=1, label="请填写您的OpenAI-API-key", type="password"),
        gr.Audio(source="upload", label="请上传您喜欢的声音(wav文件)", type="filepath"),
        gr.Audio(source="microphone", label="和您的专属AI聊天吧!", type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        gr.Audio(label="Audio with Custom Voice"),
    ],
    #theme="huggingface",
    #title="🥳💬💕 - TalktoAI,随时随地,谈天说地!"
    # Description (Chinese): "TalktoAI: chat anytime, anywhere, about anything!
    # Let humane AI benefit everyone. AI for good!"
    description="🥳💬💕 - TalktoAI,随时随地,谈天说地! \n\n🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
)
c2 = gr.Interface(
    fn=greet,
    inputs=[
        # Same as above, except the reference voice is recorded with the
        # microphone ("record the voice you like, avoiding background noise").
        gr.Textbox(lines=1, label="请填写您的OpenAI-API-key", type="password"),
        gr.Audio(source="microphone", label="请上传您喜欢的声音,并尽量避免噪音", type="filepath"),
        gr.Audio(source="microphone", label="和您的专属AI聊天吧!", type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        gr.Audio(label="Audio with Custom Voice"),
    ],
    #theme="huggingface",
    #title="🥳💬💕 - TalktoAI,随时随地,谈天说地!"
    description="🥳💬💕 - TalktoAI,随时随地,谈天说地! \n\n🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
)
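
# Tab names (Chinese): "wav file upload" / "microphone recording".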
demo = gr.TabbedInterface([c1, c2], ["wav文件上传", "麦克风上传"])
demo.launch()