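"""TalktoAI: voice chat with ChatGPT in a cloned voice.

Pipeline: Whisper transcribes the user's speech, ChatGPT (gpt-3.5-turbo) writes
a reply, a Mandarin voice-cloning stack speaks the reply in the user's chosen
timbre, and VoiceFixer plus a SpeechBrain enhancer clean up the generated audio.
"""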
import whisper
model = whisper.load_model("small")

import os
# Install VoiceFixer at runtime; a common workaround on Hugging Face Spaces
# when the package is not pinned in requirements.txt.
os.system('pip install voicefixer --upgrade')
from voicefixer import VoiceFixer
voicefixer = VoiceFixer()
import gradio as gr
import openai
import torch
import torchaudio
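
# SpeechBrain's MetricGAN+ model (trained on VoiceBank) for speech enhancement.
# Note: run_opts pins the model to CUDA, so this assumes a GPU is available.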
from speechbrain.pretrained import SpectralMaskEnhancement
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
    run_opts={"device": "cuda"},
)
import re
import random
import string
import librosa
import numpy as np
from pathlib import Path
from scipy.io.wavfile import write

# Local voice-cloning modules (speaker encoder, synthesizer, HiFi-GAN vocoder);
# this layout follows the MockingBird real-time voice-cloning project.
from encoder import inference as encoder
from vocoder.hifigan import inference as gan_vocoder
from synthesizer.inference import Synthesizer
# Conversation history seeded with the system prompt, plus a transcript log.
mes = [
    {"role": "system", "content": "You are my personal assistant. Respond to me only in Chinese."}
]
res = []
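
# Mandarin text-to-speech with voice cloning: a speaker encoder extracts an
# embedding of the reference voice, a synthesizer turns text plus embedding
# into mel spectrograms, and a HiFi-GAN vocoder renders the waveform.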
class Mandarin:
    # Class-level cache so repeated Mandarin() instances reuse loaded synthesizers
    # (a per-instance dict would never produce a cache hit).
    synthesizers_cache = {}

    def __init__(self):
        self.encoder_path = "encoder/saved_models/pretrained.pt"
        self.vocoder_path = "vocoder/saved_models/pretrained/g_hifigan.pt"
        self.config_fpath = "vocoder/hifigan/config_16k_.json"
        self.accent = "synthesizer/saved_models/普通话.pt"  # "Mandarin" synthesizer checkpoint

        if self.synthesizers_cache.get(self.accent) is None:
            self.current_synt = Synthesizer(Path(self.accent))
            self.synthesizers_cache[self.accent] = self.current_synt
        else:
            self.current_synt = self.synthesizers_cache[self.accent]

        encoder.load_model(Path(self.encoder_path))
        gan_vocoder.load_model(Path(self.vocoder_path), self.config_fpath)
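
    # Compute a speaker embedding from the reference clip; the embedding
    # conditions the synthesizer on the target timbre.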
    def setVoice(self, timbre):
        self.timbre = timbre
        wav, sample_rate = librosa.load(self.timbre)
        encoder_wav = encoder.preprocess_wav(wav, sample_rate)
        self.embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
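
    # Split the text at punctuation, synthesize a mel spectrogram per segment,
    # then vocode the concatenated spectrogram into one waveform.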
    def say(self, text):
        texts = filter(None, text.split("\n"))
        punctuation = "!,。、?!,.?::"  # punctuation marks used to split the text
        processed_texts = []
        for text in texts:
            for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
                if processed_text:
                    processed_texts.append(processed_text.strip())
        texts = processed_texts

        embeds = [self.embed] * len(texts)
        specs = self.current_synt.synthesize_spectrograms(texts, embeds)
        spec = np.concatenate(specs, axis=1)
        wav, sample_rate = gan_vocoder.infer_waveform(spec)
        return wav, sample_rate
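
# Main handler: transcribe the user's speech with Whisper, get a ChatGPT reply,
# speak it in the cloned voice, then repair and enhance the generated audio.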
def greet(apikey, upload, audio):
    openai.api_key = apikey

    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # decode the audio
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    res.append(result.text)

    # send the transcript to ChatGPT
    messages = mes  # note: aliases the global history, so context accumulates across turns
    content = res[-1]
    messages.append({"role": "user", "content": content})
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )
    chat_response = completion.choices[0].message.content
    messages.append({"role": "assistant", "content": chat_response})

    # clone the uploaded voice and speak the reply
    voice = Mandarin()
    voice.setVoice(upload)
    voice.say("加载成功")  # warm-up utterance: "loaded successfully"
    wav, sample_rate = voice.say(chat_response)

    # write the raw synthesis to a randomly named wav file
    output_file = "".join(random.sample(string.ascii_lowercase + string.digits, 11)) + ".wav"
    write(output_file, sample_rate, wav.astype(np.float32))

    # repair synthesis artifacts with VoiceFixer
    voicefixer.restore(input=output_file,    # input wav file path
                       output="audio1.wav",  # output wav file path
                       cuda=True,            # whether to use GPU acceleration
                       mode=0)               # modes 0, 1 and 2 use different restoration strategies

    # denoise the restored audio with the MetricGAN+ enhancer
    noisy = enhance_model.load_audio("audio1.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

    return [result.text, chat_response, "enhanced.wav"]
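
# Two identical pipelines; they differ only in how the reference voice is
# captured (file upload vs. microphone recording).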
c1 = gr.Interface(
    fn=greet,
    inputs=[
        # Labels (Chinese): "Please enter your OpenAI API key", "Please upload
        # a voice you like (wav file)", "Chat with your personal AI!"
        gr.Textbox(lines=1, label="请填写您的OpenAI-API-key", type="password"),
        gr.Audio(source="upload", label="请上传您喜欢的声音(wav文件)", type="filepath"),
        gr.Audio(source="microphone", label="和您的专属AI聊天吧!", type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        gr.Audio(label="Audio with Custom Voice"),
    ],
    #theme="huggingface",
    #title="🥳💬💕 - TalktoAI,随时随地,谈天说地!"
    # Description (Chinese): "TalktoAI: chat anytime, anywhere, about anything!
    # Let humane AI benefit everyone. AI for good!"
    description="🥳💬💕 - TalktoAI,随时随地,谈天说地! \n\n🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
)
c2 = gr.Interface(
    fn=greet,
    inputs=[
        # Same as above, except the reference voice is recorded with the
        # microphone ("record the voice you like, avoiding background noise").
        gr.Textbox(lines=1, label="请填写您的OpenAI-API-key", type="password"),
        gr.Audio(source="microphone", label="请上传您喜欢的声音,并尽量避免噪音", type="filepath"),
        gr.Audio(source="microphone", label="和您的专属AI聊天吧!", type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        gr.Audio(label="Audio with Custom Voice"),
    ],
    #theme="huggingface",
    #title="🥳💬💕 - TalktoAI,随时随地,谈天说地!"
    description="🥳💬💕 - TalktoAI,随时随地,谈天说地! \n\n🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
)
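
# Tab names (Chinese): "wav file upload" / "microphone recording".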
demo = gr.TabbedInterface([c1, c2], ["wav文件上传", "麦克风上传"])
demo.launch()