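"""TalktoAI: a voice-chat Gradio app.

Pipeline per request: Whisper transcribes the user's recording, ChatGPT
generates a Chinese reply, a voice-cloning TTS stack (speaker encoder +
synthesizer + HiFi-GAN vocoder) speaks the reply in the user's chosen
voice, and VoiceFixer plus MetricGAN+ clean up the generated audio.
"""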
import os

# Install voicefixer at runtime before importing it (a common pattern on
# Hugging Face Spaces, where dependencies cannot always be pre-installed).
os.system('pip install voicefixer --upgrade')

import re
import random
import string
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
import openai
import torch
import torchaudio
import whisper
from scipy.io.wavfile import write
from speechbrain.pretrained import SpectralMaskEnhancement
from voicefixer import VoiceFixer

# Local voice-cloning modules (speaker encoder, synthesizer, HiFi-GAN vocoder).
from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder.hifigan import inference as gan_vocoder

# Speech-to-text model for transcribing the user's recording.
model = whisper.load_model("small")

# Post-processing models: VoiceFixer restores the synthesized speech and
# MetricGAN+ suppresses residual noise. Both expect a CUDA device.
voicefixer = VoiceFixer()
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
    run_opts={"device": "cuda"},
)
# Conversation state. `mes` seeds the ChatGPT dialogue with a system prompt
# and is mutated in place on every request; `res` accumulates transcriptions.
mes = [
    {"role": "system", "content": "You are my personal assistant. Respond to me only in Chinese."}
]
res = []
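
# Mandarin text-to-speech with voice cloning: a speaker encoder embeds a
# reference recording, a synthesizer turns text plus that embedding into a
# mel spectrogram, and a HiFi-GAN vocoder renders the waveform.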
class Mandarin:
    # Cache loaded synthesizers at class level so repeated instantiations
    # reuse the model instead of reloading it from disk.
    synthesizers_cache = {}

    def __init__(self):
        self.encoder_path = "encoder/saved_models/pretrained.pt"
        self.vocoder_path = "vocoder/saved_models/pretrained/g_hifigan.pt"
        self.config_fpath = "vocoder/hifigan/config_16k_.json"
        self.accent = "synthesizer/saved_models/普通话.pt"  # 普通话 = Mandarin

        if self.synthesizers_cache.get(self.accent) is None:
            self.current_synt = Synthesizer(Path(self.accent))
            self.synthesizers_cache[self.accent] = self.current_synt
        else:
            self.current_synt = self.synthesizers_cache[self.accent]

        encoder.load_model(Path(self.encoder_path))
        gan_vocoder.load_model(Path(self.vocoder_path), self.config_fpath)
    def setVoice(self, timbre):
        # Embed the reference recording with the speaker encoder.
        self.timbre = timbre
        wav, sample_rate = librosa.load(self.timbre)
        encoder_wav = encoder.preprocess_wav(wav, sample_rate)
        self.embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
    def say(self, text):
        # Split the text on newlines and punctuation into short, clean
        # utterances, which the synthesizer handles better than long passages.
        texts = filter(None, text.split("\n"))
        punctuation = "!,。、?!,.?::"
        processed_texts = []
        for text in texts:
            for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
                if processed_text:
                    processed_texts.append(processed_text.strip())
        texts = processed_texts

        # Synthesize one mel spectrogram per utterance with the reference
        # embedding, then vocode the concatenated spectrogram in one pass.
        embeds = [self.embed] * len(texts)
        specs = self.current_synt.synthesize_spectrograms(texts, embeds)
        spec = np.concatenate(specs, axis=1)
        wav, sample_rate = gan_vocoder.infer_waveform(spec)
        return wav, sample_rate
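
# End-to-end request handler: transcribe the recording with Whisper, get a
# ChatGPT reply, speak it in the cloned voice, then clean up the audio.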
def greet(apikey, upload, audio):
    openai.api_key = apikey

    # Load the recording and pad/trim it to the 30 seconds Whisper expects.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Make a log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language.
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # Decode the audio to text.
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    res.append(result.text)

    # Send the latest transcription to ChatGPT. `messages` aliases the
    # module-level `mes`, so the dialogue history grows across calls.
    messages = mes
    content = res[-1]
    messages.append({"role": "user", "content": content})
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    chat_response = completion.choices[0].message.content
    messages.append({"role": "assistant", "content": chat_response})

    # Clone the uploaded voice and speak the reply. A fresh Mandarin instance
    # is built per call; the "加载成功" ("loaded successfully") call warms up
    # the synthesizer and its output is discarded.
    voice = Mandarin()
    voice.setVoice(upload)
    voice.say("加载成功")
    wav, sample_rate = voice.say(chat_response)

    # Write the raw synthesis to a randomly named wav file.
    output_file = "".join(random.sample(string.ascii_lowercase + string.digits, 11)) + ".wav"
    write(output_file, sample_rate, wav.astype(np.float32))

    # Restore the synthesized speech with VoiceFixer, then denoise it with
    # MetricGAN+ and save the 16 kHz result.
    voicefixer.restore(
        input=output_file,    # input wav file path
        output="audio1.wav",  # output wav file path
        cuda=True,            # whether to use GPU acceleration
        mode=0,               # modes 0, 1, and 2 can be tried for the best result
    )
    noisy = enhance_model.load_audio("audio1.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

    return [result.text, chat_response, "enhanced.wav"]
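
# Two variants of the same pipeline, exposed as Gradio tabs: c1 takes the
# reference voice as a wav upload, c2 records it from the microphone.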
c1 = gr.Interface(
    fn=greet,
    inputs=[
        # "Please enter your OpenAI API key"
        gr.Textbox(lines=1, label="请填写您的OpenAI-API-key", type="password"),
        # "Please upload a voice you like (wav file)"
        gr.Audio(source="upload", label="请上传您喜欢的声音(wav文件)", type="filepath"),
        # "Chat with your personal AI!"
        gr.Audio(source="microphone", label="和您的专属AI聊天吧!", type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        gr.Audio(label="Audio with Custom Voice"),
    ],
    #theme="huggingface",
    #title="🥳💬💕 - TalktoAI,随时随地,谈天说地!"
    # "TalktoAI - chat anytime, anywhere! Humane AI for the benefit of everyone."
    description="🥳💬💕 - TalktoAI,随时随地,谈天说地! \n\n🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
)
c2 = gr.Interface(
    fn=greet,
    inputs=[
        # "Please enter your OpenAI API key"
        gr.Textbox(lines=1, label="请填写您的OpenAI-API-key", type="password"),
        # "Please record a voice you like, avoiding background noise"
        gr.Audio(source="microphone", label="请上传您喜欢的声音,并尽量避免噪音", type="filepath"),
        # "Chat with your personal AI!"
        gr.Audio(source="microphone", label="和您的专属AI聊天吧!", type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        gr.Audio(label="Audio with Custom Voice"),
    ],
    #theme="huggingface",
    #title="🥳💬💕 - TalktoAI,随时随地,谈天说地!"
    # "TalktoAI - chat anytime, anywhere! Humane AI for the benefit of everyone."
    description="🥳💬💕 - TalktoAI,随时随地,谈天说地! \n\n🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
)
# Tab names: "wav file upload" / "microphone upload".
demo = gr.TabbedInterface([c1, c2], ["wav文件上传", "麦克风上传"])
demo.launch()