# Spaces: Runtime error
import torch | |
import librosa | |
import commons | |
import utils | |
from models import SynthesizerTrn | |
from text import text_to_sequence | |
import numpy as np | |
from mel_processing import spectrogram_torch | |
import gradio as gr | |
from text.cleaners import shanghainese_cleaners | |
from transformers import AutoModel, AutoTokenizer | |
from TTS.api import TTS | |
# ---------------------------------------------------------------------------
# Model loading. Everything below runs once at import time and binds the
# module-level objects that the request handlers further down rely on.
# ---------------------------------------------------------------------------

# Mandarin Coqui TTS model; `chinese` calls its voice-conversion API.
tts = TTS(model_name="tts_models/zh-CN/baker/tacotron2-DDC-GST")

# Multilingual YourTTS model; `english` calls it with language="en".
tts1 = TTS(
    model_name="tts_models/multilingual/multi-dataset/your_tts",
    progress_bar=False,
    gpu=True,
)

import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement

# SpeechBrain MetricGAN+ enhancer, placed on CUDA; both `chinese` and
# `english` pass their synthesized audio through it.
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
    run_opts={"device": "cuda"},
)

from denoiser import pretrained
from denoiser.dsp import convert_audio

# Pretrained dns64 denoiser on CUDA; used by `english` before enhancement.
model1 = pretrained.dns64().cuda()

# ChatGLM-6B tokenizer and model (half precision, on CUDA, eval mode);
# `predict` drives the chat through these two objects.
_CHATGLM_REPO = "THUDM/chatglm-6b"
tokenizer = AutoTokenizer.from_pretrained(_CHATGLM_REPO, trust_remote_code=True)
model = (
    AutoModel.from_pretrained(_CHATGLM_REPO, trust_remote_code=True)
    .half()
    .cuda()
    .eval()
)
def predict(input, history=None):
    """Send one user message to the chat model and return the new state.

    Args:
        input: The user's message. (The name shadows the builtin but is
            part of the existing signature, so it is left as is.)
        history: Conversation history handed to ``model.chat``. ``None``
            starts from an empty list.

    Returns:
        A 3-tuple ``(history, history, response)``: the updated history
        twice (the UI wiring feeds one copy to the chatbot display and one
        to the session state) followed by the latest reply.
    """
    previous_turns = [] if history is None else history
    response, updated_history = model.chat(tokenizer, input, previous_turns)
    return updated_history, updated_history, response
def chinese(text_cn, upload1, VoiceMicrophone1):
    """Speak ``text_cn`` in the voice of a reference recording (Chinese path).

    The text is synthesized and voice-converted by ``tts`` into
    ``output0.wav``, then passed through ``enhance_model`` and written to
    ``enhanced.wav``.

    Args:
        text_cn: Text to speak. Runs of whitespace are collapsed to single
            spaces and a Chinese full stop ("。") is appended.
        upload1: Path of an uploaded reference audio file, or ``None``.
        VoiceMicrophone1: Path of a microphone recording, or ``None``.
            Only used when ``upload1`` is ``None``.

    Returns:
        The string ``"enhanced.wav"``, the path of the enhanced audio file.

    Raises:
        ValueError: If neither ``upload1`` nor ``VoiceMicrophone1`` is given.
    """
    # An uploaded file takes precedence over a microphone recording.
    speaker_wav = upload1 if upload1 is not None else VoiceMicrophone1
    if speaker_wav is None:
        # Fail early with a readable message instead of passing None into
        # the TTS / voice-conversion call.
        raise ValueError(
            "请先上传或录制一段参考声音 "
            "(please upload or record a reference voice first)"
        )

    tts.tts_with_vc_to_file(
        " ".join(text_cn.split()) + "。",
        speaker_wav=speaker_wav,
        file_path="output0.wav",
    )

    # Add a batch dimension before enhancement. lengths=[1.] is presumably
    # SpeechBrain's relative length for the single batch item — confirm.
    noisy = enhance_model.load_audio("output0.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    # The output is written with a hard-coded 16000 Hz sample rate.
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)
    return "enhanced.wav"
def english(text_en, upload, VoiceMicrophone):
    """Speak ``text_en`` in the voice of a reference recording (English path).

    Pipeline: ``tts1`` synthesizes ``output.wav`` with ``language="en"``,
    ``model1`` denoises it into ``denoise.wav``, and ``enhance_model``
    produces the final ``enhanced.wav``.

    Args:
        text_en: Text to speak; leading and trailing whitespace is stripped.
        upload: Path of an uploaded reference audio file, or ``None``.
        VoiceMicrophone: Path of a microphone recording, or ``None``.
            Only used when ``upload`` is ``None``.

    Returns:
        The string ``"enhanced.wav"``, the path of the enhanced audio file.

    Raises:
        ValueError: If neither ``upload`` nor ``VoiceMicrophone`` is given.
    """
    # An uploaded file takes precedence over a microphone recording.
    speaker_wav = upload if upload is not None else VoiceMicrophone
    if speaker_wav is None:
        # Fail early with a readable message instead of passing None into
        # the TTS call.
        raise ValueError(
            "请先上传或录制一段参考声音 "
            "(please upload or record a reference voice first)"
        )

    tts1.tts_to_file(
        text_en.strip(),
        speaker_wav=speaker_wav,
        language="en",
        file_path="output.wav",
    )

    # Denoise: convert the synthesized audio to the denoiser's sample rate
    # and input channel count, then run it as a batch of one.
    wav, sr = torchaudio.load("output.wav")
    wav = convert_audio(wav.cuda(), sr, model1.sample_rate, model1.chin)
    with torch.no_grad():
        denoised = model1(wav[None])[0]
    torchaudio.save("denoise.wav", denoised.data.cpu(), model1.sample_rate)

    # Enhance the denoised file. lengths=[1.] is presumably SpeechBrain's
    # relative length for the single batch item — confirm.
    noisy = enhance_model.load_audio("denoise.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    # The output is written with a hard-coded 16000 Hz sample rate.
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)
    return "enhanced.wav"
def clean_text(text, ipa_input):
    """Return ``text`` passed through ``shanghainese_cleaners`` when requested.

    Args:
        text: The text to (optionally) clean.
        ipa_input: When truthy the cleaner is applied; otherwise ``text``
            is returned unchanged.

    Returns:
        The cleaned text, or the original ``text``.
    """
    return shanghainese_cleaners(text) if ipa_input else text
def get_text(text, hps, cleaned=False):
    """Convert ``text`` into a ``torch.LongTensor`` of symbol ids.

    Args:
        text: Input text.
        hps: Hyper-parameter object; ``hps.symbols``,
            ``hps.data.text_cleaners`` and ``hps.data.add_blank`` are read.
        cleaned: When true the text is treated as already cleaned and an
            empty cleaner list is passed to ``text_to_sequence``.

    Returns:
        A ``torch.LongTensor`` of the symbol-id sequence, with ``0``
        interspersed via ``commons.intersperse`` when
        ``hps.data.add_blank`` is set.
    """
    cleaner_names = [] if cleaned else hps.data.text_cleaners
    sequence = text_to_sequence(text, hps.symbols, cleaner_names)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
def speech_synthesize(text, cleaned, length_scale):
    """Synthesize ``text`` with the module-level ``net_g_ms`` model.

    Args:
        text: Text to speak; newline characters are removed first.
        cleaned: Forwarded to ``get_text`` as its ``cleaned`` flag.
        length_scale: Forwarded to ``net_g_ms.infer`` as ``length_scale``.

    Returns:
        A ``(sampling_rate, audio)`` tuple, where ``sampling_rate`` comes
        from ``hps_ms.data.sampling_rate`` and ``audio`` is a float NumPy
        array.
    """
    text = text.replace('\n', '')
    print(text)
    token_ids = get_text(text, hps_ms, cleaned)
    with torch.no_grad():
        batch = token_ids.unsqueeze(0)
        batch_lengths = torch.LongTensor([token_ids.size(0)])
        # Speaker id is fixed to 0.
        speaker_id = torch.LongTensor([0])
        outputs = net_g_ms.infer(
            batch,
            batch_lengths,
            sid=speaker_id,
            noise_scale=0.667,
            noise_scale_w=0.8,
            length_scale=length_scale,
        )
        audio = outputs[0][0, 0].data.cpu().float().numpy()
    return (hps_ms.data.sampling_rate, audio)
# ---------------------------------------------------------------------------
# Synthesizer used by `speech_synthesize`: read the hyper-parameters, build
# the network from them, switch it to eval mode and load the checkpoint.
# ---------------------------------------------------------------------------
hps_ms = utils.get_hparams_from_file('model/config.json')

speakers = hps_ms.speakers
n_symbols = len(hps_ms.symbols)
n_speakers = hps_ms.data.n_speakers

net_g_ms = SynthesizerTrn(
    n_symbols,
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=n_speakers,
    **hps_ms.model,
)
net_g_ms.eval()
utils.load_checkpoint('model/model.pth', net_g_ms)
# Gradio UI: a ChatGLM chat box, two voice-cloning buttons (Chinese/English)
# and a Shanghainese synthesis button, all reading from the `res` textbox.
# NOTE(review): the source was flattened, so which statements sit inside each
# `with gr.Row()` is reconstructed here — confirm against the deployed layout.
with gr.Blocks() as demo:
    gr.Markdown(
        """ # <center>🥳💬💕 - TalktoAI,随时随地,谈天说地!</center>
        ### <center>🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!</center>
        """
    )
    # Per-session conversation history passed to and returned from `predict`.
    state = gr.State([])
    chatbot = gr.Chatbot([], elem_id="chatbot").style(height=300)
    # Receives the latest reply from `predict`; also the text source for all
    # three speech buttons below (aliased as `inp3` and `text_input`).
    res = gr.Textbox(lines=1, placeholder="最新的回答在这里(此内容可编辑,用作声音克隆的文本)", show_label = False).style(container=False)
    with gr.Row():
        txt = gr.Textbox(label = "说点什么吧(中英皆可)", lines=1)
        button = gr.Button("开始对话吧")
    # Pressing Enter in the textbox and clicking the button both run `predict`.
    txt.submit(predict, [txt, state], [chatbot, state, res])
    button.click(predict, [txt, state], [chatbot, state, res])
    with gr.Row().style(mobile_collapse=False, equal_height=True):
        inp3 = res
        # Reference voice: either an uploaded file or a microphone recording,
        # both delivered to the handlers as file paths.
        inp4 = gr.Audio(source="upload", label = "请上传您喜欢的声音(wav/mp3文件);长语音(~90s)、女声效果更好", type="filepath")
        inp5 = gr.Audio(source="microphone", type="filepath", label = '请用麦克风上传您喜欢的声音,与文件上传二选一即可')
        btn1 = gr.Button("用喜欢的声音听一听吧(中文)")
        btn2 = gr.Button("用喜欢的声音听一听吧(英文)")
    with gr.Row():
        out1 = gr.Audio(label="为您合成的专属声音(中文)")
        out2 = gr.Audio(label="为您合成的专属声音(英文)")
    btn1.click(chinese, [inp3, inp4, inp5], [out1])
    btn2.click(english, [inp3, inp4, inp5], [out2])
    text_input = res
    # Both controls are created with visible=False; they still feed
    # `clean_text` and `speech_synthesize` below.
    # NOTE(review): `default=` looks like the older spelling of `value=` in
    # Gradio — confirm the installed version still honors it.
    cleaned_text=gr.Checkbox(label='IPA Input',default=True, visible = False)
    length_scale=gr.Slider(0.5,2,1,step=0.1,label='Speaking Speed',interactive=True, visible = False)
    with gr.Row().style(mobile_collapse=False, equal_height=True):
        tts_button = gr.Button('彩蛋:上海话合成')
        audio_output = gr.Audio(label='听一听上海话吧')
    # Changing the checkbox rewrites the textbox content through `clean_text`.
    cleaned_text.change(clean_text,[text_input,cleaned_text],[text_input])
    tts_button.click(speech_synthesize,[text_input,cleaned_text,length_scale],[audio_output])
    gr.Markdown(
        """ ### <center>注意❗:请不要输入或生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及娱乐使用。用户输入或生成的内容与程序开发者无关,请自觉合法合规使用,违反者一切后果自负。</center>
        ### <center>Model by [ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b). Thanks to [THUDM](https://github.com/THUDM) and [CjangCjengh](https://github.com/CjangCjengh). Please follow me on [Bilibili](https://space.bilibili.com/501495851?spm_id_from=333.1007.0.0).</center>
        """
    )
    gr.HTML('''
    <div class="footer">
    <p>🎶🖼️🎡 - It’s the intersection of technology and liberal arts that makes our hearts sing. - Steve Jobs
    </p>
    <p>注:中文声音克隆实际上是通过声音转换(Voice Conversion)实现,所以输出结果可能更像是一种新的声音,效果不一定很理想,希望大家多多包涵,之后我们也会不断迭代该程序的!为了实现更好的效果,使用中文声音克隆时请尽量上传女声。
    </p>
    </div>
    ''')
# Enable request queuing and start the server; show_error=True is passed so
# handler errors are reported to the user.
demo.queue().launch(show_error=True)