import torch
import librosa
import numpy as np
import torchaudio
import gradio as gr
from transformers import AutoModel, AutoTokenizer
from TTS.api import TTS
from speechbrain.pretrained import SpectralMaskEnhancement
from denoiser import pretrained
from denoiser.dsp import convert_audio

import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence
from text.cleaners import shanghainese_cleaners
from mel_processing import spectrogram_torch

# Coqui TTS models: a Chinese Tacotron2 (with voice conversion) and a
# multilingual YourTTS for English voice cloning.
tts = TTS("tts_models/zh-CN/baker/tacotron2-DDC-GST")
tts1 = TTS(
    model_name="tts_models/multilingual/multi-dataset/your_tts",
    progress_bar=False,
    gpu=True,
)

# SpeechBrain MetricGAN+ model for speech enhancement.
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
    run_opts={"device": "cuda"},
)

# Facebook denoiser (DNS64) for removing background noise.
model1 = pretrained.dns64().cuda()

# ChatGLM-6B chat model, loaded in half precision on GPU.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
model = model.eval()


def predict(input, history=None):
    if history is None:
        history = []
    response, history = model.chat(tokenizer, input, history)
    return history, history, response


def chinese(text_cn, upload1, VoiceMicrophone1):
    # Clone the reference voice (uploaded file or microphone recording),
    # then enhance the synthesized audio with MetricGAN+.
    speaker_wav = upload1 if upload1 is not None else VoiceMicrophone1
    tts.tts_with_vc_to_file(
        " ".join(text_cn.split()) + "。",
        speaker_wav=speaker_wav,
        file_path="output0.wav",
    )
    noisy = enhance_model.load_audio("output0.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)
    return "enhanced.wav"


def english(text_en, upload, VoiceMicrophone):
    # Synthesize with YourTTS voice cloning, denoise with DNS64,
    # then enhance with MetricGAN+.
    speaker_wav = upload if upload is not None else VoiceMicrophone
    tts1.tts_to_file(
        text_en.strip(),
        speaker_wav=speaker_wav,
        language="en",
        file_path="output.wav",
    )
    wav, sr = torchaudio.load("output.wav")
    wav = convert_audio(wav.cuda(), sr, model1.sample_rate, model1.chin)
    with torch.no_grad():
        denoised = model1(wav[None])[0]
    torchaudio.save("denoise.wav", denoised.data.cpu(), model1.sample_rate)
    noisy = enhance_model.load_audio("denoise.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)
    return "enhanced.wav"


def clean_text(text, ipa_input):
    # Optionally run the Shanghainese cleaner to convert text to IPA.
    if ipa_input:
        return shanghainese_cleaners(text)
    return text


def get_text(text, hps, cleaned=False):
    # Convert text to a symbol-ID sequence, skipping the cleaners if the
    # input is already cleaned; optionally intersperse blank tokens.
    if cleaned:
        text_norm = text_to_sequence(text, hps.symbols, [])
    else:
        text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm


def speech_synthesize(text, cleaned, length_scale):
    text = text.replace('\n', '')
    print(text)
    stn_tst = get_text(text, hps_ms, cleaned)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([0])
        audio = net_g_ms.infer(
            x_tst, x_tst_lengths, sid=sid,
            noise_scale=0.667, noise_scale_w=0.8, length_scale=length_scale,
        )[0][0, 0].data.cpu().float().numpy()
    return (hps_ms.data.sampling_rate, audio)


# Load the multi-speaker VITS synthesizer and its checkpoint.
hps_ms = utils.get_hparams_from_file('model/config.json')
n_speakers = hps_ms.data.n_speakers
n_symbols = len(hps_ms.symbols)
speakers = hps_ms.speakers

net_g_ms = SynthesizerTrn(
    n_symbols,
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=n_speakers,
    **hps_ms.model)
_ = net_g_ms.eval()
utils.load_checkpoint('model/model.pth', net_g_ms)

with gr.Blocks() as demo:
    gr.Markdown(
        """
        #