#好用的 import os os.system('pip install -U tensorflow') os.system('pip install -q unidecode tensorboardX') os.system('pip install librosa==0.8.0') os.system('pip install pysoundfile==0.9.0.post1') os.system('pip install unidecode==1.3.4') os.system('pip install pyopenjtalk --no-build-isolation') os.system('pip install inflect==5.6.2') os.system('pip install janome==0.4.2') os.system('pip install tqdm -q') os.system('pip install gdown') os.system('pip install -q librosa unidecode') os.system('pip install ipython') os.system('pip install --upgrade jupyter ipywidgets') os.system('jupyter nbextension enable --py widgetsnbextension') os.system('pip uninstall tqdm') os.system('pip install tqdm') import time import pyopenjtalk import soundfile as sf import gradio as gr import torch import IPython.display as ipd import numpy as np import torch import json from hparams import create_hparams from model import Tacotron2 from layers import TacotronSTFT from audio_processing import griffin_lim from text import text_to_sequence from env import AttrDict from meldataset import MAX_WAV_VALUE from models import Generator #@,tlitle 配置并运行 #国际 HiFi-GAN 模型(有点机器音): 1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW #@markdown 你训练好的tacotron2模型的路径填在`Tacotron2_Model`这里 Tacotron2_Model = '/content/Yui_TrapGenesis'#@param {type:"string"} TACOTRON2_ID = Tacotron2_Model HIFIGAN_ID = "1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW" #@markdown 选择预处理文本的cleaner text_cleaner = 'japanese_phrase_cleaners'#@param {type:"string"} import pyopenjtalk import soundfile as sf import gradio as gr # 全局变量声明 model = None hparams = None hifigan = None thisdict = None pronounciation_dictionary = False show_graphs = False # 添加show_graphs变量,并赋予默认值 # 初始化函数 def initialize(): global model, hparams, hifigan, thisdict, pronounciation_dictionary # 检查是否已初始化 try: initialized except NameError: print("Setting up, please wait.\n") from tqdm.notebook import tqdm with tqdm(total=5, leave=False) as pbar: import os from os.path import exists, join, basename, splitext git_repo_url = 'https://github.com/CjangCjengh/tacotron2-japanese.git' project_name = splitext(basename(git_repo_url))[0] if not exists(project_name): # clone and install os.system('git clone -q --recursive {git_repo_url}') os.system('git clone -q --recursive https://github.com/SortAnon/hifi-gan') pbar.update(1) # downloaded TT2 and HiFi-GAN import sys sys.path.append('hifi-gan') sys.path.append(project_name) import time import matplotlib import matplotlib.pylab as plt import gdown d = 'https://drive.google.com/uc?id=' # %matplotlib inline import IPython.display as ipd import numpy as np import torch import json from hparams import create_hparams from model import Tacotron2 from layers import TacotronSTFT from audio_processing import griffin_lim from text import text_to_sequence from env import AttrDict from meldataset import MAX_WAV_VALUE from models import Generator pbar.update(1) # initialized Dependancies graph_width = 900 graph_height = 360 def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))): # %matplotlib inline fig, axes = plt.subplots(1, len(data), figsize=figsize) for i in range(len(data)): axes[i].imshow(data[i], aspect='auto', origin='upper', interpolation='none', cmap='inferno') fig.canvas.draw() plt.show() # Setup Pronounciation Dictionary os.system('wget https://github.com/wind4000/tacotron2/releases/download/v0.2/merged.dict.txt') thisdict = {} for line in reversed((open('merged.dict.txt', "r").read()).splitlines()): thisdict[(line.split(" ",1))[0]] = (line.split(" ",1))[1].strip() pbar.update(1) # Downloaded and Set up Pronounciation Dictionary def ARPA(text, punctuation=r"!?,.;", EOS_Token=True): out = '' for word_ in text.split(" "): word=word_; end_chars = '' while any(elem in word for elem in punctuation) and len(word) > 1: if word[-1] in punctuation: end_chars = word[-1] + end_chars; word = word[:-1] else: break try: word_arpa = thisdict[word.upper()] word = "{" + str(word_arpa) + "}" except KeyError: pass out = (out + " " + word + end_chars).strip() if EOS_Token and out[-1] != ";": out += ";" return out def get_hifigan(MODEL_ID): # Download HiFi-GAN hifigan_pretrained_model = 'hifimodel' gdown.download(d+MODEL_ID, hifigan_pretrained_model, quiet=False) if not exists(hifigan_pretrained_model): raise Exception("HiFI-GAN model failed to download!") # Load HiFi-GAN conf = os.path.join("hifi-gan", "config_v1.json") with open(conf) as f: json_config = json.loads(f.read()) h = AttrDict(json_config) torch.manual_seed(h.seed) hifigan = Generator(h).to(torch.device("cuda")) state_dict_g = torch.load(hifigan_pretrained_model, map_location=torch.device("cuda")) hifigan.load_state_dict(state_dict_g["generator"]) hifigan.eval() hifigan.remove_weight_norm() return hifigan, h hifigan, h = get_hifigan(HIFIGAN_ID) pbar.update(1) # Downloaded and Set up HiFi-GAN def has_MMI(STATE_DICT): return any(True for x in STATE_DICT.keys() if "mi." in x) def get_Tactron2(MODEL_ID): # Download Tacotron2 tacotron2_pretrained_model = TACOTRON2_ID if not exists(tacotron2_pretrained_model): raise Exception("Tacotron2 model failed to download!") # Load Tacotron2 and Config hparams = create_hparams() hparams.sampling_rate = 22050 hparams.max_decoder_steps = 2000 # Max Duration hparams.gate_threshold = 0.80 # Model must be 25% sure the clip is over before ending generation model = Tacotron2(hparams) state_dict = torch.load(tacotron2_pretrained_model)['state_dict'] if has_MMI(state_dict): raise Exception("ERROR: This notebook does not currently support MMI models.") model.load_state_dict(state_dict) _ = model.cuda().eval().half() return model, hparams model, hparams = get_Tactron2(TACOTRON2_ID) previous_tt2_id = TACOTRON2_ID pbar.update(1) # Downloaded and Set up Tacotron2 # 初始化 initialize() import soundfile as sf def end_to_end_infer(text, pronounciation_dictionary, show_graphs): audio = None # 定义一个变量用于存储音频数据 for i in [x for x in text.split("\n") if len(x)]: if not pronounciation_dictionary: if i[-1] != ";": i = i + ";" else: i = ARPA(i) with torch.no_grad(): sequence = np.array(text_to_sequence(i, [text_cleaner]))[None, :] sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long() mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence) if show_graphs: plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0], alignments.float().data.cpu().numpy()[0].T)) y_g_hat = hifigan(mel_outputs_postnet.float()) audio = y_g_hat.squeeze() audio = audio * MAX_WAV_VALUE output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav" sf.write(output_filename, audio.cpu().numpy().astype('int16'), hparams.sampling_rate) print(f"音频已保存为 {output_filename}") print("") ipd.display(ipd.Audio(audio.cpu().numpy().astype("int16"), rate=hparams.sampling_rate)) return audio # 返回音频数据 # 文本到语音转换函数 def text_to_speech(text, max_decoder_steps=2000, gate_threshold=0.5): global model, hparams, hifigan, thisdict, pronounciation_dictionary, show_graphs hparams.max_decoder_steps = max_decoder_steps hparams.gate_threshold = gate_threshold output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav" audio = end_to_end_infer(text, pronounciation_dictionary, show_graphs) if audio is not None: sf.write(output_filename, audio.cpu().numpy().astype('int16'), hparams.sampling_rate) return output_filename else: return None # Gradio界面 inputs = [ gr.inputs.Textbox(lines=3, label="输入文本"), gr.inputs.Slider(minimum=100, maximum=5000, default=2000, step=100, label="最大解码步数"), gr.inputs.Slider(minimum=0.0, maximum=1.0, default=0.5, step=0.05, label="门控阈值") ] outputs = gr.outputs.File(label="下载生成的音频") gr.Interface(fn=text_to_speech, inputs=inputs, outputs=outputs).launch(debug=True,share=True)