|
|
|
|
|
import os |
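
# --- Dependency installation ---
# Assumes a Colab-like environment with Python and torch already available.
# Note: tensorflow is only needed if the Tacotron2 fork's hparams.py relies on
# tf's HParams (true of older NVIDIA-derived forks); otherwise it can be skipped.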
|
os.system('pip install -U tensorflow') |
|
os.system('pip install -q unidecode tensorboardX') |
|
os.system('pip install librosa==0.8.0') |
|
os.system('pip install pysoundfile==0.9.0.post1') |
|
os.system('pip install unidecode==1.3.4') |
|
os.system('pip install pyopenjtalk --no-build-isolation') |
|
os.system('pip install inflect==5.6.2') |
|
os.system('pip install janome==0.4.2') |
|
os.system('pip install tqdm -q') |
|
os.system('pip install gdown') |
|
# gradio is required for the web UI at the bottom of this script.
os.system('pip install -q gradio')
|
|
|
os.system('pip install ipython') |
|
os.system('pip install --upgrade jupyter ipywidgets') |
|
os.system('jupyter nbextension enable --py widgetsnbextension') |
|
# -y skips the confirmation prompt, which would otherwise hang the script.
os.system('pip uninstall -y tqdm')
|
os.system('pip install tqdm') |
|
|
|
import time |
|
import pyopenjtalk |
|
import soundfile as sf |
|
import gradio as gr |
|
import torch |
|
import IPython.display as ipd |
|
import numpy as np |
|
|
import json |
|
# Imports from the cloned tacotron2-japanese and hifi-gan repositories are done
# inside initialize(), after the repos have been cloned and added to sys.path;
# importing them here would fail on a fresh machine.
|
|
|
|
|
|
|
|
|
|
|
# Tacotron2_Model names a local checkpoint file; HIFIGAN_ID is a Google Drive
# file id, downloaded via gdown in initialize().
Tacotron2_Model = 'Yui_TrapGenesis'

TACOTRON2_ID = Tacotron2_Model

HIFIGAN_ID = "1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW"

text_cleaner = 'japanese_phrase_cleaners'
|
|
|
|
|
|
# Module-level state shared between initialize() and the Gradio callback below.
# pronounciation_dictionary toggles ARPAbet substitution in end_to_end_infer();
# show_graphs controls whether mel/alignment plots are displayed.
model = None

hparams = None

hifigan = None

thisdict = None

pronounciation_dictionary = False

show_graphs = False
|
|
|
|
|
def initialize(): |
|
    # This setup code originally ran at notebook top level, so names that
    # end_to_end_infer() uses later must be bound globally here.
    global model, hparams, hifigan, thisdict, pronounciation_dictionary
    global initialized, plot_data, ARPA, text_to_sequence, MAX_WAV_VALUE
|
|
|
|
|
    # Run the heavy one-time setup only if `initialized` is not yet defined;
    # it is set at the end of the except block below.
    try:

        initialized

    except NameError:

        print("Setting up, please wait.\n")
|
|
|
from tqdm.notebook import tqdm |
|
with tqdm(total=5, leave=False) as pbar: |
|
|
from os.path import exists, join, basename, splitext |
|
git_repo_url = 'https://github.com/CjangCjengh/tacotron2-japanese.git' |
|
project_name = splitext(basename(git_repo_url))[0] |
|
            if not exists(project_name):

                # Clone the Tacotron2 fork and the HiFi-GAN vocoder.
                os.system(f'git clone -q --recursive {git_repo_url}')

                os.system('git clone -q --recursive https://github.com/SortAnon/hifi-gan')
|
|
|
pbar.update(1) |
|
import sys |
|
sys.path.append('hifi-gan') |
|
sys.path.append(project_name) |
|
|
import matplotlib |
|
import matplotlib.pylab as plt |
|
import gdown |
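
            # gdown fetches files from Google Drive; d is the direct-download URL prefix.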
|
d = 'https://drive.google.com/uc?id=' |
|
|
|
|
|
|
from hparams import create_hparams |
|
from model import Tacotron2 |
|
from layers import TacotronSTFT |
|
from audio_processing import griffin_lim |
|
from text import text_to_sequence |
|
from env import AttrDict |
|
from meldataset import MAX_WAV_VALUE |
|
from models import Generator |
|
|
|
pbar.update(1) |
|
|
|
graph_width = 900 |
|
graph_height = 360 |
|
            def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))):

                # Show the post-net mel spectrogram and the attention alignment side by side.
                fig, axes = plt.subplots(1, len(data), figsize=figsize)

                for i in range(len(data)):

                    axes[i].imshow(data[i], aspect='auto', origin='upper',
                                   interpolation='none', cmap='inferno')

                fig.canvas.draw()

                plt.show()
|
|
|
|
|
            os.system('wget https://github.com/wind4000/tacotron2/releases/download/v0.2/merged.dict.txt')

            # Build the word -> ARPAbet lookup table. Iterating in reverse keeps the
            # first listed pronunciation when a word appears more than once.
            thisdict = {}

            with open('merged.dict.txt', 'r') as f:
                lines = f.read().splitlines()

            for line in reversed(lines):
                word, pron = line.split(" ", 1)
                thisdict[word] = pron.strip()
|
|
|
pbar.update(1) |
|
|
|
            def ARPA(text, punctuation=r"!?,.;", EOS_Token=True):

                # Replace each word that has a dictionary entry with its ARPAbet
                # pronunciation, preserving any trailing punctuation.
                out = ''
                for word_ in text.split(" "):
                    word = word_
                    end_chars = ''
                    # Peel trailing punctuation off so the dictionary lookup can match.
                    while any(elem in word for elem in punctuation) and len(word) > 1:
                        if word[-1] in punctuation:
                            end_chars = word[-1] + end_chars
                            word = word[:-1]
                        else:
                            break
                    try:
                        word_arpa = thisdict[word.upper()]
                        word = "{" + str(word_arpa) + "}"
                    except KeyError:
                        pass
                    out = (out + " " + word + end_chars).strip()
                if EOS_Token and out[-1] != ";":
                    out += ";"
                return out
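
            # Example (assuming dictionary entries "HELLO  HH AH0 L OW1" and "WORLD  W ER1 L D"):
            #   ARPA("Hello world.")  ->  "{HH AH0 L OW1} {W ER1 L D}.;"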
|
|
|
            def get_hifigan(MODEL_ID):

                # Download the pretrained HiFi-GAN vocoder from Google Drive.
                hifigan_pretrained_model = 'hifimodel'

                gdown.download(d+MODEL_ID, hifigan_pretrained_model, quiet=False)

                if not exists(hifigan_pretrained_model):

                    raise Exception("HiFi-GAN model failed to download!")

                # Load the generator with the stock v1 config and prepare it for
                # CPU inference (weight norm is only needed during training).
                conf = os.path.join("hifi-gan", "config_v1.json")

                with open(conf) as f:

                    json_config = json.loads(f.read())

                h = AttrDict(json_config)

                torch.manual_seed(h.seed)

                hifigan = Generator(h).to(torch.device("cpu"))

                state_dict_g = torch.load(hifigan_pretrained_model, map_location=torch.device("cpu"))

                hifigan.load_state_dict(state_dict_g["generator"])

                hifigan.eval()

                hifigan.remove_weight_norm()

                return hifigan, h
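
            # Fetch and load the vocoder once during setup.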
|
|
|
hifigan, h = get_hifigan(HIFIGAN_ID) |
|
pbar.update(1) |
|
|
|
            def has_MMI(STATE_DICT):

                # MMI checkpoints carry extra "mi."-prefixed keys in their state dict.
                return any("mi." in x for x in STATE_DICT.keys())
|
|
|
            def get_Tacotron2(MODEL_ID):

                # The Tacotron2 checkpoint is expected as a local file (Tacotron2_Model above).
                tacotron2_pretrained_model = TACOTRON2_ID

                if not exists(tacotron2_pretrained_model):

                    raise Exception("Tacotron2 model not found!")
|
|
|
hparams = create_hparams() |
|
hparams.sampling_rate = 22050 |
|
hparams.max_decoder_steps = 2000 |
|
hparams.gate_threshold = 0.80 |
|
model = Tacotron2(hparams) |
|
state_dict = torch.load(tacotron2_pretrained_model, map_location=torch.device('cpu'))['state_dict'] |
|
|
|
if has_MMI(state_dict): |
|
raise Exception("ERROR: This notebook does not currently support MMI models.") |
|
model.load_state_dict(state_dict) |
|
_ = model.cpu().eval().float() |
|
return model, hparams |
|
|
|
            model, hparams = get_Tacotron2(TACOTRON2_ID)
|
previous_tt2_id = TACOTRON2_ID |
|
|
|
            pbar.update(1)

        # Mark setup as complete so repeated initialize() calls are no-ops.
        initialized = True
|
|
|
|
|
initialize() |
|
|
|
|
|
|
def end_to_end_infer(text, pronounciation_dictionary, show_graphs):

    audio = None

    for i in [x for x in text.split("\n") if len(x)]:

        if not pronounciation_dictionary:

            # Ensure the line ends with ";" so the model sees an end-of-sentence token.
            if i[-1] != ";":

                i = i + ";"

        else:

            i = ARPA(i)

        with torch.no_grad():

            sequence = np.array(text_to_sequence(i, [text_cleaner]))[None, :]

            # torch.autograd.Variable is deprecated; a plain tensor behaves the same.
            sequence = torch.from_numpy(sequence).cpu().long()
|
|
|
mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence) |
|
if show_graphs: |
|
plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0], |
|
alignments.float().data.cpu().numpy()[0].T)) |
|
y_g_hat = hifigan(mel_outputs_postnet.float()) |
|
audio = y_g_hat.squeeze() |
|
audio = audio * MAX_WAV_VALUE |
|
output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav" |
|
sf.write(output_filename, audio.cpu().numpy().astype('int16'), hparams.sampling_rate) |
|
print(f"音频已保存为 {output_filename}") |
|
print("") |
|
ipd.display(ipd.Audio(audio.cpu().numpy().astype("int16"), rate=hparams.sampling_rate)) |
|
return audio |
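

# Example (after initialize() has run):
#   audio = end_to_end_infer("こんにちは。", False, False)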
|
|
|
|
|
def text_to_speech(text, max_decoder_steps=2000, gate_threshold=0.5):

    global model, hparams, hifigan, thisdict, pronounciation_dictionary, show_graphs

    hparams.max_decoder_steps = max_decoder_steps

    hparams.gate_threshold = gate_threshold

    # The decoder caches these values when the model is constructed, so update it
    # directly as well (assumes the NVIDIA-style Decoder attributes of this fork).
    model.decoder.max_decoder_steps = max_decoder_steps
    model.decoder.gate_threshold = gate_threshold
|
output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav" |
|
audio = end_to_end_infer(text, pronounciation_dictionary, show_graphs) |
|
if audio is not None: |
|
sf.write(output_filename, audio.cpu().numpy().astype('int16'), hparams.sampling_rate) |
|
return output_filename |
|
else: |
|
return None |
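

# Build the Gradio UI: a textbox plus two sliders that map onto
# text_to_speech's parameters.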
|
|
|
|
|
inputs = [

    gr.Textbox(lines=3, label="Input text"),

    gr.Slider(minimum=100, maximum=5000, value=2000, step=100, label="Max decoder steps"),

    gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.05, label="Gate threshold")

]

# The gr.inputs/gr.outputs namespaces were removed in Gradio 3+; components are
# used directly and `default=` became `value=`.
outputs = gr.File(label="Download generated audio")
|
|
|
gr.Interface(fn=text_to_speech, inputs=inputs, outputs=outputs).launch(debug=True) |
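
# launch(debug=True) blocks and surfaces errors inline; pass share=True to
# launch() for a public link when running in Colab.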