|
|
|
|
|
import os |
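
# --- Dependency installation ---
# Assumes a Colab-like environment with Python and torch already available.
# Note: tensorflow is only needed if the Tacotron2 fork's hparams.py relies on
# tf's HParams (true of older NVIDIA-derived forks); otherwise it can be skipped.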
|
os.system('pip install -U tensorflow') |
|
os.system('pip install -q unidecode tensorboardX') |
|
os.system('pip install librosa==0.8.0') |
|
os.system('pip install pysoundfile==0.9.0.post1') |
|
os.system('pip install unidecode==1.3.4') |
|
os.system('pip install pyopenjtalk --no-build-isolation') |
|
os.system('pip install inflect==5.6.2') |
|
os.system('pip install janome==0.4.2') |
|
os.system('pip install tqdm -q') |
|
os.system('pip install gdown') |
|
# gradio is required for the web UI at the bottom of this script.
os.system('pip install -q gradio')
|
|
|
os.system('pip install ipython') |
|
os.system('pip install --upgrade jupyter ipywidgets') |
|
os.system('jupyter nbextension enable --py widgetsnbextension') |
|
# -y skips the confirmation prompt, which would otherwise hang the script.
os.system('pip uninstall -y tqdm')
|
os.system('pip install tqdm') |
|
|
|
import time |
|
import pyopenjtalk |
|
import soundfile as sf |
|
import gradio as gr |
|
import torch |
|
import IPython.display as ipd |
|
import numpy as np |
|
|
import json |
|
# Imports from the cloned tacotron2-japanese and hifi-gan repositories are done
# inside initialize(), after the repos have been cloned and added to sys.path;
# importing them here would fail on a fresh machine.
|
|
|
|
|
|
|
|
|
|
|
# Tacotron2_Model names a local checkpoint file; HIFIGAN_ID is a Google Drive
# file id, downloaded via gdown in initialize().
Tacotron2_Model = 'Yui_TrapGenesis'

TACOTRON2_ID = Tacotron2_Model

HIFIGAN_ID = "1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW"

text_cleaner = 'japanese_phrase_cleaners'
|
|
|
|
|
|
# Module-level state shared between initialize() and the Gradio callback below.
# pronounciation_dictionary toggles ARPAbet substitution in end_to_end_infer();
# show_graphs controls whether mel/alignment plots are displayed.
model = None

hparams = None

hifigan = None

thisdict = None

pronounciation_dictionary = False

show_graphs = False
|
|
|
|
|
def initialize(): |
|
    # This setup code originally ran at notebook top level, so names that
    # end_to_end_infer() uses later must be bound globally here.
    global model, hparams, hifigan, thisdict, pronounciation_dictionary
    global initialized, plot_data, ARPA, text_to_sequence, MAX_WAV_VALUE
|
|
|
|
|
    # Run the heavy one-time setup only if `initialized` is not yet defined;
    # it is set at the end of the except block below.
    try:

        initialized

    except NameError:

        print("Setting up, please wait.\n")
|
|
|
from tqdm.notebook import tqdm |
|
with tqdm(total=5, leave=False) as pbar: |
|
|
from os.path import exists, join, basename, splitext |
|
git_repo_url = 'https://github.com/CjangCjengh/tacotron2-japanese.git' |
|
project_name = splitext(basename(git_repo_url))[0] |
|
            if not exists(project_name):

                # Clone the Tacotron2 fork and the HiFi-GAN vocoder.
                os.system(f'git clone -q --recursive {git_repo_url}')

                os.system('git clone -q --recursive https://github.com/SortAnon/hifi-gan')
|
|
|
pbar.update(1) |
|
import sys |
|
sys.path.append('hifi-gan') |
|
sys.path.append(project_name) |
|
|
import matplotlib |
|
import matplotlib.pylab as plt |
|
import gdown |
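
            # gdown fetches files from Google Drive; d is the direct-download URL prefix.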
|
d = 'https://drive.google.com/uc?id=' |
|
|
|
|
|
|
from hparams import create_hparams |
|
from model import Tacotron2 |
|
from layers import TacotronSTFT |
|
from audio_processing import griffin_lim |
|
from text import text_to_sequence |
|
from env import AttrDict |
|
from meldataset import MAX_WAV_VALUE |
|
from models import Generator |
|
|
|
pbar.update(1) |
|
|
|
graph_width = 900 |
|
graph_height = 360 |
|
            def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))):

                # Show the post-net mel spectrogram and the attention alignment side by side.
                fig, axes = plt.subplots(1, len(data), figsize=figsize)

                for i in range(len(data)):

                    axes[i].imshow(data[i], aspect='auto', origin='upper',
                                   interpolation='none', cmap='inferno')

                fig.canvas.draw()

                plt.show()
|
|
|
|
|
            os.system('wget https://github.com/wind4000/tacotron2/releases/download/v0.2/merged.dict.txt')

            # Build the word -> ARPAbet lookup table. Iterating in reverse keeps the
            # first listed pronunciation when a word appears more than once.
            thisdict = {}

            with open('merged.dict.txt', 'r') as f:
                lines = f.read().splitlines()

            for line in reversed(lines):
                word, pron = line.split(" ", 1)
                thisdict[word] = pron.strip()
|
|
|
pbar.update(1) |
|
|
|
            def ARPA(text, punctuation=r"!?,.;", EOS_Token=True):

                # Replace each word that has a dictionary entry with its ARPAbet
                # pronunciation, preserving any trailing punctuation.
                out = ''
                for word_ in text.split(" "):
                    word = word_
                    end_chars = ''
                    # Peel trailing punctuation off so the dictionary lookup can match.
                    while any(elem in word for elem in punctuation) and len(word) > 1:
                        if word[-1] in punctuation:
                            end_chars = word[-1] + end_chars
                            word = word[:-1]
                        else:
                            break
                    try:
                        word_arpa = thisdict[word.upper()]
                        word = "{" + str(word_arpa) + "}"
                    except KeyError:
                        pass
                    out = (out + " " + word + end_chars).strip()
                if EOS_Token and out[-1] != ";":
                    out += ";"
                return out
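
            # Example (assuming dictionary entries "HELLO  HH AH0 L OW1" and "WORLD  W ER1 L D"):
            #   ARPA("Hello world.")  ->  "{HH AH0 L OW1} {W ER1 L D}.;"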
|
|
|
            def get_hifigan(MODEL_ID):

                # Download the pretrained HiFi-GAN vocoder from Google Drive.
                hifigan_pretrained_model = 'hifimodel'

                gdown.download(d+MODEL_ID, hifigan_pretrained_model, quiet=False)

                if not exists(hifigan_pretrained_model):

                    raise Exception("HiFi-GAN model failed to download!")

                # Load the generator with the stock v1 config and prepare it for
                # CPU inference (weight norm is only needed during training).
                conf = os.path.join("hifi-gan", "config_v1.json")

                with open(conf) as f:

                    json_config = json.loads(f.read())

                h = AttrDict(json_config)

                torch.manual_seed(h.seed)

                hifigan = Generator(h).to(torch.device("cpu"))

                state_dict_g = torch.load(hifigan_pretrained_model, map_location=torch.device("cpu"))

                hifigan.load_state_dict(state_dict_g["generator"])

                hifigan.eval()

                hifigan.remove_weight_norm()

                return hifigan, h
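
            # Fetch and load the vocoder once during setup.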
|
|
|
hifigan, h = get_hifigan(HIFIGAN_ID) |
|
pbar.update(1) |
|
|
|
            def has_MMI(STATE_DICT):

                # MMI checkpoints carry extra "mi."-prefixed keys in their state dict.
                return any("mi." in x for x in STATE_DICT.keys())
|
|
|
            def get_Tacotron2(MODEL_ID):

                # The Tacotron2 checkpoint is expected as a local file (Tacotron2_Model above).
                tacotron2_pretrained_model = TACOTRON2_ID

                if not exists(tacotron2_pretrained_model):

                    raise Exception("Tacotron2 model not found!")
|
|
|
hparams = create_hparams() |
|
hparams.sampling_rate = 22050 |
|
hparams.max_decoder_steps = 2000 |
|
hparams.gate_threshold = 0.80 |
|
model = Tacotron2(hparams) |
|
state_dict = torch.load(tacotron2_pretrained_model, map_location=torch.device('cpu'))['state_dict'] |
|
|
|
if has_MMI(state_dict): |
|
raise Exception("ERROR: This notebook does not currently support MMI models.") |
|
model.load_state_dict(state_dict) |
|
_ = model.cpu().eval().float() |
|
return model, hparams |
|
|
|
            model, hparams = get_Tacotron2(TACOTRON2_ID)
|
previous_tt2_id = TACOTRON2_ID |
|
|
|
            pbar.update(1)

        # Mark setup as complete so repeated initialize() calls are no-ops.
        initialized = True
|
|
|
|
|
initialize() |
|
|
|
|
|
|
def end_to_end_infer(text, pronounciation_dictionary, show_graphs):

    audio = None

    for i in [x for x in text.split("\n") if len(x)]:

        if not pronounciation_dictionary:

            # Ensure the line ends with ";" so the model sees an end-of-sentence token.
            if i[-1] != ";":

                i = i + ";"

        else:

            i = ARPA(i)

        with torch.no_grad():

            sequence = np.array(text_to_sequence(i, [text_cleaner]))[None, :]

            # torch.autograd.Variable is deprecated; a plain tensor behaves the same.
            sequence = torch.from_numpy(sequence).cpu().long()
|
|
|
mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence) |
|
if show_graphs: |
|
plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0], |
|
alignments.float().data.cpu().numpy()[0].T)) |
|
y_g_hat = hifigan(mel_outputs_postnet.float()) |
|
audio = y_g_hat.squeeze() |
|
audio = audio * MAX_WAV_VALUE |
|
output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav" |
|
sf.write(output_filename, audio.cpu().numpy().astype('int16'), hparams.sampling_rate) |
|
print(f"音频已保存为 {output_filename}") |
|
print("") |
|
ipd.display(ipd.Audio(audio.cpu().numpy().astype("int16"), rate=hparams.sampling_rate)) |
|
return audio |
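

# Example (after initialize() has run):
#   audio = end_to_end_infer("こんにちは。", False, False)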
|
|
|
|
|
def text_to_speech(text, max_decoder_steps=2000, gate_threshold=0.5):

    global model, hparams, hifigan, thisdict, pronounciation_dictionary, show_graphs

    hparams.max_decoder_steps = max_decoder_steps

    hparams.gate_threshold = gate_threshold

    # The decoder caches these values when the model is constructed, so update it
    # directly as well (assumes the NVIDIA-style Decoder attributes of this fork).
    model.decoder.max_decoder_steps = max_decoder_steps
    model.decoder.gate_threshold = gate_threshold
|
output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav" |
|
audio = end_to_end_infer(text, pronounciation_dictionary, show_graphs) |
|
if audio is not None: |
|
sf.write(output_filename, audio.cpu().numpy().astype('int16'), hparams.sampling_rate) |
|
return output_filename |
|
else: |
|
return None |
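

# Build the Gradio UI: a textbox plus two sliders that map onto
# text_to_speech's parameters.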
|
|
|
|
|
inputs = [

    gr.Textbox(lines=3, label="Input text"),

    gr.Slider(minimum=100, maximum=5000, value=2000, step=100, label="Max decoder steps"),

    gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.05, label="Gate threshold")

]

# The gr.inputs/gr.outputs namespaces were removed in Gradio 3+; components are
# used directly and `default=` became `value=`.
outputs = gr.File(label="Download generated audio")
|
|
|
gr.Interface(fn=text_to_speech, inputs=inputs, outputs=outputs).launch(debug=True) |
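
# launch(debug=True) blocks and surfaces errors inline; pass share=True to
# launch() for a public link when running in Colab.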