Upload Yue_gradio_cpu.py
Yue_gradio_cpu.py +245 -0
Yue_gradio_cpu.py
ADDED
@@ -0,0 +1,245 @@
# This version works well

import os
os.system('pip install -U tensorflow')
os.system('pip install -q unidecode tensorboardX')
os.system('pip install librosa==0.8.0')
os.system('pip install pysoundfile==0.9.0.post1')
os.system('pip install unidecode==1.3.4')
os.system('pip install pyopenjtalk --no-build-isolation')
os.system('pip install inflect==5.6.2')
os.system('pip install janome==0.4.2')
os.system('pip install tqdm -q')
os.system('pip install gdown')
os.system('pip install -q librosa unidecode')

os.system('pip install ipython')
os.system('pip install --upgrade jupyter ipywidgets')
os.system('jupyter nbextension enable --py widgetsnbextension')
os.system('pip uninstall -y tqdm')  # -y: without it pip blocks on a confirmation prompt
os.system('pip install tqdm')

import time
import pyopenjtalk
import soundfile as sf
import gradio as gr
import torch
import IPython.display as ipd
import numpy as np
import json
from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT
from audio_processing import griffin_lim
from text import text_to_sequence
from env import AttrDict
from meldataset import MAX_WAV_VALUE
from models import Generator

#@title Configure and run

# Universal HiFi-GAN model (slightly robotic): 1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW
#@markdown Put the path of your trained Tacotron2 model in `Tacotron2_Model`
Tacotron2_Model = 'Yui_TrapGenesis'  #@param {type:"string"}
TACOTRON2_ID = Tacotron2_Model
HIFIGAN_ID = "1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW"
#@markdown Choose the cleaner used to preprocess the text
text_cleaner = 'japanese_phrase_cleaners'  #@param {type:"string"}
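
# Note: the "#@param" / "#@markdown" markers above are Colab form annotations;
# outside Colab they are ordinary comments with no effect.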

# Global variables
model = None
hparams = None
hifigan = None
thisdict = None
pronounciation_dictionary = False
show_graphs = False  # default: do not plot spectrograms/alignments

# One-time setup
def initialize():
    # ARPA and plot_data are defined inside this function but called at module
    # level later, so they are declared global along with the model state.
    global model, hparams, hifigan, thisdict, pronounciation_dictionary
    global initialized, ARPA, plot_data

    # Skip setup if it has already run in this process
    try:
        initialized
    except NameError:
        print("Setting up, please wait.\n")

        from tqdm.notebook import tqdm
        with tqdm(total=5, leave=False) as pbar:
            import os
            from os.path import exists, join, basename, splitext
            git_repo_url = 'https://github.com/CjangCjengh/tacotron2-japanese.git'
            project_name = splitext(basename(git_repo_url))[0]
            if not exists(project_name):
                # clone and install (this must be an f-string, otherwise the
                # literal text "{git_repo_url}" is passed to git)
                os.system(f'git clone -q --recursive {git_repo_url}')
                os.system('git clone -q --recursive https://github.com/SortAnon/hifi-gan')

            pbar.update(1)  # downloaded TT2 and HiFi-GAN
            import sys
            sys.path.append('hifi-gan')
            sys.path.append(project_name)
            import time
            import matplotlib
            import matplotlib.pylab as plt
            import gdown
            d = 'https://drive.google.com/uc?id='

            # %matplotlib inline
            import IPython.display as ipd
            import numpy as np
            import torch
            import json
            from hparams import create_hparams
            from model import Tacotron2
            from layers import TacotronSTFT
            from audio_processing import griffin_lim
            from text import text_to_sequence
            from env import AttrDict
            from meldataset import MAX_WAV_VALUE
            from models import Generator

            pbar.update(1)  # initialized dependencies

            graph_width = 900
            graph_height = 360
            def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))):
                # %matplotlib inline
                fig, axes = plt.subplots(1, len(data), figsize=figsize)
                for i in range(len(data)):
                    axes[i].imshow(data[i], aspect='auto', origin='upper',
                                   interpolation='none', cmap='inferno')
                fig.canvas.draw()
                plt.show()
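
            # At the call site below, the two panels plot_data draws are the
            # post-net mel spectrogram and the transposed attention alignment.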

            # Set up the pronunciation dictionary
            os.system('wget https://github.com/wind4000/tacotron2/releases/download/v0.2/merged.dict.txt')
            thisdict = {}
            for line in reversed(open('merged.dict.txt', 'r').read().splitlines()):
                word, pron = line.split(' ', 1)
                thisdict[word] = pron.strip()

            pbar.update(1)  # downloaded and set up the pronunciation dictionary
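
            # Assumed dictionary format: one "WORD PHONEMES" pair per line,
            # e.g. "HELLO HH AH0 L OW1"; the key is the text before the first
            # space and the value is the remainder.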

            def ARPA(text, punctuation=r"!?,.;", EOS_Token=True):
                out = ''
                for word_ in text.split(" "):
                    word = word_
                    end_chars = ''
                    # Peel trailing punctuation off the word before lookup
                    while any(elem in word for elem in punctuation) and len(word) > 1:
                        if word[-1] in punctuation:
                            end_chars = word[-1] + end_chars
                            word = word[:-1]
                        else:
                            break
                    try:
                        word_arpa = thisdict[word.upper()]
                        word = "{" + str(word_arpa) + "}"
                    except KeyError:
                        pass  # unknown words pass through unchanged
                    out = (out + " " + word + end_chars).strip()
                if EOS_Token and out[-1] != ";":
                    out += ";"
                return out
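
            # Example behaviour (hypothetical dictionary entry): if thisdict
            # maps "WORLD" to "W ER1 L D", then ARPA("hello world.") returns
            # 'hello {W ER1 L D}.;' (braces mark ARPAbet spans for the cleaner).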

            def get_hifigan(MODEL_ID):
                # Download HiFi-GAN
                hifigan_pretrained_model = 'hifimodel'
                gdown.download(d + MODEL_ID, hifigan_pretrained_model, quiet=False)
                if not exists(hifigan_pretrained_model):
                    raise Exception("HiFi-GAN model failed to download!")

                # Load HiFi-GAN
                conf = os.path.join("hifi-gan", "config_v1.json")
                with open(conf) as f:
                    json_config = json.loads(f.read())
                h = AttrDict(json_config)
                torch.manual_seed(h.seed)
                hifigan = Generator(h).to(torch.device("cpu"))
                state_dict_g = torch.load(hifigan_pretrained_model, map_location=torch.device("cpu"))
                hifigan.load_state_dict(state_dict_g["generator"])
                hifigan.eval()
                hifigan.remove_weight_norm()
                return hifigan, h

            hifigan, h = get_hifigan(HIFIGAN_ID)
            pbar.update(1)  # downloaded and set up HiFi-GAN
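
            # Shape sketch (assuming the usual config_v1.json values, 80 mel
            # bins and hop size 256): a mel tensor [1, 80, T] goes in and a
            # waveform tensor [1, 1, T * 256] comes out.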

            def has_MMI(STATE_DICT):
                return any("mi." in x for x in STATE_DICT.keys())

            def get_Tacotron2(MODEL_ID):
                # Locate the Tacotron2 checkpoint (a local path, not a download)
                tacotron2_pretrained_model = TACOTRON2_ID
                if not exists(tacotron2_pretrained_model):
                    raise Exception("Tacotron2 model not found!")
                # Load Tacotron2 and config
                hparams = create_hparams()
                hparams.sampling_rate = 22050
                hparams.max_decoder_steps = 2000  # max duration
                hparams.gate_threshold = 0.80  # model must be 80% sure the clip is over before ending generation
                model = Tacotron2(hparams)
                state_dict = torch.load(tacotron2_pretrained_model, map_location=torch.device('cpu'))['state_dict']

                if has_MMI(state_dict):
                    raise Exception("ERROR: This notebook does not currently support MMI models.")
                model.load_state_dict(state_dict)
                _ = model.cpu().eval().float()
                return model, hparams

            model, hparams = get_Tacotron2(TACOTRON2_ID)
            previous_tt2_id = TACOTRON2_ID

            pbar.update(1)  # downloaded and set up Tacotron2
        initialized = True  # mark setup as done so later calls are no-ops

# Initialize
initialize()

def end_to_end_infer(text, pronounciation_dictionary, show_graphs):
    audio = None  # holds the waveform of the most recent line
    for i in [x for x in text.split("\n") if len(x)]:
        if not pronounciation_dictionary:
            if i[-1] != ";":
                i = i + ";"
        else:
            i = ARPA(i)
        with torch.no_grad():
            sequence = np.array(text_to_sequence(i, [text_cleaner]))[None, :]
            # torch.autograd.Variable is deprecated; a plain LongTensor works
            sequence = torch.from_numpy(sequence).cpu().long()

            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
            if show_graphs:
                plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
                           alignments.float().data.cpu().numpy()[0].T))
            y_g_hat = hifigan(mel_outputs_postnet.float())
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav"
            sf.write(output_filename, audio.cpu().numpy().astype('int16'), hparams.sampling_rate)
            print(f"Audio saved as {output_filename}")
            print("")
            ipd.display(ipd.Audio(audio.cpu().numpy().astype("int16"), rate=hparams.sampling_rate))
    return audio  # return the audio data
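
# A minimal sketch of driving the pipeline without the UI (the string is a
# placeholder; anything the selected cleaner accepts works):
# wave = end_to_end_infer("こんにちは。", pronounciation_dictionary, show_graphs)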

# Text-to-speech entry point for the Gradio UI
def text_to_speech(text, max_decoder_steps=2000, gate_threshold=0.5):
    global model, hparams, hifigan, thisdict, pronounciation_dictionary, show_graphs

    hparams.max_decoder_steps = max_decoder_steps
    hparams.gate_threshold = gate_threshold
    output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav"
    audio = end_to_end_infer(text, pronounciation_dictionary, show_graphs)
    if audio is not None:
        # end_to_end_infer already wrote a timestamped file; this writes the
        # copy that is handed back to Gradio for download
        sf.write(output_filename, audio.cpu().numpy().astype('int16'), hparams.sampling_rate)
        return output_filename
    else:
        return None
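
# Example of a direct call (hypothetical input): text_to_speech("テスト", 1000, 0.5)
# returns the path of the written .wav file, or None if nothing was generated.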

# Gradio UI. The gr.inputs/gr.outputs namespaces are deprecated and removed in
# current Gradio; components are top-level, and Slider takes value= instead of default=.
inputs = [
    gr.Textbox(lines=3, label="Input text"),
    gr.Slider(minimum=100, maximum=5000, value=2000, step=100, label="Max decoder steps"),
    gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.05, label="Gate threshold")
]
outputs = gr.File(label="Download generated audio")

gr.Interface(fn=text_to_speech, inputs=inputs, outputs=outputs).launch(debug=True)
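
# If remote access is needed, launch(share=True) additionally creates a
# temporary public URL (a standard Gradio option).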