"""Gradio demo that synthesizes speech from text with a SynthesizerTrn model.

At import time the script loads hyper-parameters, scans the working directory
for generator checkpoints (``*.pth`` files not starting with ``D_``), loads
one of them, builds the web UI and launches it.
"""
import json
import math
import os

import gradio as gr
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence
from text.symbols import symbols

# Default checkpoint; list_model() may replace it with a file found on disk.
pth_path = "G_240000.pth"
hps = utils.get_hparams_from_file("./configs/hoshimi_base.json")
# Inference is pinned to the CPU. To use a GPU when available, switch to:
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")


def get_text(text, hps):
    """Convert raw text into a ``torch.LongTensor`` of symbol ids.

    Args:
        text: The input string to synthesize.
        hps: Hyper-parameter object; ``hps.data.text_cleaners`` and
            ``hps.data.add_blank`` are read.

    Returns:
        A 1-D ``torch.LongTensor`` built from the id sequence returned by
        ``text_to_sequence``.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Interleave the id 0 between symbols when the config asks for it.
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)


def load_model(pth_path):
    """Build a ``SynthesizerTrn`` in eval mode and load weights into it.

    Args:
        pth_path: Path of the checkpoint file passed to
            ``utils.load_checkpoint``.

    Returns:
        The model, moved to ``device`` and switched to evaluation mode.
    """
    model = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        **hps.model,
    ).to(device)
    model.eval()
    # Third argument is None -- presumably the optimizer slot, which is not
    # needed for inference (confirm against utils.load_checkpoint).
    utils.load_checkpoint(pth_path, model, None)
    return model


def list_model():
    """Return the checkpoint files found in the current working directory.

    Files whose name starts with ``D_`` are skipped; only names ending in
    ``.pth`` are kept.

    Side effect: the module-level ``pth_path`` is rebound to the last listed
    checkpoint whose file name is at least as long as the current value.
    NOTE(review): choosing by file-name length presumably aims at the
    checkpoint with the largest step count -- confirm this is intended.

    Returns:
        A list of checkpoint file names, in ``os.listdir`` order.
    """
    global pth_path
    checkpoints = []
    for name in os.listdir(os.getcwd()):
        if name.startswith("D_") or not name.endswith(".pth"):
            continue
        checkpoints.append(name)
        if len(name) >= len(pth_path):
            pth_path = name
    return checkpoints


def infer(text):
    """Synthesize audio for ``text`` with the currently loaded model.

    Args:
        text: The input string to synthesize.

    Returns:
        A ``(sampling_rate, audio)`` tuple where ``audio`` is a NumPy array,
        the format accepted by ``gr.Audio``.
    """
    token_ids = get_text(text, hps)
    with torch.no_grad():
        batch = token_ids.unsqueeze(0).to(device)
        batch_lengths = torch.LongTensor([token_ids.size(0)]).to(device)
        output = net_g.infer(
            batch,
            batch_lengths,
            noise_scale=.667,
            noise_scale_w=0.8,
            length_scale=1,
        )
        # .cpu() is a no-op on the CPU device and keeps .numpy() valid if
        # the device is ever switched to CUDA.
        audio = output[0][0, 0].data.float().cpu().numpy()
    return (hps.data.sampling_rate, audio)


models = list_model()
net_g = load_model(pth_path)


def change_model(model):
    """Load the checkpoint ``model`` and make it the active synthesizer.

    Args:
        model: Checkpoint file name selected in the dropdown.

    Returns:
        A status message naming the checkpoint that was loaded.
    """
    # Rebind ``net_g`` -- the global that infer() actually reads. The
    # previous code assigned to an unrelated ``net_g_ms`` global, so
    # switching models in the UI had no effect on synthesis.
    global pth_path
    global net_g
    # Load first so a failing load leaves the previous model/path in place.
    loaded = load_model(model)
    net_g = loaded
    pth_path = model
    return "载入模型:"+pth_path


app = gr.Blocks()
with app:
    # Explicit encoding so the header is decoded the same way on every
    # platform (assumes header.html is UTF-8).
    with open("header.html", "r", encoding="utf-8") as f:
        gr.HTML(f.read())
    with gr.Tabs():
        with gr.TabItem("Basic"):
            choice_model = gr.Dropdown(
                choices=models, label="模型", value=pth_path)
            tts_input1 = gr.TextArea(
                label="请输入文本(目前只支持汉字和单个英文字母,也可以使用逗号、句号、感叹号、空格等常用符号来改变语调和停顿)",
                value="这里是爱喝奶茶,穿得也像奶茶魅力点是普通话二乙的星弥吼西咪,晚上齁。")
            tts_submit = gr.Button("用文本合成", variant="primary")
            tts_output = gr.Audio(label="Output")
            tts_model = gr.Markdown("")
    tts_submit.click(infer, [tts_input1], [tts_output])
    choice_model.change(change_model, inputs=[
                        choice_model], outputs=[tts_model])
app.launch()