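"""Gradio demo for Bert-VITS2 Chinese text-to-speech (IGN China voice).

Loads a trained SynthesizerTrn checkpoint, exposes the main inference controls
(SDP ratio, noise scales, length scale) as sliders, and returns both the raw
audio and an Opus/OGG download. The checkpoint and config paths can be
overridden with --model_dir and --config_dir (defaults: ./logs/ign/ign.pth and
./configs/config.json).
"""
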
import os
import subprocess
import sys

if sys.platform == "darwin":
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import logging

logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

logging.basicConfig(level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s")

logger = logging.getLogger(__name__)

import torch
import argparse
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import cleaned_text_to_sequence, get_bert
from text.cleaner import clean_text
import soundfile as sf
import gradio as gr
import webbrowser


net_g = None


def get_text(text, language_str, hps):
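    # Normalize and phonemize the text, convert phonemes/tones to id sequences
    # (optionally interspersed with blank tokens), and build the BERT feature
    # matrix aligned to the phoneme sequence.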
    norm_text, phone, tone, word2ph = clean_text(text, language_str)
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)

    if hps.data.add_blank:
        phone = commons.intersperse(phone, 0)
        tone = commons.intersperse(tone, 0)
        language = commons.intersperse(language, 0)
        for i in range(len(word2ph)):
            word2ph[i] = word2ph[i] * 2
        word2ph[0] += 1
    bert = get_bert(norm_text, word2ph, language_str)
    del word2ph

    assert bert.shape[-1] == len(phone)

    phone = torch.LongTensor(phone)
    tone = torch.LongTensor(tone)
    language = torch.LongTensor(language)

    return bert, phone, tone, language
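

# infer() runs one utterance through the model: text -> phoneme/tone/BERT features
# (Chinese only, language id "ZH") -> SynthesizerTrn.infer -> waveform; the result
# is also written to tmp.wav so it can be converted for download.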
def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid):
    global net_g
    bert, phones, tones, lang_ids = get_text(text, "ZH", hps)
    with torch.no_grad():
        x_tst = phones.to(device).unsqueeze(0)
        tones = tones.to(device).unsqueeze(0)
        lang_ids = lang_ids.to(device).unsqueeze(0)
        bert = bert.to(device).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
        del phones
        speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
        audio = net_g.infer(x_tst, x_tst_lengths, speakers, tones, lang_ids, bert,
                            sdp_ratio=sdp_ratio, noise_scale=noise_scale,
                            noise_scale_w=noise_scale_w,
                            length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
        del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers
        # Dump the waveform at the configured sampling rate for the OGG conversion step.
        sf.write("tmp.wav", audio, hps.data.sampling_rate)
        return audio
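

# convert_wav_to_ogg() moves the temporary WAV into ./in and shells out to ffmpeg to
# transcode it to an Opus-encoded .ogg in ./out (ffmpeg must be on PATH).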
def convert_wav_to_ogg(wav_file):
    os.makedirs("out", exist_ok=True)
    os.makedirs("in", exist_ok=True)
    output_path_ogg = os.path.join("out", "out.ogg")
    renamed_input_path = os.path.join("in", "in.wav")
    os.rename(wav_file.name, renamed_input_path)
    command = ["ffmpeg", "-i", renamed_input_path, "-acodec", "libopus", "-y", output_path_ogg]
    subprocess.run(command, check=True)
    return output_path_ogg
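

# tts_fn() is the Gradio click callback: it synthesizes the audio, converts the dumped
# WAV to OGG, and returns (status message, (sampling_rate, waveform), OGG file path).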
def tts_fn(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
    with torch.no_grad():
        audio = infer(text, sdp_ratio=sdp_ratio, noise_scale=noise_scale,
                      noise_scale_w=noise_scale_w, length_scale=length_scale, sid=speaker)
    with open("tmp.wav", "rb") as wav_file:
        newogg = convert_wav_to_ogg(wav_file)
    return "Success", (hps.data.sampling_rate, audio), newogg


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", default="./logs/ign/ign.pth", help="path of your model")
    parser.add_argument("--config_dir", default="./configs/config.json", help="path of your config file")
    parser.add_argument("--share", default=False, help="make link public")
    parser.add_argument("-d", "--debug", action="store_true", help="enable DEBUG-LEVEL log")

    args = parser.parse_args()
    if args.debug:
        logger.info("Enable DEBUG-LEVEL log")
        logging.getLogger().setLevel(logging.DEBUG)
    hps = utils.get_hparams_from_file(args.config_dir)
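    # Prefer CUDA when available; the commented-out block below is the optional MPS
    # (Apple Silicon) device selection that the PYTORCH_ENABLE_MPS_FALLBACK setting at
    # the top of the file supports.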
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    '''
    device = (
        "cuda:0"
        if torch.cuda.is_available()
        else (
            "mps"
            if sys.platform == "darwin" and torch.backends.mps.is_available()
            else "cpu"
        )
    )
    '''
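    # Build the synthesizer from the config hyper-parameters and load checkpoint weights
    # only (skip_optimizer=True); the model stays in eval mode for inference.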
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)
    _ = net_g.eval()

    _ = utils.load_checkpoint(args.model_dir, net_g, None, skip_optimizer=True)

    speaker_ids = hps.data.spk2id
    speakers = list(speaker_ids.keys())
    with gr.Blocks() as app:
        with gr.Row():
            with gr.Column():
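                # Usage notes rendered below (in Chinese): model author 数字星瞳企划
                # (https://t.me/xingtong25680), original project
                # https://github.com/Stardust-minus/Bert-VITS2, derivative works must
                # credit AI generation and the original project, and any use violating
                # PRC law or any political use is prohibited.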


                gr.Markdown(value="""
                IGN 中国 Bert-Vits2在线语音生成\n
                1、模型作者:数字星瞳企划 https://t.me/xingtong25680 \n
                2、原项目地址:https://github.com/Stardust-minus/Bert-VITS2\n
                3、使用此模型进行二创请注明AI生成,以及原项目地址\n
                4、素材来自散文朗读比赛,严禁将此项目用于一切违反《中华人民共和国宪法》,《中华人民共和国刑法》,《中华人民共和国治安管理处罚法》和《中华人民共和国民法典》之用途。严禁用于任何政治相关用途。 \n
                """)
                text = gr.TextArea(label="Text", placeholder="Input Text Here",
                                      value="这里是数字星瞳企画,请在电报搜索星瞳全拼加二五六八零,获取最新更新进展。")
                speaker = gr.Dropdown(choices=speakers, value=speakers[0], label='Speaker')
                sdp_ratio = gr.Slider(minimum=0, maximum=1, value=0.2, step=0.01, label='语调变化')
                noise_scale = gr.Slider(minimum=0.1, maximum=1.5, value=0.6, step=0.01, label='感情变化')
                noise_scale_w = gr.Slider(minimum=0.1, maximum=1.4, value=0.8, step=0.01, label='音节发音长度变化')
                length_scale = gr.Slider(minimum=0.1, maximum=2, value=1, step=0.01, label='语速')
                btn = gr.Button("开启AI语音之旅吧!", variant="primary")
            with gr.Column():
                text_output = gr.Textbox(label="Message")
                audio_output = gr.Audio(label="Output Audio")
                ogg_output = gr.File(label="Converted OGG file")
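                # Related Spaces listed below (Chinese headings): 模型汇总 = model index,
                # 星瞳整合 = Xingtong all-in-one, 男声朗读 = male-voice reading,
                # (长文本) = long-text variant, IGN 中国 = IGN China.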
                gr.Markdown(value="""
                模型汇总:\n
                星瞳整合 https://huggingface.co/spaces/digitalxingtong/Xingtong-All-in-One\n
                男声朗读 https://huggingface.co/spaces/digitalxingtong/Kanghui-Read-Bert-VITS2 \n
                男声朗读(长文本) https://huggingface.co/spaces/digitalxingtong/Kanghui-Longread-Bert-VITS2\n
                IGN 中国 https://huggingface.co/spaces/digitalxingtong/Ign-Read-Bert-VITS2 \n
                IGN 中国(长文本)https://huggingface.co/spaces/digitalxingtong/Ign-Longread-Bert-VITS2 \n
                """)
        btn.click(tts_fn,
                  inputs=[text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale],
                  outputs=[text_output, audio_output, ogg_output])

    app.launch(show_error=True, share=args.share)