import argparse
import logging
import os
import re
import gradio as gr
import librosa
import numpy as np
import soundfile
import tempfile
import edge_tts
import utils

from inference.infer_tool import Svc

logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)

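# Common sample rate: inputs are resampled to this before inference and
# outputs are returned at it.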
sampling_rate = 44100

# Map the UI voice choices to edge-tts voice names.
tts_voice = {
    "Chinese Male": "zh-CN-YunxiNeural",
    "Chinese Female": "zh-CN-XiaoyiNeural",
    "English Male": "en-US-EricNeural",
    "English Female": "en-US-AnaNeural"
}

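# Load each speech (content) encoder once on CPU and share it across all
# models that use it, rather than keeping one copy per model.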
hubert_dict = {
    "vec768l12": utils.get_speech_encoder("vec768l12", device="cpu"),
    "vec256l9": utils.get_speech_encoder("vec256l9", device="cpu")
}


def create_fn(model, spk):
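    """Build a (voice-conversion, text-to-speech) callback pair bound to one model and speaker."""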
    def svc_fn(input_audio, vc_transform, auto_f0, f0p):
        if input_audio is None:
            return None  # nothing uploaded; leave the output empty
        sr, audio = input_audio
        # Gradio delivers integer PCM; normalize to float32 in [-1, 1].
        if np.issubdtype(audio.dtype, np.integer):
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        else:
            audio = audio.astype(np.float32)
        # Downmix stereo (samples, channels) to mono *before* resampling so
        # librosa.resample sees a 1-D signal along the time axis.
        if audio.ndim > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
        temp_path = "temp.wav"
        soundfile.write(temp_path, audio, sampling_rate, format="wav")

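        # Attach the shared content encoder matching this model's configured speech_encoder.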
        model.hubert_model = hubert_dict[model.speech_encoder]
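        # slice_inference splits the input on silence (slice_db threshold) and
        # converts it chunk by chunk; "noice_scale" is the keyword spelling
        # used by the upstream so-vits-svc API.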
        out_audio = model.slice_inference(raw_audio_path=temp_path,
                                          spk=spk,
                                          slice_db=-40,
                                          cluster_infer_ratio=0,
                                          noice_scale=0.4,
                                          clip_seconds=10,
                                          tran=vc_transform,
                                          f0_predictor=f0p,
                                          auto_predict_f0=auto_f0)
        model.clear_empty()
        os.remove(temp_path)
        return sampling_rate, out_audio

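    # Synthesize speech with edge-tts, then feed it through the same
    # voice-conversion path as an uploaded recording.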
    async def tts_fn(input_text, gender, tts_rate, vc_transform, auto_f0, f0p):
        if input_text == '':
            return None  # nothing to synthesize
        # Strip newlines, commas, parentheses and spaces from the input text.
        input_text = re.sub(r"[\n,() ]", "", input_text)
        voice = tts_voice[gender]
        # edge-tts expects a signed percentage string such as "+10%" or "-5%".
        rate_str = f"{tts_rate:+.0%}"
        communicate = edge_tts.Communicate(text=input_text, voice=voice, rate=rate_str)
        # edge-tts writes MP3 data, so use an .mp3 suffix; librosa can decode it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            temp_path = tmp_file.name
        await communicate.save(temp_path)

        audio, sr = librosa.load(temp_path)
        audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
        os.remove(temp_path)
        # Convert to int16 PCM and reuse the voice-conversion path, mimicking
        # the (sample_rate, ndarray) tuple a gr.Audio input would produce.
        input_audio = (sampling_rate, (audio * np.iinfo(np.int16).max).astype(np.int16))
        return svc_fn(input_audio, vc_transform, auto_f0, f0p)

    return svc_fn, tts_fn


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--api', action="store_true", default=False)
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    args = parser.parse_args()
    models = []
    for f in os.listdir("models"):
        name = f
        model = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config_{f}.json", device=args.device)
        # Prefer cover.png, fall back to cover.jpg; None if neither exists.
        cover = next((p for p in (f"models/{f}/cover.png", f"models/{f}/cover.jpg") if os.path.exists(p)), None)
        models.append((name, cover, create_fn(model, name)))
    with gr.Blocks() as app:
        gr.Markdown(
            "# <center> GTA Vice City Character Voice Generation\n"
            "## <center> Model author: [Cyber蝈蝈总](https://space.bilibili.com/37706580) on Bilibili\n"
            "#### <center> For San Andreas character AI voices, see [GTASA](https://huggingface.co/spaces/GroveStreet/GTA_SOVITS); please credit this resource in works created with it\n"
        )
        with gr.Tabs():
            for (name, cover, (svc_fn, tts_fn)) in models:
                with gr.TabItem(name):
                    with gr.Row():
                        with gr.Column():
                            with gr.Row():
                                vc_transform = gr.Number(label="Pitch shift (in semitones; +/-12 = one octave)", value=0)
                                f0_predictor = gr.Radio(label="f0 predictor (rmvpe recommended)",
                                                        choices=['crepe', 'harvest', 'rmvpe'], value='rmvpe')
                            auto_f0 = gr.Checkbox(label="Automatic pitch prediction (optional for TTS or speech; will make singing go off-key)",
                                                  value=False)
                            with gr.Tabs():
                                with gr.TabItem('Voice-to-Voice'):
                                    svc_input = gr.Audio(
                                        label="Upload dry vocals (audio of any length is supported; processing takes roughly 5x the audio's duration)")
                                    svc_submit = gr.Button("Generate", variant="primary")

                                with gr.TabItem('Text-to-Speech'):
                                    tts_input = gr.Textbox(label='Text to speak', value='',
                                                           placeholder='Text of any length is supported; processing takes roughly 5x the spoken duration')
                                    with gr.Row():
                                        gender = gr.Radio(label='Speaker voice (male voices are pitched lower, female higher)', value='Chinese Male',
                                                          choices=['Chinese Male', 'Chinese Female', 'English Male', 'English Female'])
                                        tts_rate = gr.Number(label='Speech rate (positive or negative, in percent)', value=0)
                                    tts_submit = gr.Button("Generate", variant="primary")

                        with gr.Column():
                            # Parenthesize the conditional so the closing </div>
                            # is always emitted regardless of whether a cover exists.
                            gr.Markdown(
                                '<div align="center">'
                                + (f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else '')
                                + '</div>'
                            )
                            vc_output = gr.Audio(label="Output audio")
                    svc_submit.click(svc_fn, [svc_input, vc_transform, auto_f0, f0_predictor], vc_output)
                    tts_submit.click(tts_fn, [tts_input, gender, tts_rate, vc_transform, auto_f0, f0_predictor],
                                     vc_output)
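        # A single-worker queue serializes jobs, which also keeps the fixed
        # "temp.wav" scratch file safe from concurrent writes.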
        app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)