File size: 4,877 Bytes
d59aeff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c59c783
d59aeff
e42c164
 
d59aeff
 
 
 
 
 
 
 
 
 
 
 
c59c783
f954003
 
 
c59c783
 
 
 
 
 
 
 
d59aeff
e42c164
d59aeff
 
498fbde
 
e42c164
 
 
 
 
 
cffda07
e99101c
 
3e2bb0e
e99101c
 
 
 
e42c164
72542b5
1f3bc5d
2e04df8
e99101c
e42c164
0d86152
e42c164
48971d8
 
58a5d8e
3e2bb0e
 
050bcde
3e2bb0e
 
 
 
 
e42c164
f656c09
f29a4ca
b913e3e
ca2d513
d59aeff
 
c59c783
43e698f
 
a0907be
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136

import gradio as gr

import re
import random
import string
import librosa
import numpy as np

from pathlib import Path
from scipy.io.wavfile import write

from encoder import inference as encoder
from vocoder.hifigan import inference as gan_vocoder
from synthesizer.inference import Synthesizer

class Mandarin:
    """Mandarin voice-cloning pipeline: speaker encoder, synthesizer and HiFi-GAN vocoder.

    Typical use::

        voice = Mandarin()
        voice.setVoice("reference.wav")
        wav, sample_rate = voice.say("some text")

    Constructing an instance loads the encoder and vocoder models from the
    hard-coded checkpoint paths below, which are relative to the current
    working directory.
    """

    # Synthesizer objects keyed by checkpoint path.  Kept on the class (not
    # as a local in __init__) so that every instance created in this process
    # reuses the same Synthesizer instead of building a new one each time.
    _synthesizers_cache = {}

    # Characters treated as sentence boundaries when splitting input text.
    _PUNCTUATION = "!,。、?!,.?::"

    # One or more boundary characters or newlines in a row form one split point.
    _SPLIT_RE = re.compile(r"[{}\n]+".format(_PUNCTUATION))

    def __init__(self):
        """Resolve checkpoint paths and load the encoder, synthesizer and vocoder."""
        self.encoder_path = "encoder/saved_models/pretrained.pt"
        self.vocoder_path = "vocoder/saved_models/pretrained/g_hifigan.pt"
        self.config_fpath = "vocoder/hifigan/config_16k_.json"
        self.accent = "synthesizer/saved_models/普通话.pt"

        cache = Mandarin._synthesizers_cache
        if cache.get(self.accent) is None:
            cache[self.accent] = Synthesizer(Path(self.accent))
        self.current_synt = cache[self.accent]

        encoder.load_model(Path(self.encoder_path))
        gan_vocoder.load_model(Path(self.vocoder_path), self.config_fpath)

    def setVoice(self, timbre):
        """Compute and store the speaker embedding for a reference recording.

        Args:
            timbre: Path of the reference audio; it is passed straight to
                ``librosa.load`` with that function's default arguments.

        Side effects:
            Sets ``self.timbre`` to the given path and ``self.embed`` to the
            first value returned by ``encoder.embed_utterance``.
        """
        self.timbre = timbre
        wav, sample_rate = librosa.load(self.timbre)

        encoder_wav = encoder.preprocess_wav(wav, sample_rate)
        # return_partials=True yields three values; only the embedding is kept.
        self.embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    @staticmethod
    def _split_sentences(text):
        """Split *text* on newlines and punctuation into stripped, non-empty fragments."""
        fragments = (piece.strip() for piece in Mandarin._SPLIT_RE.split(text))
        # Filter after stripping so whitespace-only fragments never reach
        # the synthesizer as empty strings.
        return [fragment for fragment in fragments if fragment]

    def say(self, text):
        """Synthesize *text* with the voice previously set via ``setVoice``.

        Args:
            text: Text to speak.  It is split into fragments at newlines and
                at the characters in ``_PUNCTUATION``; each fragment is
                synthesized with the same speaker embedding.

        Returns:
            ``(wav, sample_rate)`` exactly as returned by
            ``gan_vocoder.infer_waveform``.

        Raises:
            ValueError: If *text* contains nothing but whitespace and
                punctuation (or is empty/None), so there is nothing to say.
        """
        texts = self._split_sentences(text or "")
        if not texts:
            raise ValueError(f"No speakable text found in input: {text!r}")
        embeds = [self.embed] * len(texts)

        specs = self.current_synt.synthesize_spectrograms(texts, embeds)
        # Join the per-fragment spectrograms along axis 1 into a single one
        # (presumably the time axis -- TODO confirm against the synthesizer).
        spec = np.concatenate(specs, axis=1)
        wav, sample_rate = gan_vocoder.infer_waveform(spec)

        return wav, sample_rate


def greet(audio, text, voice=None):
    """Gradio handler: synthesize *text* and write it to a randomly named WAV file.

    Args:
        audio: Uploaded reference recording; only its ``.name`` attribute
            (a file path) is read.
        text: Text to synthesize.
        voice: A previously built ``Mandarin`` instance carried over as
            Gradio state, or ``None`` on the first call.

    Returns:
        ``(output_file, voice)`` -- the name of the WAV file written to the
        current working directory and the ``Mandarin`` instance to keep as
        state for the next call.
    """
    reference_path = audio.name
    print(f"Log print: audio name=[{reference_path}], text=[{text}]")

    speaker = voice
    if speaker is None:
        # First call for this session: build the pipeline, set the reference
        # voice, and run one throw-away synthesis whose output is discarded.
        speaker = Mandarin()
        speaker.setVoice(reference_path)
        speaker.say("加载成功")
    # NOTE(review): when a voice is passed in, `audio` is only logged; the
    # reference embedding is not recomputed from it.
    samples, rate = speaker.say(text)

    # 11 distinct characters drawn from [a-z0-9] form the file stem.
    alphabet = string.ascii_lowercase + string.digits
    stem = "".join(random.sample(alphabet, 11))
    output_file = stem + ".wav"

    write(output_file, rate, samples.astype(np.float32))

    return output_file, speaker

def new_greet(audio, text):
    """Gradio handler: clone the voice in *audio* and speak *text* with it.

    Relies on the module-level ``voice`` object (a ``Mandarin`` instance
    created under the ``__main__`` guard), so it only works once that global
    exists.

    Args:
        audio: File path of the reference recording.
        text: Text to synthesize.

    Returns:
        Name of the WAV file written to the current working directory.

    Raises:
        ValueError: If no reference audio was supplied, or *text* is empty
            or whitespace-only.
    """
    audio_path = audio
    print(f"Log print: audio name=[{audio_path}], text=[{text}]")

    # Fail early with a clear message instead of an opaque error from deep
    # inside audio loading or synthesis when an input is missing.
    if not audio_path:
        raise ValueError("A reference audio clip is required.")
    if not text or not text.strip():
        raise ValueError("The text to synthesize must not be empty.")

    # NOTE(review): setVoice() stores the embedding on the shared global
    # `voice`, and the UI queue is configured for 20 concurrent jobs; if
    # requests really run in parallel, one request's embedding could be
    # replaced by another's before say() runs -- confirm and guard if needed.
    voice.setVoice(audio_path)
    wav, sample_rate = voice.say(text)

    # 11 distinct characters drawn from [a-z0-9] form the file stem.
    output_file = "".join(random.sample(string.ascii_lowercase + string.digits, 11)) + ".wav"

    write(output_file, sample_rate, wav.astype(np.float32))

    return output_file

def main():
    """Launch the ``gr.Interface`` front end that serves ``greet``.

    Inputs are a reference audio file, the text to speak, and a state slot;
    outputs are the generated audio file and the updated state.
    """
    reference_input = gr.inputs.Audio(type="file")
    generated_output = gr.outputs.Audio(type="file")

    demo = gr.Interface(
        fn=greet,
        inputs=[reference_input, "text", "state"],
        outputs=[generated_output, "state"],
        title="Tacotron Zero-short Voice Clone (Chinese Version)",
    )
    demo.launch()

def new_main():
    """Build and launch the ``gr.Blocks`` front end that serves ``new_greet``.

    Layout: a heading and usage notes, then one row with two columns --
    reference audio, text box and Submit button on the left; generated audio
    and a cached example on the right.
    """
    heading = "# <center>Tacotron Zero-short Voice Clone (Chinese Version)</center>"
    sample_text = "大家好,我是正在搬砖的李昊,这是一段合成音频。"
    # Kept byte-for-byte, including the indentation inside the literal.
    usage_notes = """这是Zero-short语音克隆模型Tacotron的中文版本。<br>
            1. 上传想要克隆的人的一段语音,长度3~8秒即可。<br>
            2. 输入想要合成的文本<br>
            3. 点击Submit按钮,稍等30秒便可合成语音
            """

    with gr.Blocks() as demo:
        gr.Markdown(heading)
        gr.Markdown(usage_notes)

        with gr.Row():
            # Left column: user inputs.
            with gr.Column():
                source_clip = gr.Audio(type="filepath", label="Source Audio:")
                tts_text = gr.Textbox(value=sample_text, label="TTS Text:")
                with gr.Row():
                    submit_btn = gr.Button(value="Submit", variant='primary')

            # Right column: result plus a ready-made example.
            with gr.Column():
                result_clip = gr.Audio(type="filepath", label="Output Audio:")
                gr.Markdown("Audio Examples:")
                gr.Examples(
                    examples=[["exp/lihao_01.wav", sample_text]],
                    inputs=[source_clip, tts_text],
                    outputs=[result_clip],
                    fn=new_greet,
                    cache_examples=True,
                )

        submit_btn.click(new_greet, inputs=[source_clip, tts_text], outputs=[result_clip])

    demo.queue(concurrency_count=20)
    demo.launch()

if __name__ == "__main__":
    # Module-level pipeline instance; new_greet() looks this name up as a
    # global, so it must exist before the UI starts handling requests.
    voice = Mandarin()
    # Start the Blocks-based UI.
    new_main()