import os
import uuid
from zipfile import ZipFile

import gradio as gr
from scipy.io import wavfile

from inference.infer_tool import Svc
from vextract.vocal_extract import VEX


# os.environ['CUDA_VISIBLE_DEVICES'] = '1,2'


class VitsGradio:
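    """Gradio front end for the Sovits singing voice conversion toolchain.

    Wraps Svc (voice conversion) and VEX (vocal extraction) behind three
    panels: vocal extraction, single-file conversion, and batch conversion.
    """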
    def __init__(self):
        self.so = Svc()
        self.v = VEX()
        self.lspk = []
        self.modelPaths = []
        # Each top-level subdirectory of "checkpoints" is a selectable model.
        for root, dirs, files in os.walk("checkpoints"):
            self.modelPaths.extend(dirs)
            break  # first level only; don't descend into model subfolders
        with gr.Blocks(title="Sovits Singing Synthesis Tool") as self.Vits:
            gr.Markdown(
                """
                # Singing Synthesis Tool
                - Select the voice model, device, and operating mode in order, then click "Load Model"
                - The input audio must be clean, isolated vocals
                """
            )
            with gr.Tab("Vocal Extraction"):
                with gr.Row():
                    with gr.Column():
                        sample_audio = gr.Audio(label="Input Audio")
                        extractAudioBtn = gr.Button("Extract Vocals")
                with gr.Row():
                    with gr.Column():
                        self.sample_vocal_output = gr.Audio(label="Vocals")
                        self.sample_accompaniment_output = gr.Audio(label="Accompaniment")
                extractAudioBtn.click(self.v.separate, inputs=[sample_audio],
                                      outputs=[self.sample_vocal_output, self.sample_accompaniment_output],
                                      show_progress=True, api_name="extract")
            with gr.Tab("Singing Synthesis"):
                with gr.Row(visible=False) as self.VoiceConversion:
                    with gr.Column():
                        with gr.Row():
                            with gr.Column():
                                self.srcaudio = gr.Audio(label="Input Audio")
                                self.btnVC = gr.Button("Speaker Conversion")
                            with gr.Column():
                                with gr.Row():
                                    with gr.Column():
                                        self.dsid0 = gr.Dropdown(label="Target Character", choices=self.lspk)
                                        self.tran = gr.Slider(label="Pitch Shift", maximum=60, minimum=-60, step=1, value=0)
                                        self.th = gr.Slider(label="Slice Threshold", maximum=32767, minimum=-32768, step=0.1,
                                                            value=-40)
                                        self.ns = gr.Slider(label="Noise Level", maximum=1.0, minimum=0.0, step=0.1,
                                                            value=0.4)
                        with gr.Row():
                            self.VCOutputs = gr.Audio(label="Output Audio")
                    self.btnVC.click(self.so.inference, inputs=[self.srcaudio, self.dsid0, self.tran, self.th, self.ns],
                                     outputs=[self.VCOutputs], show_progress=True, api_name="run")

                with gr.Row(visible=False) as self.VoiceBatchConversion:
                    with gr.Column():
                        with gr.Row():
                            with gr.Column():
                                # Distinct attribute names so these widgets do not shadow
                                # their single-file counterparts defined above.
                                self.srcaudios = gr.Files(label="Upload Multiple Audio Files", file_types=['.wav'],
                                                          interactive=True)
                                self.btnBatchVC = gr.Button("Speaker Conversion")
                            with gr.Column():
                                with gr.Row():
                                    with gr.Column():
                                        self.dsid1 = gr.Dropdown(label="Target Character", choices=self.lspk)
                                        self.tran1 = gr.Slider(label="Pitch Shift", maximum=60, minimum=-60, step=1, value=0)
                                        self.th1 = gr.Slider(label="Slice Threshold", maximum=32767, minimum=-32768, step=0.1,
                                                             value=-40)
                                        self.ns1 = gr.Slider(label="Noise Level", maximum=1.0, minimum=0.0, step=0.1,
                                                             value=0.4)
                        with gr.Row():
                            self.VCBatchOutputs = gr.File(label="Output Zip File", interactive=False)
                    self.btnBatchVC.click(self.batch_inference,
                                          inputs=[self.srcaudios, self.dsid1, self.tran1, self.th1, self.ns1],
                                          outputs=[self.VCBatchOutputs], show_progress=True, api_name="batch")

                with gr.Row():
                    with gr.Column():
                        modelstrs = gr.Dropdown(label="Model", choices=self.modelPaths,
                                                value=self.modelPaths[0] if self.modelPaths else None,
                                                type="value")
                        devicestrs = gr.Dropdown(label="Device", choices=["cpu", "cuda"], value="cuda", type="value")
                        isbatchmod = gr.Radio(label="Operating Mode", choices=["single", "batch"], value="single",
                                              info="single: Single file processing. batch: Batch processing supports uploading multiple files")
                        btnMod = gr.Button("Load Model")
                        btnMod.click(self.loadModel, inputs=[modelstrs, devicestrs, isbatchmod],
                                     outputs=[self.dsid0, self.dsid1, self.VoiceConversion, self.VoiceBatchConversion],
                                     show_progress=True, api_name="switch")

    def batch_inference(self, files, chara, tran, slice_db, ns, progress=gr.Progress()):
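        """Convert a batch of uploaded .wav files and return one zip of results.

        Each upload is read with scipy, run through Svc.inference with the
        selected speaker, pitch shift, slice threshold, and noise settings,
        written into a per-batch temp directory, and finally zipped.
        """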

        temp_directory = "temp"
        os.makedirs(temp_directory, exist_ok=True)

        progress(0.00, desc="Initializing Directory")
        # Use a unique per-batch work directory so concurrent jobs don't collide.
        tmp_workdir_name = f"{temp_directory}/batch_{uuid.uuid4()}"
        os.makedirs(tmp_workdir_name, exist_ok=True)

        progress(0.10, desc="Initializing Directory")

        output_files = []

        for idx, file in enumerate(files):
            filename = os.path.basename(file.name)
            progress(0.10 + (0.70 / float(len(files))) * (idx + 1.00), desc=f"Processing Audio {(idx + 1)}/{len(files)}: {filename}")
            print(f"{idx}, {file}, {filename}")
            sampling_rate, audio = wavfile.read(file.name)
            output_sampling_rate, output_audio = self.so.inference((sampling_rate, audio), chara=chara, tran=tran,
                                                                   slice_db=slice_db, ns=ns)
            new_filepath = f"{tmp_workdir_name}/{filename}"
            wavfile.write(filename=new_filepath, rate=output_sampling_rate, data=output_audio)
            output_files.append(new_filepath)

        progress(0.80, desc="Audio Processing Complete")

        # Bundle every converted file into a single zip for download.
        zipfilename = f"{tmp_workdir_name}/output.zip"
        with ZipFile(zipfilename, "w") as zip_obj:
            for filepath in output_files:
                zip_obj.write(filepath, os.path.basename(filepath))
        progress(0.90, desc="Compression Complete")
        # TODO: remove the temporary work directory once the zip has been served
        progress(1.00, desc="Cleaning Up")
        return zipfilename

    def loadModel(self, path, device, process_mode):
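        """Load the chosen checkpoint onto the given device and refresh the UI.

        Returns gr.update objects that repopulate both speaker dropdowns and
        toggle visibility between the single-file and batch conversion rows.
        """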
        print(f"path: {path}, device: {device}")
        self.so.set_device(device)
        print("device set.")
        self.so.load_checkpoint(path)
        print("checkpoint loaded")
        # Speaker names come from the checkpoint's hparams speaker map.
        self.lspk = list(self.so.hps_ms.spk.keys())
        print(f"LSPK: {self.lspk}")
        if process_mode == "single":
            VChange = gr.update(visible=True)
            VBChange = gr.update(visible=False)
        else:
            VChange = gr.update(visible=False)
            VBChange = gr.update(visible=True)
        first_spk = self.lspk[0] if self.lspk else None
        SD0Change = gr.update(choices=self.lspk, value=first_spk)
        SD1Change = gr.update(choices=self.lspk, value=first_spk)
        print("All set. Updating display")
        return [SD0Change, SD1Change, VChange, VBChange]


if __name__ == "__main__":
    grVits = VitsGradio()
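    # queue() is required for gr.Progress tracking and caps concurrent jobs;
    # share=True exposes a temporary public Gradio link in addition to port 7870.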
    grVits.Vits\
        .queue(concurrency_count=20, status_update_rate=5.0)\
        .launch(server_port=7870, share=True, show_api=False)