feat: added v2 support
- README.md +1 -1
- app.py +373 -77
- config.py +18 -12
- infer_pack/models.py +177 -35
- infer_pack/models_onnx.py +76 -18
- requirements.txt +21 -41
- vc_infer_pipeline.py +130 -21
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🎤
 colorFrom: red
 colorTo: purple
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.34.0
 app_file: app.py
 pinned: true
 license: mit
app.py
CHANGED
@@ -1,7 +1,6 @@
 import os
 import glob
 import json
-import argparse
 import traceback
 import logging
 import gradio as gr
@@ -10,37 +9,48 @@ import librosa
 import torch
 import asyncio
 import edge_tts
+import yt_dlp
+import ffmpeg
+import subprocess
+import sys
+import io
+import wave
 from datetime import datetime
 from fairseq import checkpoint_utils
-from infer_pack.models import
+from infer_pack.models import (
+    SynthesizerTrnMs256NSFsid,
+    SynthesizerTrnMs256NSFsid_nono,
+    SynthesizerTrnMs768NSFsid,
+    SynthesizerTrnMs768NSFsid_nono,
+)
 from vc_infer_pipeline import VC
 from config import Config
 config = Config()
 logging.getLogger("numba").setLevel(logging.WARNING)
-limitation = os.getenv("SYSTEM") == "spaces"
+limitation = os.getenv("SYSTEM") == "spaces"

 def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
     def vc_fn(
+        vc_input,
+        vc_upload,
+        tts_text,
+        tts_voice,
         f0_up_key,
+        vc_transform,
         f0_method,
         index_rate,
+        filter_radius,
+        resample_sr,
+        rms_mix_rate,
+        protect,
     ):
         try:
-            if
-            if
-                return "You need to enter text and select a voice", None
-            asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
-            audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
-            else:
-                if input_audio is None:
+            if vc_audio_mode == "Input path" or "Youtube" and vc_input != "":
+                audio, sr = librosa.load(vc_input, sr=16000, mono=True)
+            elif vc_audio_mode == "Upload audio":
+                if vc_upload is None:
                     return "You need to upload an audio", None
-                sampling_rate, audio =
+                sampling_rate, audio = vc_upload
                 duration = audio.shape[0] / sampling_rate
                 if duration > 20 and limitation:
                     return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
@@ -49,31 +59,102 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
                 audio = librosa.to_mono(audio.transpose(1, 0))
                 if sampling_rate != 16000:
                     audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
+            elif vc_audio_mode == "TTS Audio":
+                if len(tts_text) > 100 and limitation:
+                    return "Text is too long", None
+                if tts_text is None or tts_voice is None:
+                    return "You need to enter text and select a voice", None
+                asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
+                audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
             times = [0, 0, 0]
             f0_up_key = int(f0_up_key)
             audio_opt = vc.pipeline(
                 hubert_model,
                 net_g,
+                vc_transform,
                 audio,
+                vc_input,
                 times,
                 f0_up_key,
                 f0_method,
                 file_index,
                 index_rate,
                 if_f0,
+                filter_radius,
+                tgt_sr,
+                resample_sr,
+                rms_mix_rate,
+                version,
+                protect,
                 f0_file=None,
             )
-            )
-            return (tgt_sr, audio_opt)
+            info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
+            print(info)
+            return info, (tgt_sr, audio_opt)
         except:
             info = traceback.format_exc()
             print(info)
             return info, (None, None)
     return vc_fn

+def cut_vocal_and_inst(url, audio_provider, split_model):
+    if url != "":
+        if not os.path.exists("dl_audio"):
+            os.mkdir("dl_audio")
+        if audio_provider == "Youtube":
+            ydl_opts = {
+                'format': 'bestaudio/best',
+                'postprocessors': [{
+                    'key': 'FFmpegExtractAudio',
+                    'preferredcodec': 'wav',
+                }],
+                "outtmpl": 'dl_audio/youtube_audio',
+            }
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                ydl.download([url])
+            audio_path = "dl_audio/youtube_audio.wav"
+        else:
+            # Spotify doesnt work.
+            # Need to find other solution soon.
+            '''
+            command = f"spotdl download {url} --output dl_audio/.wav"
+            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+            print(result.stdout.decode())
+            audio_path = "dl_audio/spotify_audio.wav"
+            '''
+        if split_model == "htdemucs":
+            command = f"demucs --two-stems=vocals {audio_path} -o output"
+            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+            print(result.stdout.decode())
+            return "output/htdemucs/youtube_audio/vocals.wav", "output/htdemucs/youtube_audio/no_vocals.wav", audio_path, "output/htdemucs/youtube_audio/vocals.wav"
+        else:
+            command = f"demucs --two-stems=vocals -n mdx_extra_q {audio_path} -o output"
+            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+            print(result.stdout.decode())
+            return "output/mdx_extra_q/youtube_audio/vocals.wav", "output/mdx_extra_q/youtube_audio/no_vocals.wav", audio_path, "output/mdx_extra_q/youtube_audio/vocals.wav"
+    else:
+        raise gr.Error("URL Required!")
+        return None, None, None, None
+
+def combine_vocal_and_inst(audio_data, audio_volume, split_model):
+    if not os.path.exists("output/result"):
+        os.mkdir("output/result")
+    vocal_path = "output/result/output.wav"
+    output_path = "output/result/combine.mp3"
+    if split_model == "htdemucs":
+        inst_path = "output/htdemucs/youtube_audio/no_vocals.wav"
+    else:
+        inst_path = "output/mdx_extra_q/youtube_audio/no_vocals.wav"
+    with wave.open(vocal_path, "w") as wave_file:
+        wave_file.setnchannels(1)
+        wave_file.setsampwidth(2)
+        wave_file.setframerate(audio_data[0])
+        wave_file.writeframes(audio_data[1].tobytes())
+    command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [1:a]volume={audio_volume}dB[v];[0:a][v]amix=inputs=2:duration=longest -b:a 320k -c:a libmp3lame {output_path}'
+    result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+    print(result.stdout.decode())
+    return output_path
+
 def load_hubert():
     global hubert_model
     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
@@ -88,11 +169,107 @@ def load_hubert():
     hubert_model = hubert_model.float()
     hubert_model.eval()

-def
-    if
-        return
+def change_audio_mode(vc_audio_mode):
+    if vc_audio_mode == "Input path":
+        return (
+            # Input & Upload
+            gr.Textbox.update(visible=True),
+            gr.Audio.update(visible=False),
+            # Youtube
+            gr.Dropdown.update(visible=False),
+            gr.Textbox.update(visible=False),
+            gr.Dropdown.update(visible=False),
+            gr.Button.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Slider.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Button.update(visible=False),
+            # TTS
+            gr.Textbox.update(visible=False),
+            gr.Dropdown.update(visible=False)
+        )
+    elif vc_audio_mode == "Upload audio":
+        return (
+            # Input & Upload
+            gr.Textbox.update(visible=False),
+            gr.Audio.update(visible=True),
+            # Youtube
+            gr.Dropdown.update(visible=False),
+            gr.Textbox.update(visible=False),
+            gr.Dropdown.update(visible=False),
+            gr.Button.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Slider.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Button.update(visible=False),
+            # TTS
+            gr.Textbox.update(visible=False),
+            gr.Dropdown.update(visible=False)
+        )
+    elif vc_audio_mode == "Youtube":
+        return (
+            # Input & Upload
+            gr.Textbox.update(visible=False),
+            gr.Audio.update(visible=False),
+            # Youtube
+            gr.Dropdown.update(visible=True),
+            gr.Textbox.update(visible=True),
+            gr.Dropdown.update(visible=True),
+            gr.Button.update(visible=True),
+            gr.Audio.update(visible=True),
+            gr.Audio.update(visible=True),
+            gr.Audio.update(visible=True),
+            gr.Slider.update(visible=True),
+            gr.Audio.update(visible=True),
+            gr.Button.update(visible=True),
+            # TTS
+            gr.Textbox.update(visible=False),
+            gr.Dropdown.update(visible=False)
+        )
+    elif vc_audio_mode == "TTS Audio":
+        return (
+            # Input & Upload
+            gr.Textbox.update(visible=False),
+            gr.Audio.update(visible=False),
+            # Youtube
+            gr.Dropdown.update(visible=False),
+            gr.Textbox.update(visible=False),
+            gr.Dropdown.update(visible=False),
+            gr.Button.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Slider.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Button.update(visible=False),
+            # TTS
+            gr.Textbox.update(visible=True),
+            gr.Dropdown.update(visible=True)
+        )
     else:
-        return
+        return (
+            # Input & Upload
+            gr.Textbox.update(visible=False),
+            gr.Audio.update(visible=True),
+            # Youtube
+            gr.Dropdown.update(visible=False),
+            gr.Textbox.update(visible=False),
+            gr.Dropdown.update(visible=False),
+            gr.Button.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Slider.update(visible=False),
+            gr.Audio.update(visible=False),
+            gr.Button.update(visible=False),
+            # TTS
+            gr.Textbox.update(visible=False),
+            gr.Dropdown.update(visible=False)
+        )

 if __name__ == '__main__':
     load_hubert()
@@ -121,10 +298,19 @@ if __name__ == '__main__':
     tgt_sr = cpt["config"][-1]
     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
     if_f0 = cpt.get("f0", 1)
+    version = cpt.get("version", "v1")
+    if version == "v1":
+        if if_f0 == 1:
+            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+        else:
+            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+        nodel_version = "V1"
+    elif version == "v2":
+        if if_f0 == 1:
+            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+        else:
+            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+        nodel_version = "V2"
     del net_g.enc_q
     print(net_g.load_state_dict(cpt["weight"], strict=False))
     net_g.eval().to(config.device)
@@ -134,18 +320,13 @@ if __name__ == '__main__':
     net_g = net_g.float()
     vc = VC(tgt_sr, config)
     print(f"Model loaded: {model_name}")
-    models.append((model_name, model_title, model_author, model_cover, create_vc_fn(tgt_sr, net_g, vc, if_f0, model_index)))
+    models.append((model_name, model_title, model_author, model_cover, nodel_version, create_vc_fn(tgt_sr, net_g, vc, if_f0, model_index)))
     categories.append([category_title, category_folder, description, models])
 with gr.Blocks() as app:
     gr.Markdown(
-        "# <center> RVC Genshin Impact\n"
-        "
-        "
-        "### <center> I limit the number of models to 15 due to an error caused by exceeding the available memory. (16 GB limit)\n"
-        "### <center> This project was inspired by [zomehwh](https://huggingface.co/spaces/zomehwh/rvc-models) and [ardha27](https://huggingface.co/spaces/ardha27/rvc-models)\n"
-        "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n\n"
-        "[![Original RVC Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)"
-        "[![RVC Inference Repo](https://badgen.net/badge/icon/github?icon=github&label)](https://github.com/ArkanDash/rvc-inference)"
+        "# <center> RVC Genshin Impact Inference\n"
+        "#### From [Retrieval-based-Voice-Conversion](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)\n"
+        "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/ArkanDash/Multi-Model-RVC-Inference)"
     )
     for (folder_title, folder, description, models) in categories:
         with gr.TabItem(folder_title):
@@ -154,44 +335,159 @@ if __name__ == '__main__':
             with gr.Tabs():
                 if not models:
                     gr.Markdown("# <center> No Model Loaded.")
-                    gr.Markdown("## <center> Please
+                    gr.Markdown("## <center> Please add model or fix your model path.")
                     continue
-                with gr.
+                for (name, title, author, cover, model_version, vc_fn) in models:
+                    with gr.TabItem(name):
+                        with gr.Row():
+                            gr.Markdown(
+                                '<div align="center">'
+                                f'<div>{title}</div>\n'+
+                                f'<div>RVC {model_version} Model</div>\n'+
+                                (f'<div>Model author: {author}</div>' if author else "")+
+                                (f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else "")+
+                                '</div>'
+                            )
+                        with gr.Row():
+                            with gr.Column():
+                                vc_audio_mode = gr.Dropdown(label="Input voice", choices=["Upload audio", "TTS Audio"], allow_custom_value=False, value="Upload audio")
+                                # Input and Upload
+                                vc_input = gr.Textbox(label="Input audio path", visible=False)
+                                vc_upload = gr.Audio(label="Upload audio file", visible=True, interactive=True)
+                                # Youtube
+                                vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
+                                vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
+                                vc_split_model = gr.Dropdown(label="Splitter Model", choices=["htdemucs", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)")
+                                vc_split = gr.Button("Split Audio", variant="primary", visible=False)
+                                vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False)
+                                vc_inst_preview = gr.Audio(label="Instrumental Preview", visible=False)
+                                vc_audio_preview = gr.Audio(label="Audio Preview", visible=False)
+                                # TTS
+                                tts_text = gr.Textbox(visible=False, label="TTS text", info="Text to speech input")
+                                tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
+                            with gr.Column():
+                                spk_item = gr.Slider(
+                                    minimum=0,
+                                    maximum=2333,
+                                    step=1,
+                                    label="Speaker ID",
+                                    info="(Default: 0)",
+                                    value=0,
+                                    interactive=True,
+                                )
+                                vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
+                                f0method0 = gr.Radio(
+                                    label="Pitch extraction algorithm",
+                                    info="PM is fast, Harvest is good but extremely slow (Default: PM)",
+                                    choices=["pm", "harvest"],
+                                    value="pm",
+                                    interactive=True,
+                                )
+                                index_rate1 = gr.Slider(
+                                    minimum=0,
+                                    maximum=1,
+                                    label="Retrieval feature ratio",
+                                    info="(Default: 0.6)",
+                                    value=0.6,
+                                    interactive=True,
+                                )
+                                filter_radius0 = gr.Slider(
+                                    minimum=0,
+                                    maximum=7,
+                                    label="Apply Median Filtering",
+                                    info="The value represents the filter radius and can reduce breathiness.",
+                                    value=3,
+                                    step=1,
+                                    interactive=True,
+                                )
+                                resample_sr0 = gr.Slider(
+                                    minimum=0,
+                                    maximum=48000,
+                                    label="Resample the output audio",
+                                    info="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling",
+                                    value=0,
+                                    step=1,
+                                    interactive=True,
+                                )
+                                rms_mix_rate0 = gr.Slider(
+                                    minimum=0,
+                                    maximum=1,
+                                    label="Volume Envelope",
+                                    info="Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used",
+                                    value=1,
+                                    interactive=True,
+                                )
+                                protect0 = gr.Slider(
+                                    minimum=0,
+                                    maximum=0.5,
+                                    label="Voice Protection",
+                                    info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
+                                    value=0.35,
+                                    step=0.01,
+                                    interactive=True,
+                                )
+                            with gr.Column():
+                                vc_log = gr.Textbox(label="Output Information", interactive=False)
+                                vc_output = gr.Audio(label="Output Audio", interactive=False)
+                                vc_convert = gr.Button("Convert", variant="primary")
+                                vc_volume = gr.Slider(
+                                    minimum=0,
+                                    maximum=10,
+                                    label="Vocal volume",
+                                    value=4,
+                                    interactive=True,
+                                    step=1,
+                                    info="Adjust vocal volume (Default: 4}",
+                                    visible=False
                                 )
+                                vc_combined_output = gr.Audio(label="Output Combined Audio", visible=False)
+                                vc_combine = gr.Button("Combine", variant="primary", visible=False)
+                        vc_convert.click(
+                            fn=vc_fn,
+                            inputs=[
+                                vc_input,
+                                vc_upload,
+                                tts_text,
+                                tts_voice,
+                                spk_item,
+                                vc_transform0,
+                                f0method0,
+                                index_rate1,
+                                filter_radius0,
+                                resample_sr0,
+                                rms_mix_rate0,
+                                protect0,
+                            ],
+                            outputs=[vc_log, vc_output]
+                        )
+                        vc_split.click(
+                            fn=cut_vocal_and_inst,
+                            inputs=[vc_link, vc_download_audio, vc_split_model],
+                            outputs=[vc_vocal_preview, vc_inst_preview, vc_audio_preview]
+                        )
+                        vc_combine.click(
+                            fn=combine_vocal_and_inst,
+                            inputs=[vc_output, vc_volume, vc_split_model],
+                            outputs=[vc_combined_output]
+                        )
+                        vc_audio_mode.change(
+                            fn=change_audio_mode,
+                            inputs=[vc_audio_mode],
+                            outputs=[
+                                vc_input,
+                                vc_upload,
+                                vc_download_audio,
+                                vc_link,
+                                vc_split_model,
+                                vc_split,
+                                vc_vocal_preview,
+                                vc_inst_preview,
+                                vc_audio_preview,
+                                vc_volume,
+                                vc_combined_output,
+                                vc_combine,
+                                tts_text,
+                                tts_voice
+                            ]
+                        )
     app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
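Note: the loading branch above keys everything off the checkpoint itself (cpt["version"] and cpt["f0"]). A minimal sketch, not part of the commit, for checking whether a downloaded RVC checkpoint will take the v1 or v2 path before dropping it into the Space; "model.pth" is a hypothetical local path, the keys are the ones the diff reads:

    import torch

    cpt = torch.load("model.pth", map_location="cpu")
    version = cpt.get("version", "v1")   # "v1" -> 256-dim HuBERT features, "v2" -> 768-dim
    if_f0 = cpt.get("f0", 1)             # 1 -> pitch-conditioned (NSF) synthesizer
    tgt_sr = cpt["config"][-1]           # target sample rate stored in the config list
    n_spk = cpt["weight"]["emb_g.weight"].shape[0]
    print(version, if_f0, tgt_sr, n_spk)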
config.py
CHANGED
@@ -3,6 +3,18 @@ import torch
 from multiprocessing import cpu_count


+def config_file_change_fp32():
+    for config_file in ["32k.json", "40k.json", "48k.json"]:
+        with open(f"configs/{config_file}", "r") as f:
+            strr = f.read().replace("true", "false")
+        with open(f"configs/{config_file}", "w") as f:
+            f.write(strr)
+    with open("trainset_preprocess_pipeline_print.py", "r") as f:
+        strr = f.read().replace("3.7", "3.0")
+    with open("trainset_preprocess_pipeline_print.py", "w") as f:
+        f.write(strr)
+
+
 class Config:
     def __init__(self):
         self.device = "cuda:0"
@@ -36,7 +48,7 @@ class Config:
             action="store_true",
            help="Do not open in browser automatically",
        )
-        parser.add_argument(
+        parser.add_argument("--api", action="store_true", help="Launch with api")
         cmd_opts = parser.parse_args()

         cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865
@@ -47,7 +59,7 @@ class Config:
             cmd_opts.colab,
             cmd_opts.noparallel,
             cmd_opts.noautoopen,
-            cmd_opts.api
+            cmd_opts.api
         )

     def device_config(self) -> tuple:
@@ -63,15 +75,7 @@ class Config:
            ):
                print("16系/10系显卡和P40强制单精度")
                self.is_half = False
-
-                with open(f"configs/{config_file}", "r") as f:
-                    strr = f.read().replace("true", "false")
-                with open(f"configs/{config_file}", "w") as f:
-                    f.write(strr)
-                with open("trainset_preprocess_pipeline_print.py", "r") as f:
-                    strr = f.read().replace("3.7", "3.0")
-                with open("trainset_preprocess_pipeline_print.py", "w") as f:
-                    f.write(strr)
+                config_file_change_fp32()
            else:
                self.gpu_name = None
            self.gpu_mem = int(
@@ -90,10 +94,12 @@ class Config:
            print("没有发现支持的N卡, 使用MPS进行推理")
            self.device = "mps"
            self.is_half = False
+            config_file_change_fp32()
        else:
            print("没有发现支持的N卡, 使用CPU进行推理")
            self.device = "cpu"
            self.is_half = False
+            config_file_change_fp32()

        if self.n_cpu == 0:
            self.n_cpu = cpu_count()
@@ -117,4 +123,4 @@ class Config:
        x_center = 30
        x_max = 32

-        return x_pad, x_query, x_center, x_max
+        return x_pad, x_query, x_center, x_max
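Note: config_file_change_fp32() is a blunt text substitution, so every "true" in configs/32k.json, 40k.json and 48k.json becomes "false" (which is how fp16_run ends up disabled on GPUs forced to single precision, MPS and CPU). A stand-alone sketch of the same idea on a hypothetical example.json, assumptions only:

    with open("example.json", "r") as f:
        text = f.read().replace("true", "false")
    with open("example.json", "w") as f:
        f.write(text)

With the new --api flag parsed here, the Space can be started as, for example, python app.py --api, and config.api is then forwarded to app.queue(..., api_open=config.api).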
infer_pack/models.py
CHANGED
@@ -61,7 +61,7 @@ class TextEncoder256(nn.Module):
         return m, logs, x_mask


-class
+class TextEncoder768(nn.Module):
     def __init__(
         self,
         out_channels,
@@ -81,14 +81,14 @@ class TextEncoder256Sim(nn.Module):
         self.n_layers = n_layers
         self.kernel_size = kernel_size
         self.p_dropout = p_dropout
-        self.emb_phone = nn.Linear(
+        self.emb_phone = nn.Linear(768, hidden_channels)
         self.lrelu = nn.LeakyReLU(0.1, inplace=True)
         if f0 == True:
             self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
         self.encoder = attentions.Encoder(
             hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
         )
-        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

     def forward(self, phone, pitch, lengths):
         if pitch == None:
@@ -102,8 +102,10 @@ class TextEncoder256Sim(nn.Module):
             x.dtype
         )
         x = self.encoder(x * x_mask, x_mask)
+        stats = self.proj(x) * x_mask
+
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        return m, logs, x_mask


 class ResidualCouplingBlock(nn.Module):
@@ -638,6 +640,117 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
         return o, x_mask, (z, z_p, m_p, logs_p)


+class SynthesizerTrnMs768NSFsid(nn.Module):
+    def __init__(
+        self,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        spk_embed_dim,
+        gin_channels,
+        sr,
+        **kwargs
+    ):
+        super().__init__()
+        if type(sr) == type("strr"):
+            sr = sr2sr[sr]
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        # self.hop_length = hop_length#
+        self.spk_embed_dim = spk_embed_dim
+        self.enc_p = TextEncoder768(
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+        )
+        self.dec = GeneratorNSF(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels,
+            sr=sr,
+            is_half=kwargs["is_half"],
+        )
+        self.enc_q = PosteriorEncoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+
+    def remove_weight_norm(self):
+        self.dec.remove_weight_norm()
+        self.flow.remove_weight_norm()
+        self.enc_q.remove_weight_norm()
+
+    def forward(
+        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
+    ):  # 这里ds是id,[bs,1]
+        # print(1,pitch.shape)#[bs,t]
+        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]##1是t,广播的
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+        z_p = self.flow(z, y_mask, g=g)
+        z_slice, ids_slice = commons.rand_slice_segments(
+            z, y_lengths, self.segment_size
+        )
+        # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
+        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
+        # print(-2,pitchf.shape,z_slice.shape)
+        o = self.dec(z_slice, pitchf, g=g)
+        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+
+    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
+        g = self.emb_g(sid).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
+        return o, x_mask, (z, z_p, m_p, logs_p)
+
+
 class SynthesizerTrnMs256NSFsid_nono(nn.Module):
     def __init__(
         self,
@@ -740,11 +853,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
         return o, x_mask, (z, z_p, m_p, logs_p)


-class
-    """
-    Synthesizer for Training
-    """
-
+class SynthesizerTrnMs768NSFsid_nono(nn.Module):
     def __init__(
         self,
         spec_channels,
@@ -763,9 +872,8 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
         upsample_initial_channel,
         upsample_kernel_sizes,
         spk_embed_dim,
-        use_sdp=True,
+        gin_channels,
+        sr=None,
         **kwargs
     ):
         super().__init__()
@@ -787,7 +895,7 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
         self.gin_channels = gin_channels
         # self.hop_length = hop_length#
         self.spk_embed_dim = spk_embed_dim
-        self.enc_p =
+        self.enc_p = TextEncoder768(
             inter_channels,
             hidden_channels,
             filter_channels,
@@ -795,8 +903,9 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
             n_layers,
             kernel_size,
             p_dropout,
+            f0=False,
         )
-        self.dec =
+        self.dec = Generator(
             inter_channels,
             resblock,
             resblock_kernel_sizes,
@@ -805,9 +914,16 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
             upsample_initial_channel,
             upsample_kernel_sizes,
             gin_channels=gin_channels,
-            is_half=kwargs["is_half"],
         )
+        self.enc_q = PosteriorEncoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
         self.flow = ResidualCouplingBlock(
             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
         )
@@ -819,28 +935,24 @@ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
         self.flow.remove_weight_norm()
         self.enc_q.remove_weight_norm()

-    def forward(
-        self, phone, phone_lengths, pitch, pitchf, y_lengths, ds
-    ):  # y是spec不需要了现在
+    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # 这里ds是id,[bs,1]
         g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]##1是t,广播的
+        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+        z_p = self.flow(z, y_mask, g=g)
         z_slice, ids_slice = commons.rand_slice_segments(
+            z, y_lengths, self.segment_size
         )
+        o = self.dec(z_slice, g=g)
+        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

-        self
-        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]##1是t,广播的
-        x, x_mask = self.enc_p(phone, pitch, phone_lengths)
-        x = self.flow(x, x_mask, g=g, reverse=True)
-        o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
-        return o, o
+    def infer(self, phone, phone_lengths, sid, max_len=None):
+        g = self.emb_g(sid).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
+        return o, x_mask, (z, z_p, m_p, logs_p)


 class MultiPeriodDiscriminator(torch.nn.Module):
@@ -873,6 +985,36 @@ class MultiPeriodDiscriminator(torch.nn.Module):
         return y_d_rs, y_d_gs, fmap_rs, fmap_gs


+class MultiPeriodDiscriminatorV2(torch.nn.Module):
+    def __init__(self, use_spectral_norm=False):
+        super(MultiPeriodDiscriminatorV2, self).__init__()
+        # periods = [2, 3, 5, 7, 11, 17]
+        periods = [2, 3, 5, 7, 11, 17, 23, 37]
+
+        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+        discs = discs + [
+            DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
+        ]
+        self.discriminators = nn.ModuleList(discs)
+
+    def forward(self, y, y_hat):
+        y_d_rs = []  #
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for i, d in enumerate(self.discriminators):
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            # for j in range(len(fmap_r)):
+            #     print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
+            y_d_rs.append(y_d_r)
+            y_d_gs.append(y_d_g)
+            fmap_rs.append(fmap_r)
+            fmap_gs.append(fmap_g)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
 class DiscriminatorS(torch.nn.Module):
     def __init__(self, use_spectral_norm=False):
         super(DiscriminatorS, self).__init__()
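Note: the v2 classes swap the 256-dim HuBERT features for 768-dim ones, so TextEncoder768 uses nn.Linear(768, hidden_channels) and a final 1x1 Conv1d that emits 2 * out_channels, split into mean and log-variance, exactly as the added return statement shows. A dummy-tensor shape sketch (assumptions only, not repo code) of that projection and split:

    import torch
    import torch.nn as nn

    hidden, out_channels, frames = 192, 192, 100
    emb_phone = nn.Linear(768, hidden)           # 768-dim content features in
    proj = nn.Conv1d(hidden, out_channels * 2, 1)

    phone = torch.randn(1, frames, 768)          # [batch, time, feat]
    x = emb_phone(phone).transpose(1, 2)         # -> [batch, hidden, time] for Conv1d
    stats = proj(x)                              # -> [batch, 2 * out_channels, time]
    m, logs = torch.split(stats, out_channels, dim=1)
    print(m.shape, logs.shape)                   # both torch.Size([1, 192, 100])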
infer_pack/models_onnx.py
CHANGED
@@ -61,7 +61,7 @@ class TextEncoder256(nn.Module):
         return m, logs, x_mask


-class
+class TextEncoder768(nn.Module):
     def __init__(
         self,
         out_channels,
@@ -81,14 +81,14 @@ class TextEncoder256Sim(nn.Module):
         self.n_layers = n_layers
         self.kernel_size = kernel_size
         self.p_dropout = p_dropout
-        self.emb_phone = nn.Linear(
+        self.emb_phone = nn.Linear(768, hidden_channels)
         self.lrelu = nn.LeakyReLU(0.1, inplace=True)
         if f0 == True:
             self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
         self.encoder = attentions.Encoder(
             hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
         )
-        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

     def forward(self, phone, pitch, lengths):
         if pitch == None:
@@ -102,8 +102,10 @@ class TextEncoder256Sim(nn.Module):
             x.dtype
         )
         x = self.encoder(x * x_mask, x_mask)
+        stats = self.proj(x) * x_mask
+
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        return m, logs, x_mask


 class ResidualCouplingBlock(nn.Module):
@@ -527,7 +529,7 @@ sr2sr = {
 }


-class
+class SynthesizerTrnMsNSFsidM(nn.Module):
     def __init__(
         self,
         spec_channels,
@@ -571,15 +573,26 @@ class SynthesizerTrnMs256NSFsidO(nn.Module):
         self.gin_channels = gin_channels
         # self.hop_length = hop_length#
         self.spk_embed_dim = spk_embed_dim
-        self.
+        if self.gin_channels == 256:
+            self.enc_p = TextEncoder256(
+                inter_channels,
+                hidden_channels,
+                filter_channels,
+                n_heads,
+                n_layers,
+                kernel_size,
+                p_dropout,
+            )
+        else:
+            self.enc_p = TextEncoder768(
+                inter_channels,
+                hidden_channels,
+                filter_channels,
+                n_heads,
+                n_layers,
+                kernel_size,
+                p_dropout,
+            )
         self.dec = GeneratorNSF(
             inter_channels,
             resblock,
@@ -605,6 +618,7 @@ class SynthesizerTrnMs256NSFsidO(nn.Module):
             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
         )
         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        self.speaker_map = None
         print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

     def remove_weight_norm(self):
@@ -612,10 +626,24 @@ class SynthesizerTrnMs256NSFsidO(nn.Module):
         self.flow.remove_weight_norm()
         self.enc_q.remove_weight_norm()

-    def
+    def construct_spkmixmap(self, n_speaker):
+        self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels))
+        for i in range(n_speaker):
+            self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
+        self.speaker_map = self.speaker_map.unsqueeze(0)
+
+    def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
+        if self.speaker_map is not None:  # [N, S] * [S, B, 1, H]
+            g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1))  # [N, S, B, 1, 1]
+            g = g * self.speaker_map  # [N, S, B, 1, H]
+            g = torch.sum(g, dim=1)  # [N, 1, B, 1, H]
+            g = g.transpose(0, -1).transpose(0, -2).squeeze(0)  # [B, H, N]
+        else:
+            g = g.unsqueeze(0)
+            g = self.emb_g(g).transpose(1, 2)
+
         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
-        z_p = (m_p + torch.exp(logs_p) *
+        z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
         z = self.flow(z_p, x_mask, g=g, reverse=True)
         o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
         return o
@@ -651,6 +679,36 @@ class MultiPeriodDiscriminator(torch.nn.Module):
         return y_d_rs, y_d_gs, fmap_rs, fmap_gs


+class MultiPeriodDiscriminatorV2(torch.nn.Module):
+    def __init__(self, use_spectral_norm=False):
+        super(MultiPeriodDiscriminatorV2, self).__init__()
+        # periods = [2, 3, 5, 7, 11, 17]
+        periods = [2, 3, 5, 7, 11, 17, 23, 37]
+
+        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+        discs = discs + [
+            DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
+        ]
+        self.discriminators = nn.ModuleList(discs)
+
+    def forward(self, y, y_hat):
+        y_d_rs = []  #
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for i, d in enumerate(self.discriminators):
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            # for j in range(len(fmap_r)):
+            #     print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
+            y_d_rs.append(y_d_r)
+            y_d_gs.append(y_d_g)
+            fmap_rs.append(fmap_r)
+            fmap_gs.append(fmap_g)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
 class DiscriminatorS(torch.nn.Module):
     def __init__(self, use_spectral_norm=False):
         super(DiscriminatorS, self).__init__()
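Note: the speaker_map path added to the ONNX model caches every speaker embedding in construct_spkmixmap() and blends them with per-frame weights g of shape [N, S] before decoding. A dummy-tensor sketch of just that broadcast-and-reduce (assumed shapes with batch size 1, not repo code):

    import torch

    N, S, H = 5, 4, 256                           # frames, speakers, gin_channels
    speaker_map = torch.randn(1, S, 1, 1, H)      # [1, S, 1, 1, H], as built from emb_g
    g = torch.softmax(torch.randn(N, S), dim=1)   # per-frame speaker weights

    g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1))    # [N, S, 1, 1, 1]
    g = g * speaker_map                                 # [N, S, 1, 1, H]
    g = torch.sum(g, dim=1)                             # [N, 1, 1, H]
    g = g.transpose(0, -1).transpose(0, -2).squeeze(0)  # [1, H, N]
    print(g.shape)                                      # torch.Size([1, 256, 5])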
requirements.txt
CHANGED
@@ -1,46 +1,26 @@
-librosa==0.9.2
-llvmlite==0.39.0
+setuptools
+wheel
+httpx==0.23.0
 fairseq==0.12.2
-faiss-cpu==1.7.0; sys_platform == "darwin"
-faiss-cpu==1.7.2; sys_platform != "darwin"
 gradio
-Markdown
-matplotlib>=3.7.1
-matplotlib-inline>=0.1.6
-praat-parselmouth>=0.4.3
-Pillow>=9.1.1
+ffmpeg
+praat-parselmouth
+pyworld
+numpy==1.23.5
+numba==0.56.4
+librosa==0.9.2
+faiss-cpu==1.7.3
+faiss-gpu
+scipy==1.9.3
 pyworld>=0.3.2
-resampy>=0.4.2
-scikit-learn>=1.2.2
-starlette>=0.26.1
 tensorboard
-sympy>=1.11.1
-tabulate>=0.9.0
-PyYAML>=6.0
-pyasn1>=0.4.8
-pyasn1-modules>=0.2.8
-fsspec>=2023.3.0
-absl-py>=1.4.0
-audioread
-uvicorn>=0.21.1
-colorama>=0.4.6
+tensorboardX
+onnxruntime
+pyngrok==4.1.12
+soundfile>=0.12.1
+tqdm>=4.63.1
+torchcrepe
+asyncio
 edge-tts
+demucs
+yt_dlp
vc_infer_pipeline.py
CHANGED
@@ -2,11 +2,50 @@ import numpy as np, parselmouth, torch, pdb
 from time import time as ttime
 import torch.nn.functional as F
 import scipy.signal as signal
-import pyworld, os, traceback, faiss
+import pyworld, os, traceback, faiss, librosa, torchcrepe
 from scipy import signal
+from functools import lru_cache

 bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)

+input_audio_path2wav = {}
+
+
+@lru_cache
+def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
+    audio = input_audio_path2wav[input_audio_path]
+    f0, t = pyworld.harvest(
+        audio,
+        fs=fs,
+        f0_ceil=f0max,
+        f0_floor=f0min,
+        frame_period=frame_period,
+    )
+    f0 = pyworld.stonemask(audio, f0, t, fs)
+    return f0
+
+
+def change_rms(data1, sr1, data2, sr2, rate):  # 1是输入音频,2是输出音频,rate是2的占比
+    # print(data1.max(),data2.max())
+    rms1 = librosa.feature.rms(
+        y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
+    )  # 每半秒一个点
+    rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
+    rms1 = torch.from_numpy(rms1)
+    rms1 = F.interpolate(
+        rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
+    ).squeeze()
+    rms2 = torch.from_numpy(rms2)
+    rms2 = F.interpolate(
+        rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
+    ).squeeze()
+    rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
+    data2 *= (
+        torch.pow(rms1, torch.tensor(1 - rate))
+        * torch.pow(rms2, torch.tensor(rate - 1))
+    ).numpy()
+    return data2
+

 class VC(object):
     def __init__(self, tgt_sr, config):
@@ -27,7 +66,17 @@ class VC(object):
         self.t_max = self.sr * self.x_max  # 免查询时长阈值
         self.device = config.device

-    def get_f0(
+    def get_f0(
+        self,
+        input_audio_path,
+        x,
+        p_len,
+        f0_up_key,
+        f0_method,
+        filter_radius,
+        inp_f0=None,
+    ):
+        global input_audio_path2wav
         time_step = self.window / self.sr * 1000
         f0_min = 50
         f0_max = 1100
@@ -50,15 +99,31 @@ class VC(object):
                 f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
             )
         elif f0_method == "harvest":
-            )
-            f0 =
+            input_audio_path2wav[input_audio_path] = x.astype(np.double)
+            f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
+            if filter_radius > 2:
+                f0 = signal.medfilt(f0, 3)
+        elif f0_method == "crepe":
+            model = "full"
+            # Pick a batch size that doesn't cause memory errors on your gpu
+            batch_size = 512
+            # Compute pitch using first gpu
+            audio = torch.tensor(np.copy(x))[None].float()
+            f0, pd = torchcrepe.predict(
+                audio,
+                self.sr,
+                self.window,
+                f0_min,
+                f0_max,
+                model,
+                batch_size=batch_size,
+                device=self.device,
+                return_periodicity=True,
             )
+            pd = torchcrepe.filter.median(pd, 3)
+            f0 = torchcrepe.filter.mean(f0, 3)
+            f0[pd < 0.1] = 0
+            f0 = f0[0].cpu().numpy()
         f0 *= pow(2, f0_up_key / 12)
         # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
         tf0 = self.sr // self.window  # 每秒f0点数
@@ -96,6 +161,8 @@ class VC(object):
         index,
         big_npy,
         index_rate,
+        version,
+        protect
     ):  # ,file_index,file_big_npy
         feats = torch.from_numpy(audio0)
         if self.is_half:
@@ -111,13 +178,14 @@ class VC(object):
         inputs = {
             "source": feats.to(self.device),
             "padding_mask": padding_mask,
-            "output_layer": 9
+            "output_layer": 9 if version == "v1" else 12,
         }
         t0 = ttime()
         with torch.no_grad():
             logits = model.extract_features(**inputs)
-            feats = model.final_proj(logits[0])
+            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
+        if(protect<0.5):
+            feats0=feats.clone()
         if (
             isinstance(index, type(None)) == False
             and isinstance(big_npy, type(None)) == False
@@ -143,6 +211,8 @@ class VC(object):
             )

         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+        if(protect<0.5):
+            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
         t1 = ttime()
         p_len = audio0.shape[0] // self.window
         if feats.shape[1] < p_len:
@@ -150,23 +220,26 @@ class VC(object):
         if pitch != None and pitchf != None:
             pitch = pitch[:, :p_len]
             pitchf = pitchf[:, :p_len]
+
+        if(protect<0.5):
+            pitchff = pitchf.clone()
+            pitchff[pitchf > 0] = 1
+            pitchff[pitchf < 1] = protect
+            pitchff = pitchff.unsqueeze(-1)
+            feats = feats * pitchff + feats0 * (1 - pitchff)
+            feats=feats.to(feats0.dtype)
         p_len = torch.tensor([p_len], device=self.device).long()
         with torch.no_grad():
             if pitch != None and pitchf != None:
                 audio1 = (
-                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
+                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
                     .data.cpu()
                     .float()
                     .numpy()
-                    .astype(np.int16)
                 )
             else:
                 audio1 = (
-                    (net_g.infer(feats, p_len, sid)[0][0, 0]
-                    .data.cpu()
-                    .float()
-                    .numpy()
-                    .astype(np.int16)
+                    (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
                 )
         del feats, p_len, padding_mask
         if torch.cuda.is_available():
@@ -182,6 +255,7 @@ class VC(object):
         net_g,
         sid,
         audio,
+        input_audio_path,
         times,
         f0_up_key,
         f0_method,
@@ -189,6 +263,12 @@ class VC(object):
         # file_big_npy,
         index_rate,
         if_f0,
+        filter_radius,
+        tgt_sr,
+        resample_sr,
+        rms_mix_rate,
+        version,
+        protect,
         f0_file=None,
     ):
         if (
@@ -243,9 +323,19 @@ class VC(object):
         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
         pitch, pitchf = None, None
         if if_f0 == 1:
-            pitch, pitchf = self.get_f0(
+            pitch, pitchf = self.get_f0(
+                input_audio_path,
+                audio_pad,
+                p_len,
+                f0_up_key,
+                f0_method,
+                filter_radius,
+                inp_f0,
+            )
             pitch = pitch[:p_len]
             pitchf = pitchf[:p_len]
+            if self.device == "mps":
+                pitchf = pitchf.astype(np.float32)
             pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
         t2 = ttime()
@@ -265,6 +355,8 @@ class VC(object):
                         index,
                         big_npy,
                         index_rate,
+                        version,
+                        protect
                     )[self.t_pad_tgt : -self.t_pad_tgt]
                 )
             else:
@@ -280,6 +372,8 @@ class VC(object):
                         index,
                         big_npy,
                         index_rate,
+                        version,
+                        protect
                     )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            s = t
@@ -296,6 +390,8 @@ class VC(object):
                     index,
                     big_npy,
                     index_rate,
+                    version,
+                    protect
                 )[self.t_pad_tgt : -self.t_pad_tgt]
             )
         else:
@@ -311,9 +407,22 @@ class VC(object):
                 index,
                 big_npy,
                 index_rate,
+                version,
+                protect
             )[self.t_pad_tgt : -self.t_pad_tgt]
         )
         audio_opt = np.concatenate(audio_opt)
+        if rms_mix_rate != 1:
+            audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
+        if resample_sr >= 16000 and tgt_sr != resample_sr:
+            audio_opt = librosa.resample(
+                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
+            )
+        audio_max = np.abs(audio_opt).max() / 0.99
+        max_int16 = 32768
+        if audio_max > 1:
+            max_int16 /= audio_max
+        audio_opt = (audio_opt * max_int16).astype(np.int16)
         del pitch, pitchf, sid
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
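Note: the new change_rms() scales the output by rms_in**(1 - rate) * rms_out**(rate - 1), so rate=1 leaves the output envelope untouched and rate=0 rescales it to the input's RMS, matching the "Volume Envelope" slider description in app.py. A quick numerical check of that formula only (illustrative values, not from the repo):

    rms_in, rms_out = 0.20, 0.05
    for rate in (1.0, 0.5, 0.0):
        scale = rms_in ** (1 - rate) * rms_out ** (rate - 1)
        print(rate, round(scale, 3))   # 1.0 -> 1.0, 0.5 -> 2.0, 0.0 -> 4.0

The protect path works on the same blending idea: where pitchf reports an unvoiced frame, feats is mixed back toward the unindexed feats0 with weight protect, which is why protect=0.5 effectively disables the feature.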