Vijish committed on
Commit dd0fa64
1 Parent(s): f1fab6f

Upload 4 files

Files changed (4)
  1. config.py +79 -0
  2. handler.py +74 -117
  3. vc_infer_pipeline.py +451 -0
  4. voice_processing.py +230 -0
config.py ADDED
@@ -0,0 +1,79 @@
+ import argparse
+ import sys
+ import torch
+ from multiprocessing import cpu_count
+
+ class Config:
+     def __init__(self):
+         self.device = "cuda:0"
+         self.is_half = True
+         self.n_cpu = 0
+         self.gpu_name = None
+         self.gpu_mem = None
+         self.python_cmd, self.listen_port, self.iscolab, self.noparallel, self.noautoopen = self.arg_parse()
+         self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
+
+     @staticmethod
+     def arg_parse() -> tuple:
+         exe = sys.executable or "python"
+         parser = argparse.ArgumentParser()
+         parser.add_argument("--port", type=int, default=7865, help="Listen port")
+         parser.add_argument("--pycmd", type=str, default=exe, help="Python command")
+         parser.add_argument("--colab", action="store_true", help="Launch in colab")
+         parser.add_argument("--noparallel", action="store_true", help="Disable parallel processing")
+         parser.add_argument("--noautoopen", action="store_true", help="Do not open in browser automatically")
+
+         if len(sys.argv) > 1 and sys.argv[0].endswith("colab_kernel_launcher.py"):
+             args = parser.parse_known_args(sys.argv[1:])[0]
+         else:
+             args = parser.parse_args()
+
+         args.port = args.port if 0 <= args.port <= 65535 else 7865
+
+         return args.pycmd, args.port, args.colab, args.noparallel, args.noautoopen
+
+     @staticmethod
+     def has_mps() -> bool:
+         if not torch.backends.mps.is_available():
+             return False
+         try:
+             torch.zeros(1).to(torch.device("mps"))
+             return True
+         except Exception:
+             return False
+
+     def device_config(self) -> tuple:
+         if torch.cuda.is_available():
+             i_device = int(self.device.split(":")[-1])
+             self.gpu_name = torch.cuda.get_device_name(i_device)
+             if ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) or "P40" in self.gpu_name.upper() or "1060" in self.gpu_name or "1070" in self.gpu_name or "1080" in self.gpu_name:
+                 print("Found GPU", self.gpu_name, ", force to fp32")
+                 self.is_half = False
+             else:
+                 print("Found GPU", self.gpu_name)
+             self.gpu_mem = int(torch.cuda.get_device_properties(i_device).total_memory / 1024 / 1024 / 1024 + 0.4)
+         elif self.has_mps():
+             print("No supported Nvidia GPU found, use MPS instead")
+             self.device = "mps"
+             self.is_half = False
+         else:
+             print("No supported Nvidia GPU found, use CPU instead")
+             self.device = "cpu"
+             self.is_half = False
+
+         if self.n_cpu == 0:
+             self.n_cpu = cpu_count()
+
+         if self.is_half:
+             x_pad, x_query, x_center, x_max = 3, 10, 60, 65
+         else:
+             x_pad, x_query, x_center, x_max = 1, 6, 38, 41
+
+         if self.gpu_mem is not None and self.gpu_mem <= 4:
+             x_pad, x_query, x_center, x_max = 1, 5, 30, 32
+
+         return x_pad, x_query, x_center, x_max
+
+ if __name__ == "__main__":
+     config = Config()
+     print(config.__dict__)
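For reference, a minimal usage sketch of the new Config class (illustrative only, not part of the commit; it assumes the file above is importable as config.py and that argparse sees no unexpected CLI flags):

from config import Config

cfg = Config()
# device_config() already ran inside __init__, so the derived fields are populated.
print(cfg.device, cfg.is_half)                           # e.g. "cuda:0" and True on a supported GPU
print(cfg.x_pad, cfg.x_query, cfg.x_center, cfg.x_max)   # chunking parameters consumed by vc_infer_pipeline.VC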
handler.py CHANGED
@@ -1,118 +1,75 @@
- from pydantic import BaseModel
- from environs import Env
- from typing import List, Dict, Any
- import os
- import base64
- import numpy as np
- import librosa
- from scipy.io import wavfile
- import asyncio
- import shutil
- import zipfile
- import requests
-
-
- def download_and_extract_files():
-     files_to_download = [
-         ("config.py", "https://www.dropbox.com/scl/fi/ls7vmjk75uou8ayfn6kj4/config.py?rlkey=4qluzl5l07zq1j9mkl9n6j66u&st=0yit9dzx&dl=1"),
-         ("hubert_base.pt", "https://www.dropbox.com/scl/fi/g7oohuwfzlnrbd8zic6gj/hubert_base.pt?rlkey=ddeyqex1morsm54azyakmd62e&st=rsrvf964&dl=1"),
-         ("lib.zip", "https://www.dropbox.com/scl/fi/ia6p6cf5xvcbi78dmkbbz/lib.zip?rlkey=k3chc1nlaswsqdo7slqco56wi&st=19n9syfd&dl=1"),
-         ("rmvpe.pt", "https://www.dropbox.com/scl/fi/7pl7u6fvydwgtz19n8nzx/rmvpe.pt?rlkey=tnbxmarogivbw3qy34hgy7r7q&st=um8d4230&dl=1"),
-         ("rmvpe.py", "https://www.dropbox.com/scl/fi/i2shk4otwyg4ns8yod5h1/rmvpe.py?rlkey=l7313htdh1ihylb6bx91el0lv&st=xhkfog8j&dl=1"),
-         ("vc_infer_pipeline.py", "https://www.dropbox.com/scl/fi/bvz7s2wf2y67twpg583lg/vc_infer_pipeline.py?rlkey=q4w7oww5e7e2qdfh3herofk4o&st=4sck87ny&dl=1"),
-         ("voice_processing.py", "https://www.dropbox.com/scl/fi/emrmjsuz0mod4r2x9e43f/voice_processing.py?rlkey=6baomwowns9y3yq1pl6syer0t&st=d9u51gba&dl=1"),
-         ("weights.zip", "https://www.dropbox.com/scl/fi/tr5a04wlow5go8cv3d6qp/weights.zip?rlkey=qvpwax97nn5a4iv79g76lcbz2&st=5ueb2gva&dl=1")
-     ]
-
-     for file_name, url in files_to_download:
-         if not os.path.exists(file_name):
-             response = requests.get(url)
-             with open(file_name, "wb") as file:
-                 file.write(response.content)
-
-             if file_name.endswith(".zip"):
-                 with zipfile.ZipFile(file_name, "r") as zip_ref:
-                     extract_to = os.path.splitext(file_name)[0]
-                     for member in zip_ref.namelist():
-                         # Extract files into the target directory without the first part of the path
-                         member_path = os.path.join(extract_to, *member.split('/')[1:])
-                         if member.endswith('/'):
-                             os.makedirs(member_path, exist_ok=True)
-                         else:
-                             os.makedirs(os.path.dirname(member_path), exist_ok=True)
-                             with open(member_path, 'wb') as f:
-                                 f.write(zip_ref.read(member))
-
-                 # Optionally, remove the zip file after extraction
-                 os.remove(file_name)
-
- # Run the function
- download_and_extract_files()
-
- from voice_processing import tts, get_model_names, voice_mapping, get_unique_filename
-
-
- class EndpointHandler:
-     def __init__(self, model_dir=None):
-         self.model_dir = model_dir
-
-     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-         try:
-             if "inputs" in data:  # Check if data is in Hugging Face JSON format
-                 return self.process_hf_input(data)
-             else:
-                 return self.process_json_input(data)
-         except ValueError as e:
-             return {"error": str(e)}
-         except Exception as e:
-             return {"error": str(e)}
-
-     def process_json_input(self, json_data):
-         if all(key in json_data for key in ["model_name", "tts_text", "selected_voice", "slang_rate", "use_uploaded_voice"]):
-             model_name = json_data["model_name"]
-             tts_text = json_data["tts_text"]
-             selected_voice = json_data["selected_voice"]
-             slang_rate = json_data["slang_rate"]
-             use_uploaded_voice = json_data["use_uploaded_voice"]
-             voice_upload_file = json_data.get("voice_upload_file", None)
-
-             edge_tts_voice = voice_mapping.get(selected_voice)
-             if not edge_tts_voice:
-                 raise ValueError(f"Invalid voice '{selected_voice}'.")
-
-             info, edge_tts_output_path, tts_output_data, edge_output_file = asyncio.run(tts(
-                 model_name, tts_text, edge_tts_voice, slang_rate, use_uploaded_voice, voice_upload_file
-             ))
-
-             if edge_output_file and os.path.exists(edge_output_file):
-                 os.remove(edge_output_file)
-
-             _, audio_output = tts_output_data
-
-             audio_file_path = self.save_audio_data_to_file(audio_output) if isinstance(audio_output, np.ndarray) else audio_output
-
-             try:
-                 with open(audio_file_path, 'rb') as file:
-                     audio_bytes = file.read()
-                 audio_data_uri = f"data:audio/wav;base64,{base64.b64encode(audio_bytes).decode('utf-8')}"
-             except Exception as e:
-                 raise Exception(f"Failed to read audio file: {e}")
-             finally:
-                 if os.path.exists(audio_file_path):
-                     os.remove(audio_file_path)
-
-             return {"info": info, "audio_data_uri": audio_data_uri}
-         else:
-             raise ValueError("Invalid JSON structure.")
-
-     def process_hf_input(self, hf_data):
-         if "inputs" in hf_data:
-             actual_data = hf_data["inputs"]
-             return self.process_json_input(actual_data)
-         else:
-             return {"error": "Invalid Hugging Face JSON structure."}
-
-     def save_audio_data_to_file(self, audio_data, sample_rate=40000):
-         file_path = get_unique_filename('wav')
-         wavfile.write(file_path, sample_rate, audio_data)
+ from pydantic import BaseModel
+ from environs import Env
+ from typing import List, Dict, Any
+ import os
+ import base64
+ import numpy as np
+ import librosa
+ from scipy.io import wavfile
+ import asyncio
+ from voice_processing import tts, get_model_names, voice_mapping, get_unique_filename
+
+ class EndpointHandler:
+     def __init__(self, model_dir=None):
+         self.model_dir = model_dir
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+         try:
+             if "inputs" in data:  # Check if data is in Hugging Face JSON format
+                 return self.process_hf_input(data)
+             else:
+                 return self.process_json_input(data)
+         except ValueError as e:
+             return {"error": str(e)}
+         except Exception as e:
+             return {"error": str(e)}
+
+     def process_json_input(self, json_data):
+         if all(key in json_data for key in ["model_name", "tts_text", "selected_voice", "slang_rate", "use_uploaded_voice"]):
+             model_name = json_data["model_name"]
+             tts_text = json_data["tts_text"]
+             selected_voice = json_data["selected_voice"]
+             slang_rate = json_data["slang_rate"]
+             use_uploaded_voice = json_data["use_uploaded_voice"]
+             voice_upload_file = json_data.get("voice_upload_file", None)
+
+             edge_tts_voice = voice_mapping.get(selected_voice)
+             if not edge_tts_voice:
+                 raise ValueError(f"Invalid voice '{selected_voice}'.")
+
+             info, edge_tts_output_path, tts_output_data, edge_output_file = asyncio.run(tts(
+                 model_name, tts_text, edge_tts_voice, slang_rate, use_uploaded_voice, voice_upload_file
+             ))
+
+             if edge_output_file and os.path.exists(edge_output_file):
+                 os.remove(edge_output_file)
+
+             _, audio_output = tts_output_data
+
+             audio_file_path = self.save_audio_data_to_file(audio_output) if isinstance(audio_output, np.ndarray) else audio_output
+
+             try:
+                 with open(audio_file_path, 'rb') as file:
+                     audio_bytes = file.read()
+                 audio_data_uri = f"data:audio/wav;base64,{base64.b64encode(audio_bytes).decode('utf-8')}"
+             except Exception as e:
+                 raise Exception(f"Failed to read audio file: {e}")
+             finally:
+                 if os.path.exists(audio_file_path):
+                     os.remove(audio_file_path)
+
+             return {"info": info, "audio_data_uri": audio_data_uri}
+         else:
+             raise ValueError("Invalid JSON structure.")
+
+     def process_hf_input(self, hf_data):
+         if "inputs" in hf_data:
+             actual_data = hf_data["inputs"]
+             return self.process_json_input(actual_data)
+         else:
+             return {"error": "Invalid Hugging Face JSON structure."}
+
+     def save_audio_data_to_file(self, audio_data, sample_rate=40000):
+         file_path = get_unique_filename('wav')
+         wavfile.write(file_path, sample_rate, audio_data)
          return file_path
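For reference, a hedged sketch of exercising the slimmed-down handler locally (illustrative only, not part of the commit; the payload keys mirror process_json_input above and the example values are made up):

from handler import EndpointHandler

handler = EndpointHandler()
payload = {
    "inputs": {                           # Hugging Face style wrapper, routed through process_hf_input
        "model_name": "my_model",         # hypothetical folder under weights/
        "tts_text": "Сайн байна уу",
        "selected_voice": "Mongolian Female",
        "slang_rate": 0.5,                # passed to tts() as the index rate
        "use_uploaded_voice": False,
    }
}
result = handler(payload)                 # {"info": ..., "audio_data_uri": ...} on success, {"error": ...} otherwise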
vc_infer_pipeline.py ADDED
@@ -0,0 +1,451 @@
+ import os
+ import sys
+ import traceback
+ from functools import lru_cache
+ from time import time as ttime
+
+ import faiss
+ import librosa
+ import numpy as np
+ import parselmouth
+ import pyworld
+ import torch
+ import torch.nn.functional as F
+ import torchcrepe
+ from scipy import signal
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+
+ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
+
+ input_audio_path2wav = {}
+
+
+ @lru_cache
+ def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
+     audio = input_audio_path2wav[input_audio_path]
+     f0, t = pyworld.harvest(
+         audio,
+         fs=fs,
+         f0_ceil=f0max,
+         f0_floor=f0min,
+         frame_period=frame_period,
+     )
+     f0 = pyworld.stonemask(audio, f0, t, fs)
+     return f0
+
+
+ def change_rms(data1, sr1, data2, sr2, rate):  # 1 is the input audio, 2 is the output audio, rate is the weight given to 2
+     # print(data1.max(),data2.max())
+     rms1 = librosa.feature.rms(
+         y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
+     )  # one RMS point every half second
+     rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
+     rms1 = torch.from_numpy(rms1)
+     rms1 = F.interpolate(
+         rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
+     ).squeeze()
+     rms2 = torch.from_numpy(rms2)
+     rms2 = F.interpolate(
+         rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
+     ).squeeze()
+     rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
+     data2 *= (
+         torch.pow(rms1, torch.tensor(1 - rate))
+         * torch.pow(rms2, torch.tensor(rate - 1))
+     ).numpy()
+     return data2
+
+
+ class VC(object):
+     def __init__(self, tgt_sr, config):
+         self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
+             config.x_pad,
+             config.x_query,
+             config.x_center,
+             config.x_max,
+             config.is_half,
+         )
+         self.sr = 16000  # hubert input sampling rate
+         self.window = 160  # samples per frame
+         self.t_pad = self.sr * self.x_pad  # padding added before and after each chunk
+         self.t_pad_tgt = tgt_sr * self.x_pad
+         self.t_pad2 = self.t_pad * 2
+         self.t_query = self.sr * self.x_query  # search window around each candidate cut point
+         self.t_center = self.sr * self.x_center  # spacing of candidate cut points
+         self.t_max = self.sr * self.x_max  # duration threshold below which no cutting is needed
+         self.device = config.device
+
+     def get_f0(
+         self,
+         input_audio_path,
+         x,
+         p_len,
+         f0_up_key,
+         f0_method,
+         filter_radius,
+         inp_f0=None,
+     ):
+         global input_audio_path2wav
+         time_step = self.window / self.sr * 1000
+         f0_min = 50
+         f0_max = 1100
+         f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+         f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+         if f0_method == "pm":
+             f0 = (
+                 parselmouth.Sound(x, self.sr)
+                 .to_pitch_ac(
+                     time_step=time_step / 1000,
+                     voicing_threshold=0.6,
+                     pitch_floor=f0_min,
+                     pitch_ceiling=f0_max,
+                 )
+                 .selected_array["frequency"]
+             )
+             pad_size = (p_len - len(f0) + 1) // 2
+             if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                 f0 = np.pad(
+                     f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                 )
+         elif f0_method == "harvest":
+             input_audio_path2wav[input_audio_path] = x.astype(np.double)
+             f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
+             if filter_radius > 2:
+                 f0 = signal.medfilt(f0, 3)
+         elif f0_method == "crepe":
+             model = "full"
+             # Pick a batch size that doesn't cause memory errors on your gpu
+             batch_size = 512
+             # Compute pitch using first gpu
+             audio = torch.tensor(np.copy(x))[None].float()
+             f0, pd = torchcrepe.predict(
+                 audio,
+                 self.sr,
+                 self.window,
+                 f0_min,
+                 f0_max,
+                 model,
+                 batch_size=batch_size,
+                 device=self.device,
+                 return_periodicity=True,
+             )
+             pd = torchcrepe.filter.median(pd, 3)
+             f0 = torchcrepe.filter.mean(f0, 3)
+             f0[pd < 0.1] = 0
+             f0 = f0[0].cpu().numpy()
+         elif f0_method == "rmvpe":
+             if hasattr(self, "model_rmvpe") == False:
+                 from rmvpe import RMVPE
+
+                 print("loading rmvpe model")
+                 self.model_rmvpe = RMVPE(
+                     "rmvpe.pt", is_half=self.is_half, device=self.device
+                 )
+             f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+         f0 *= pow(2, f0_up_key / 12)
+         # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
+         tf0 = self.sr // self.window  # number of f0 points per second
+         if inp_f0 is not None:
+             delta_t = np.round(
+                 (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
+             ).astype("int16")
+             replace_f0 = np.interp(
+                 list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
+             )
+             shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
+             f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
+                 :shape
+             ]
+         # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
+         f0bak = f0.copy()
+         f0_mel = 1127 * np.log(1 + f0 / 700)
+         f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+             f0_mel_max - f0_mel_min
+         ) + 1
+         f0_mel[f0_mel <= 1] = 1
+         f0_mel[f0_mel > 255] = 255
+         f0_coarse = np.rint(f0_mel).astype(int)
+         return f0_coarse, f0bak  # 1-0
+
+     def vc(
+         self,
+         model,
+         net_g,
+         sid,
+         audio0,
+         pitch,
+         pitchf,
+         times,
+         index,
+         big_npy,
+         index_rate,
+         version,
+         protect,
+     ):  # ,file_index,file_big_npy
+         feats = torch.from_numpy(audio0)
+         if self.is_half:
+             feats = feats.half()
+         else:
+             feats = feats.float()
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
+
+         inputs = {
+             "source": feats.to(self.device),
+             "padding_mask": padding_mask,
+             "output_layer": 9 if version == "v1" else 12,
+         }
+         t0 = ttime()
+         with torch.no_grad():
+             logits = model.extract_features(**inputs)
+             feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
+         if protect < 0.5 and pitch != None and pitchf != None:
+             feats0 = feats.clone()
+         if (
+             isinstance(index, type(None)) == False
+             and isinstance(big_npy, type(None)) == False
+             and index_rate != 0
+         ):
+             npy = feats[0].cpu().numpy()
+             if self.is_half:
+                 npy = npy.astype("float32")
+
+             # _, I = index.search(npy, 1)
+             # npy = big_npy[I.squeeze()]
+
+             score, ix = index.search(npy, k=8)
+             weight = np.square(1 / score)
+             weight /= weight.sum(axis=1, keepdims=True)
+             npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+
+             if self.is_half:
+                 npy = npy.astype("float16")
+             feats = (
+                 torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+                 + (1 - index_rate) * feats
+             )
+
+         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+         if protect < 0.5 and pitch != None and pitchf != None:
+             feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                 0, 2, 1
+             )
+         t1 = ttime()
+         p_len = audio0.shape[0] // self.window
+         if feats.shape[1] < p_len:
+             p_len = feats.shape[1]
+             if pitch != None and pitchf != None:
+                 pitch = pitch[:, :p_len]
+                 pitchf = pitchf[:, :p_len]
+
+         if protect < 0.5 and pitch != None and pitchf != None:
+             pitchff = pitchf.clone()
+             pitchff[pitchf > 0] = 1
+             pitchff[pitchf < 1] = protect
+             pitchff = pitchff.unsqueeze(-1)
+             feats = feats * pitchff + feats0 * (1 - pitchff)
+             feats = feats.to(feats0.dtype)
+         p_len = torch.tensor([p_len], device=self.device).long()
+         with torch.no_grad():
+             if pitch != None and pitchf != None:
+                 audio1 = (
+                     (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
+                     .data.cpu()
+                     .float()
+                     .numpy()
+                 )
+             else:
+                 audio1 = (
+                     (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
+                 )
+         del feats, p_len, padding_mask
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         t2 = ttime()
+         times[0] += t1 - t0
+         times[2] += t2 - t1
+         return audio1
+
+     def pipeline(
+         self,
+         model,
+         net_g,
+         sid,
+         audio,
+         input_audio_path,
+         times,
+         f0_up_key,
+         f0_method,
+         file_index,
+         # file_big_npy,
+         index_rate,
+         if_f0,
+         filter_radius,
+         tgt_sr,
+         resample_sr,
+         rms_mix_rate,
+         version,
+         protect,
+         f0_file=None,
+     ):
+         if (
+             file_index != ""
+             # and file_big_npy != ""
+             # and os.path.exists(file_big_npy) == True
+             and os.path.exists(file_index) == True
+             and index_rate != 0
+         ):
+             try:
+                 index = faiss.read_index(file_index)
+                 # big_npy = np.load(file_big_npy)
+                 big_npy = index.reconstruct_n(0, index.ntotal)
+             except:
+                 traceback.print_exc()
+                 index = big_npy = None
+         else:
+             index = big_npy = None
+         audio = signal.filtfilt(bh, ah, audio)
+         audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
+         opt_ts = []
+         if audio_pad.shape[0] > self.t_max:
+             audio_sum = np.zeros_like(audio)
+             for i in range(self.window):
+                 audio_sum += audio_pad[i : i - self.window]
+             for t in range(self.t_center, audio.shape[0], self.t_center):
+                 opt_ts.append(
+                     t
+                     - self.t_query
+                     + np.where(
+                         np.abs(audio_sum[t - self.t_query : t + self.t_query])
+                         == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
+                     )[0][0]
+                 )
+         s = 0
+         audio_opt = []
+         t = None
+         t1 = ttime()
+         audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+         p_len = audio_pad.shape[0] // self.window
+         inp_f0 = None
+         if hasattr(f0_file, "name") == True:
+             try:
+                 with open(f0_file.name, "r") as f:
+                     lines = f.read().strip("\n").split("\n")
+                 inp_f0 = []
+                 for line in lines:
+                     inp_f0.append([float(i) for i in line.split(",")])
+                 inp_f0 = np.array(inp_f0, dtype="float32")
+             except:
+                 traceback.print_exc()
+         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
+         pitch, pitchf = None, None
+         if if_f0 == 1:
+             pitch, pitchf = self.get_f0(
+                 input_audio_path,
+                 audio_pad,
+                 p_len,
+                 f0_up_key,
+                 f0_method,
+                 filter_radius,
+                 inp_f0,
+             )
+             pitch = pitch[:p_len]
+             pitchf = pitchf[:p_len]
+             if self.device == "mps":
+                 pitchf = pitchf.astype(np.float32)
+             pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
+             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
+         t2 = ttime()
+         times[1] += t2 - t1
+         for t in opt_ts:
+             t = t // self.window * self.window
+             if if_f0 == 1:
+                 audio_opt.append(
+                     self.vc(
+                         model,
+                         net_g,
+                         sid,
+                         audio_pad[s : t + self.t_pad2 + self.window],
+                         pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+                         pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                         times,
+                         index,
+                         big_npy,
+                         index_rate,
+                         version,
+                         protect,
+                     )[self.t_pad_tgt : -self.t_pad_tgt]
+                 )
+             else:
+                 audio_opt.append(
+                     self.vc(
+                         model,
+                         net_g,
+                         sid,
+                         audio_pad[s : t + self.t_pad2 + self.window],
+                         None,
+                         None,
+                         times,
+                         index,
+                         big_npy,
+                         index_rate,
+                         version,
+                         protect,
+                     )[self.t_pad_tgt : -self.t_pad_tgt]
+                 )
+             s = t
+         if if_f0 == 1:
+             audio_opt.append(
+                 self.vc(
+                     model,
+                     net_g,
+                     sid,
+                     audio_pad[t:],
+                     pitch[:, t // self.window :] if t is not None else pitch,
+                     pitchf[:, t // self.window :] if t is not None else pitchf,
+                     times,
+                     index,
+                     big_npy,
+                     index_rate,
+                     version,
+                     protect,
+                 )[self.t_pad_tgt : -self.t_pad_tgt]
+             )
+         else:
+             audio_opt.append(
+                 self.vc(
+                     model,
+                     net_g,
+                     sid,
+                     audio_pad[t:],
+                     None,
+                     None,
+                     times,
+                     index,
+                     big_npy,
+                     index_rate,
+                     version,
+                     protect,
+                 )[self.t_pad_tgt : -self.t_pad_tgt]
+             )
+         audio_opt = np.concatenate(audio_opt)
+         if rms_mix_rate != 1:
+             audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
+         if resample_sr >= 16000 and tgt_sr != resample_sr:
+             audio_opt = librosa.resample(
+                 audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
+             )
+         audio_max = np.abs(audio_opt).max() / 0.99
+         max_int16 = 32768
+         if audio_max > 1:
+             max_int16 /= audio_max
+         audio_opt = (audio_opt * max_int16).astype(np.int16)
+         del pitch, pitchf, sid
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         return audio_opt
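For reference, a small standalone sketch of the coarse-pitch quantization performed at the end of get_f0 above (illustrative only, not part of the commit): frequencies are mapped onto a 1-255 mel-scaled grid before being handed to the synthesizer.

import numpy as np

f0 = np.array([0.0, 110.0, 220.0, 440.0])   # Hz; 0 marks unvoiced frames
f0_min, f0_max = 50, 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(int)
print(f0_coarse)   # unvoiced frames stay at 1, voiced frames land in 2-255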
voice_processing.py ADDED
@@ -0,0 +1,230 @@
+ import nest_asyncio
+ nest_asyncio.apply()
+
+ import asyncio
+ import datetime
+ import logging
+ import os
+ import time
+ import traceback
+ import tempfile
+
+ import edge_tts
+ import librosa
+ import torch
+ from fairseq import checkpoint_utils
+ import uuid
+
+ from config import Config
+ from lib.infer_pack.models import (
+     SynthesizerTrnMs256NSFsid,
+     SynthesizerTrnMs256NSFsid_nono,
+     SynthesizerTrnMs768NSFsid,
+     SynthesizerTrnMs768NSFsid_nono,
+ )
+ from rmvpe import RMVPE
+ from vc_infer_pipeline import VC
+
+ # Set logging levels
+ logging.getLogger("fairseq").setLevel(logging.WARNING)
+ logging.getLogger("numba").setLevel(logging.WARNING)
+ logging.getLogger("markdown_it").setLevel(logging.WARNING)
+ logging.getLogger("urllib3").setLevel(logging.WARNING)
+ logging.getLogger("matplotlib").setLevel(logging.WARNING)
+
+ limitation = os.getenv("SYSTEM") == "spaces"
+
+ config = Config()
+
+ # Edge TTS
+ tts_voice_list = asyncio.run(edge_tts.list_voices())
+ tts_voices = ["mn-MN-BataaNeural", "mn-MN-YesuiNeural"]  # Specific voices
+
+ # RVC models
+ model_root = "weights"
+ models = [d for d in os.listdir(model_root) if os.path.isdir(f"{model_root}/{d}")]
+ models.sort()
+
+ def get_unique_filename(extension):
+     return f"{uuid.uuid4()}.{extension}"
+
+ def model_data(model_name):
+     pth_path = [
+         f"{model_root}/{model_name}/{f}"
+         for f in os.listdir(f"{model_root}/{model_name}")
+         if f.endswith(".pth")
+     ][0]
+     print(f"Loading {pth_path}")
+     cpt = torch.load(pth_path, map_location="cpu")
+     tgt_sr = cpt["config"][-1]
+     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+     if_f0 = cpt.get("f0", 1)
+     version = cpt.get("version", "v1")
+     if version == "v1":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+     elif version == "v2":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+     else:
+         raise ValueError("Unknown version")
+     del net_g.enc_q
+     net_g.load_state_dict(cpt["weight"], strict=False)
+     print("Model loaded")
+     net_g.eval().to(config.device)
+     if config.is_half:
+         net_g = net_g.half()
+     else:
+         net_g = net_g.float()
+     vc = VC(tgt_sr, config)
+     index_files = [
+         f"{model_root}/{model_name}/{f}"
+         for f in os.listdir(f"{model_root}/{model_name}")
+         if f.endswith(".index")
+     ]
+     if len(index_files) == 0:
+         print("No index file found")
+         index_file = ""
+     else:
+         index_file = index_files[0]
+         print(f"Index file found: {index_file}")
+
+     return tgt_sr, net_g, vc, version, index_file, if_f0
+
+ def load_hubert():
+     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+         ["hubert_base.pt"],
+         suffix="",
+     )
+     hubert_model = models[0]
+     hubert_model = hubert_model.to(config.device)
+     if config.is_half:
+         hubert_model = hubert_model.half()
+     else:
+         hubert_model = hubert_model.float()
+     return hubert_model.eval()
+
+ def get_model_names():
+     model_root = "weights"  # Assuming this is where your models are stored
+     return [d for d in os.listdir(model_root) if os.path.isdir(f"{model_root}/{d}")]
+
+ async def tts(
+     model_name,
+     tts_text,
+     tts_voice,
+     index_rate,
+     use_uploaded_voice,
+     uploaded_voice,
+ ):
+     speed = 0  # Default speech speed
+     f0_up_key = 0  # Default pitch adjustment
+     f0_method = "rmvpe"  # Default pitch extraction method
+     protect = 0.33  # Default protect value
+     filter_radius = 3
+     resample_sr = 0
+     rms_mix_rate = 0.25
+     edge_time = 0  # Initialize edge_time
+
+     edge_output_filename = get_unique_filename("mp3")
+
+     try:
+         if use_uploaded_voice:
+             if uploaded_voice is None:
+                 return "No voice file uploaded.", None, None
+
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+                 tmp_file.write(uploaded_voice)
+                 uploaded_file_path = tmp_file.name
+
+             audio, sr = librosa.load(uploaded_file_path, sr=16000, mono=True)
+         else:
+             if limitation and len(tts_text) > 4000:
+                 return (
+                     f"Text characters should be at most 280 in this huggingface space, but got {len(tts_text)} characters.",
+                     None,
+                     None,
+                 )
+
+             t0 = time.time()
+             speed_str = f"+{speed}%" if speed >= 0 else f"{speed}%"
+             await edge_tts.Communicate(
+                 tts_text, tts_voice, rate=speed_str
+             ).save(edge_output_filename)
+             t1 = time.time()
+             edge_time = t1 - t0
+
+             audio, sr = librosa.load(edge_output_filename, sr=16000, mono=True)
+
+         duration = len(audio) / sr
+         print(f"Audio duration: {duration}s")
+         if limitation and duration >= 20:
+             return (
+                 f"Audio should be less than 20 seconds in this huggingface space, but got {duration}s.",
+                 None,
+                 None,
+             )
+
+         f0_up_key = int(f0_up_key)
+         tgt_sr, net_g, vc, version, index_file, if_f0 = model_data(model_name)
+
+         if f0_method == "rmvpe":
+             vc.model_rmvpe = rmvpe_model
+
+         times = [0, 0, 0]
+         audio_opt = vc.pipeline(
+             hubert_model,
+             net_g,
+             0,
+             audio,
+             edge_output_filename if not use_uploaded_voice else uploaded_file_path,
+             times,
+             f0_up_key,
+             f0_method,
+             index_file,
+             index_rate,
+             if_f0,
+             filter_radius,
+             tgt_sr,
+             resample_sr,
+             rms_mix_rate,
+             version,
+             protect,
+             None,
+         )
+
+         if tgt_sr != resample_sr and resample_sr >= 16000:
+             tgt_sr = resample_sr
+
+         info = f"Success. Time: tts: {edge_time}s, npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
+         print(info)
+         return (
+             info,
+             edge_output_filename if not use_uploaded_voice else None,
+             (tgt_sr, audio_opt),
+             edge_output_filename
+         )
+
+     except EOFError:
+         info = (
+             "output not valid. This may occur when input text and speaker do not match."
+         )
+         print(info)
+         return info, None, None
+     except Exception as e:
+         traceback_info = traceback.format_exc()
+         print(traceback_info)
+         return str(e), None, None
+
+ voice_mapping = {
+     "Mongolian Male": "mn-MN-BataaNeural",
+     "Mongolian Female": "mn-MN-YesuiNeural"
+ }
+
+ hubert_model = load_hubert()
+ rmvpe_model = RMVPE("rmvpe.pt", config.is_half, config.device)
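For reference, a hedged sketch of driving voice_processing.tts directly, bypassing handler.py (illustrative only, not part of the commit; it assumes weights/, hubert_base.pt and rmvpe.pt are present as the module expects, and that the call succeeds so the four-tuple unpacking below is valid):

import asyncio
from voice_processing import tts, voice_mapping, get_model_names

model_name = get_model_names()[0]            # first model folder found under weights/
voice = voice_mapping["Mongolian Male"]      # "mn-MN-BataaNeural"

info, edge_mp3, (sr, audio), edge_file = asyncio.run(
    tts(model_name, "Сайн байна уу", voice, 0.5, False, None)
)
print(info, sr, audio.shape)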