Update voice_processing.py
voice_processing.py (+74 -4)
@@ -108,7 +108,7 @@ def load_hubert():
     return hubert_model.eval()

 def get_model_names():
-    model_root = "weights"
+    model_root = "weights"  # Assuming this is where your models are stored
     return [d for d in os.listdir(model_root) if os.path.isdir(f"{model_root}/{d}")]

 def run_async_in_thread(fn, *args):
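For reference, a minimal sketch of what get_model_names() returns after this change, assuming a weights/ directory next to the script with one subfolder per model (the folder and file names below are made up):

# Hypothetical layout (placeholder names):
#   weights/
#       model_a/
#       model_b/
#       notes.txt      <- skipped: not a directory
print(get_model_names())  # -> ['model_a', 'model_b'] (order follows os.listdir)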
@@ -139,7 +139,78 @@ async def tts(
     edge_output_filename = get_unique_filename("mp3")

     try:
-
+        if use_uploaded_voice:
+            if uploaded_voice is None:
+                return "No voice file uploaded.", None, None
+
+            # Process the uploaded voice file
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+                tmp_file.write(uploaded_voice)
+                uploaded_file_path = tmp_file.name
+
+            audio, sr = librosa.load(uploaded_file_path, sr=16000, mono=True)
+        else:
+            # EdgeTTS processing
+            if limitation and len(tts_text) > 12000:
+                return (
+                    f"Text characters should be at most 12000 in this huggingface space, but got {len(tts_text)} characters.",
+                    None,
+                    None,
+                )
+
+            # Invoke Edge TTS
+            t0 = time.time()
+            speed_str = f"+{speed}%" if speed >= 0 else f"{speed}%"
+            await edge_tts.Communicate(
+                tts_text, tts_voice, rate=speed_str
+            ).save(edge_output_filename)
+            t1 = time.time()
+            edge_time = t1 - t0
+
+            audio, sr = librosa.load(edge_output_filename, sr=16000, mono=True)
+
+        # Common processing after loading the audio
+        duration = len(audio) / sr
+        print(f"Audio duration: {duration}s")
+        if limitation and duration >= 20000:
+            return (
+                f"Audio should be less than 20 seconds in this huggingface space, but got {duration}s.",
+                None,
+                None,
+            )
+
+        f0_up_key = int(f0_up_key)
+        tgt_sr, net_g, vc, version, index_file, if_f0 = model_data(model_name)
+
+        # Setup for RMVPE or other pitch extraction methods
+        if f0_method == "rmvpe":
+            vc.model_rmvpe = rmvpe_model
+
+        # Perform voice conversion pipeline
+        times = [0, 0, 0]
+        audio_opt = vc.pipeline(
+            hubert_model,
+            net_g,
+            0,
+            audio,
+            edge_output_filename if not use_uploaded_voice else uploaded_file_path,
+            times,
+            f0_up_key,
+            f0_method,
+            index_file,
+            index_rate,
+            if_f0,
+            filter_radius,
+            tgt_sr,
+            resample_sr,
+            rms_mix_rate,
+            version,
+            protect,
+            None,
+        )
+
+        if tgt_sr != resample_sr and resample_sr >= 16000:
+            tgt_sr = resample_sr

         info = f"Success. Time: tts: {edge_time}s, npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
         print(info)
@@ -210,5 +281,4 @@ async def parallel_tts(tasks):

 def parallel_tts_wrapper(tasks):
     loop = asyncio.get_event_loop()
-    return loop.run_until_complete(parallel_tts(tasks))
-
+    return loop.run_until_complete(parallel_tts(tasks))
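For illustration only, a rough sketch of how the new uploaded-voice branch could be driven. The full tts() signature is not shown in this diff, so every keyword name below is an assumption inferred from the variables used inside the function, and the file name, model choice, and numeric values are placeholders, not defaults from this repository:

import asyncio

from voice_processing import get_model_names, tts  # module name taken from this commit

async def demo():
    # Raw audio bytes, matching what the new branch writes into a NamedTemporaryFile.
    with open("speaker_sample.wav", "rb") as f:
        voice_bytes = f.read()

    # Keyword names are assumptions; adjust to the real tts() signature.
    return await tts(
        model_name=get_model_names()[0],   # any folder under weights/
        tts_text="",                       # unused when a voice file is supplied
        tts_voice="en-US-AriaNeural",
        speed=0,
        f0_up_key=0,
        f0_method="rmvpe",
        index_rate=0.75,
        filter_radius=3,
        resample_sr=0,
        rms_mix_rate=0.25,
        protect=0.33,
        use_uploaded_voice=True,
        uploaded_voice=voice_bytes,
    )

result = asyncio.run(demo())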