MAZALA2024 committed on
Commit
e0c5b9e
·
verified ·
1 Parent(s): ba883e9

Update voice_processing.py

Browse files
Files changed (1) hide show
  1. voice_processing.py +74 -4
voice_processing.py CHANGED
@@ -108,7 +108,7 @@ def load_hubert():
108
  return hubert_model.eval()
109
 
110
  def get_model_names():
111
- model_root = "weights"
112
  return [d for d in os.listdir(model_root) if os.path.isdir(f"{model_root}/{d}")]
113
 
114
  def run_async_in_thread(fn, *args):
@@ -139,7 +139,78 @@ async def tts(
139
  edge_output_filename = get_unique_filename("mp3")
140
 
141
  try:
142
- # ... (keep the existing implementation)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
  info = f"Success. Time: tts: {edge_time}s, npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
145
  print(info)
@@ -210,5 +281,4 @@ async def parallel_tts(tasks):
210
 
211
  def parallel_tts_wrapper(tasks):
212
  loop = asyncio.get_event_loop()
213
- return loop.run_until_complete(parallel_tts(tasks))
214
-
 
108
  return hubert_model.eval()
109
 
def get_model_names(model_root="weights"):
    """Return the names of the model directories directly under *model_root*.

    Args:
        model_root: Directory to scan. Defaults to ``"weights"``, the
            location this function has always scanned, so existing callers
            that pass no argument are unaffected.

    Returns:
        A list of sub-directory names (not full paths), sorted
        alphabetically. Plain files inside *model_root* are ignored.

    Raises:
        FileNotFoundError: If *model_root* does not exist.
    """
    # os.scandir reports the entry type while listing, so no per-entry path
    # has to be assembled by hand. Sorting makes the result independent of
    # the arbitrary order in which the OS enumerates the directory.
    with os.scandir(model_root) as entries:
        return sorted(entry.name for entry in entries if entry.is_dir())
113
 
114
  def run_async_in_thread(fn, *args):
 
139
  edge_output_filename = get_unique_filename("mp3")
140
 
141
  try:
142
+ if use_uploaded_voice:
143
+ if uploaded_voice is None:
144
+ return "No voice file uploaded.", None, None
145
+
146
+ # Process the uploaded voice file
147
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
148
+ tmp_file.write(uploaded_voice)
149
+ uploaded_file_path = tmp_file.name
150
+
151
+ audio, sr = librosa.load(uploaded_file_path, sr=16000, mono=True)
152
+ else:
153
+ # EdgeTTS processing
154
+ if limitation and len(tts_text) > 12000:
155
+ return (
156
+ f"Text characters should be at most 12000 in this huggingface space, but got {len(tts_text)} characters.",
157
+ None,
158
+ None,
159
+ )
160
+
161
+ # Invoke Edge TTS
162
+ t0 = time.time()
163
+ speed_str = f"+{speed}%" if speed >= 0 else f"{speed}%"
164
+ await edge_tts.Communicate(
165
+ tts_text, tts_voice, rate=speed_str
166
+ ).save(edge_output_filename)
167
+ t1 = time.time()
168
+ edge_time = t1 - t0
169
+
170
+ audio, sr = librosa.load(edge_output_filename, sr=16000, mono=True)
171
+
172
+ # Common processing after loading the audio
173
+ duration = len(audio) / sr
174
+ print(f"Audio duration: {duration}s")
175
+ if limitation and duration >= 20000:
176
+ return (
177
+ f"Audio should be less than 20 seconds in this huggingface space, but got {duration}s.",
178
+ None,
179
+ None,
180
+ )
181
+
182
+ f0_up_key = int(f0_up_key)
183
+ tgt_sr, net_g, vc, version, index_file, if_f0 = model_data(model_name)
184
+
185
+ # Setup for RMVPE or other pitch extraction methods
186
+ if f0_method == "rmvpe":
187
+ vc.model_rmvpe = rmvpe_model
188
+
189
+ # Perform voice conversion pipeline
190
+ times = [0, 0, 0]
191
+ audio_opt = vc.pipeline(
192
+ hubert_model,
193
+ net_g,
194
+ 0,
195
+ audio,
196
+ edge_output_filename if not use_uploaded_voice else uploaded_file_path,
197
+ times,
198
+ f0_up_key,
199
+ f0_method,
200
+ index_file,
201
+ index_rate,
202
+ if_f0,
203
+ filter_radius,
204
+ tgt_sr,
205
+ resample_sr,
206
+ rms_mix_rate,
207
+ version,
208
+ protect,
209
+ None,
210
+ )
211
+
212
+ if tgt_sr != resample_sr and resample_sr >= 16000:
213
+ tgt_sr = resample_sr
214
 
215
  info = f"Success. Time: tts: {edge_time}s, npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
216
  print(info)
 
281
 
def parallel_tts_wrapper(tasks):
    """Run ``parallel_tts(tasks)`` to completion from synchronous code.

    Args:
        tasks: Passed through unchanged to ``parallel_tts``.

    Returns:
        Whatever the ``parallel_tts`` coroutine returns.
    """
    # asyncio.get_event_loop() raises RuntimeError when called from a thread
    # that has no event loop set, and it can also hand back a loop that was
    # already closed. In either case, create and install a fresh loop.
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = None
    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    return loop.run_until_complete(parallel_tts(tasks))