Update voice_processing.py
voice_processing.py (+74 -4)
@@ -108,7 +108,7 @@ def load_hubert():
     return hubert_model.eval()

 def get_model_names():
-    model_root = "weights"
+    model_root = "weights"  # Assuming this is where your models are stored
     return [d for d in os.listdir(model_root) if os.path.isdir(f"{model_root}/{d}")]

 def run_async_in_thread(fn, *args):
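For reference, a minimal sketch of what get_model_names() returns after this change, assuming a weights/ directory next to the script with one subfolder per model (the folder and file names below are made up):

# Hypothetical layout (placeholder names):
#   weights/
#       model_a/
#       model_b/
#       notes.txt      <- skipped: not a directory
print(get_model_names())  # -> ['model_a', 'model_b'] (order follows os.listdir)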
@@ -139,7 +139,78 @@ async def tts(
     edge_output_filename = get_unique_filename("mp3")

     try:
-
+        if use_uploaded_voice:
+            if uploaded_voice is None:
+                return "No voice file uploaded.", None, None
+
+            # Process the uploaded voice file
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+                tmp_file.write(uploaded_voice)
+                uploaded_file_path = tmp_file.name
+
+            audio, sr = librosa.load(uploaded_file_path, sr=16000, mono=True)
+        else:
+            # EdgeTTS processing
+            if limitation and len(tts_text) > 12000:
+                return (
+                    f"Text characters should be at most 12000 in this huggingface space, but got {len(tts_text)} characters.",
+                    None,
+                    None,
+                )
+
+            # Invoke Edge TTS
+            t0 = time.time()
+            speed_str = f"+{speed}%" if speed >= 0 else f"{speed}%"
+            await edge_tts.Communicate(
+                tts_text, tts_voice, rate=speed_str
+            ).save(edge_output_filename)
+            t1 = time.time()
+            edge_time = t1 - t0
+
+            audio, sr = librosa.load(edge_output_filename, sr=16000, mono=True)
+
+        # Common processing after loading the audio
+        duration = len(audio) / sr
+        print(f"Audio duration: {duration}s")
+        if limitation and duration >= 20000:
+            return (
+                f"Audio should be less than 20 seconds in this huggingface space, but got {duration}s.",
+                None,
+                None,
+            )
+
+        f0_up_key = int(f0_up_key)
+        tgt_sr, net_g, vc, version, index_file, if_f0 = model_data(model_name)
+
+        # Setup for RMVPE or other pitch extraction methods
+        if f0_method == "rmvpe":
+            vc.model_rmvpe = rmvpe_model
+
+        # Perform voice conversion pipeline
+        times = [0, 0, 0]
+        audio_opt = vc.pipeline(
+            hubert_model,
+            net_g,
+            0,
+            audio,
+            edge_output_filename if not use_uploaded_voice else uploaded_file_path,
+            times,
+            f0_up_key,
+            f0_method,
+            index_file,
+            index_rate,
+            if_f0,
+            filter_radius,
+            tgt_sr,
+            resample_sr,
+            rms_mix_rate,
+            version,
+            protect,
+            None,
+        )
+
+        if tgt_sr != resample_sr and resample_sr >= 16000:
+            tgt_sr = resample_sr

         info = f"Success. Time: tts: {edge_time}s, npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
         print(info)
@@ -210,5 +281,4 @@ async def parallel_tts(tasks):

 def parallel_tts_wrapper(tasks):
     loop = asyncio.get_event_loop()
-    return loop.run_until_complete(parallel_tts(tasks))
-
+    return loop.run_until_complete(parallel_tts(tasks))
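For illustration only, a rough sketch of how the new uploaded-voice branch could be driven. The full tts() signature is not shown in this diff, so every keyword name below is an assumption inferred from the variables used inside the function, and the file name, model choice, and numeric values are placeholders, not defaults from this repository:

import asyncio

from voice_processing import get_model_names, tts  # module name taken from this commit

async def demo():
    # Raw audio bytes, matching what the new branch writes into a NamedTemporaryFile.
    with open("speaker_sample.wav", "rb") as f:
        voice_bytes = f.read()

    # Keyword names are assumptions; adjust to the real tts() signature.
    return await tts(
        model_name=get_model_names()[0],   # any folder under weights/
        tts_text="",                       # unused when a voice file is supplied
        tts_voice="en-US-AriaNeural",
        speed=0,
        f0_up_key=0,
        f0_method="rmvpe",
        index_rate=0.75,
        filter_radius=3,
        resample_sr=0,
        rms_mix_rate=0.25,
        protect=0.33,
        use_uploaded_voice=True,
        uploaded_voice=voice_bytes,
    )

result = asyncio.run(demo())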