Spaces:

mazalaai
/

tts

Sleeping

App Files Community

MAZALA2024 committed on Oct 18, 2024

Commit

21e20b7

verified ·

1 Parent(s): 11df9cd

Update voice_processing.py

Browse files

Files changed (1) hide show

voice_processing.py +31 -26

voice_processing.py CHANGED Viewed

@@ -23,7 +23,11 @@ from lib.infer_pack.models import (
 from rmvpe import RMVPE
 from vc_infer_pipeline import VC
-# Set logging levels
 logging.getLogger("fairseq").setLevel(logging.WARNING)
 logging.getLogger("numba").setLevel(logging.WARNING)
 logging.getLogger("markdown_it").setLevel(logging.WARNING)
@@ -52,7 +56,7 @@ def model_data(model_name):
         for f in os.listdir(f"{model_root}/{model_name}")
         if f.endswith(".pth")
     ][0]
-    print(f"Loading {pth_path}")
     cpt = torch.load(pth_path, map_location="cpu")
     tgt_sr = cpt["config"][-1]
     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
@@ -72,7 +76,7 @@ def model_data(model_name):
         raise ValueError("Unknown version")
     del net_g.enc_q
     net_g.load_state_dict(cpt["weight"], strict=False)
-    print("Model loaded")
     net_g.eval().to(config.device)
     if config.is_half:
         net_g = net_g.half()
@@ -86,11 +90,11 @@ def model_data(model_name):
         if f.endswith(".index")
     ]
     if len(index_files) == 0:
-        print("No index file found")
         index_file = ""
     else:
         index_file = index_files[0]
-        print(f"Index file found: {index_file}")
     return tgt_sr, net_g, vc, version, index_file, if_f0
@@ -119,6 +123,8 @@ def run_async_in_thread(fn, *args):
     loop.close()
     return result
 def parallel_tts(tasks):
     with ThreadPoolExecutor() as executor:
         futures = [executor.submit(run_async_in_thread, tts, *task) for task in tasks]
@@ -133,21 +139,21 @@ async def tts(
     use_uploaded_voice,
     uploaded_voice,
 ):
-    # Default values for parameters used in EdgeTTS
-    speed = 0  # Default speech speed
-    f0_up_key = 0  # Default pitch adjustment
-    f0_method = "rmvpe"  # Default pitch extraction method
-    protect = 0.33  # Default protect value
-    filter_radius = 3
-    resample_sr = 0
-    rms_mix_rate = 0.25
-    edge_time = 0  # Initialize edge_time
-    edge_output_filename = get_unique_filename("mp3")
     try:
         if use_uploaded_voice:
             if uploaded_voice is None:
                 return "No voice file uploaded.", None, None
             # Process the uploaded voice file
@@ -159,6 +165,7 @@ async def tts(
         else:
             # EdgeTTS processing
             if limitation and len(tts_text) > 12000:
                 return (
                     f"Text characters should be at most 12000 in this huggingface space, but got {len(tts_text)} characters.",
                     None,
@@ -178,8 +185,9 @@ async def tts(
         # Common processing after loading the audio
         duration = len(audio) / sr
-        print(f"Audio duration: {duration}s")
         if limitation and duration >= 20000:
             return (
                 f"Audio should be less than 20 seconds in this huggingface space, but got {duration}s.",
                 None,
@@ -219,8 +227,8 @@ async def tts(
         if tgt_sr != resample_sr and resample_sr >= 16000:
             tgt_sr = resample_sr
-        info = f"Success. Time: tts: {edge_time}s, npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
-        print(info)
         return (
             info,
             edge_output_filename if not use_uploaded_voice else None,
@@ -228,14 +236,11 @@ async def tts(
         )
     except EOFError:
-        info = (
-            "output not valid. This may occur when input text and speaker do not match."
-        )
-        print(info)
         return info, None, None
     except Exception as e:
-        traceback_info = traceback.format_exc()
-        print(traceback_info)
         return str(e), None, None
 voice_mapping = {

 from rmvpe import RMVPE
 from vc_infer_pipeline import VC
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+# Set logging levels for other libraries
 logging.getLogger("fairseq").setLevel(logging.WARNING)
 logging.getLogger("numba").setLevel(logging.WARNING)
 logging.getLogger("markdown_it").setLevel(logging.WARNING)
         for f in os.listdir(f"{model_root}/{model_name}")
         if f.endswith(".pth")
     ][0]
+    logger.info(f"Loading {pth_path}")
     cpt = torch.load(pth_path, map_location="cpu")
     tgt_sr = cpt["config"][-1]
     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
         raise ValueError("Unknown version")
     del net_g.enc_q
     net_g.load_state_dict(cpt["weight"], strict=False)
+    logger.info("Model loaded")
     net_g.eval().to(config.device)
     if config.is_half:
         net_g = net_g.half()
         if f.endswith(".index")
     ]
     if len(index_files) == 0:
+        logger.info("No index file found")
         index_file = ""
     else:
         index_file = index_files[0]
+        logger.info(f"Index file found: {index_file}")
     return tgt_sr, net_g, vc, version, index_file, if_f0
     loop.close()
     return result
+executor = ThreadPoolExecutor(max_workers=config.n_cpu)
 def parallel_tts(tasks):
     with ThreadPoolExecutor() as executor:
         futures = [executor.submit(run_async_in_thread, tts, *task) for task in tasks]
     use_uploaded_voice,
     uploaded_voice,
 ):
     try:
+        # Default values for parameters used in EdgeTTS
+        speed = 0  # Default speech speed
+        f0_up_key = 0  # Default pitch adjustment
+        f0_method = "rmvpe"  # Default pitch extraction method
+        protect = 0.33  # Default protect value
+        filter_radius = 3
+        resample_sr = 0
+        rms_mix_rate = 0.25
+        edge_output_filename = get_unique_filename("mp3")
         if use_uploaded_voice:
             if uploaded_voice is None:
+                logger.error("No voice file uploaded.")
                 return "No voice file uploaded.", None, None
             # Process the uploaded voice file
         else:
             # EdgeTTS processing
             if limitation and len(tts_text) > 12000:
+                logger.error(f"Text characters exceed limit: {len(tts_text)} characters.")
                 return (
                     f"Text characters should be at most 12000 in this huggingface space, but got {len(tts_text)} characters.",
                     None,
         # Common processing after loading the audio
         duration = len(audio) / sr
+        logger.info(f"Audio duration: {duration}s")
         if limitation and duration >= 20000:
+            logger.error(f"Audio duration exceeds limit: {duration}s")
             return (
                 f"Audio should be less than 20 seconds in this huggingface space, but got {duration}s.",
                 None,
         if tgt_sr != resample_sr and resample_sr >= 16000:
             tgt_sr = resample_sr
+        info = f"Success. Time: tts: {edge_time:.2f}s, npy: {times[0]:.2f}s, f0: {times[1]:.2f}s, infer: {times[2]:.2f}s"
+        logger.info(info)
         return (
             info,
             edge_output_filename if not use_uploaded_voice else None,
         )
     except EOFError:
+        info = "output not valid. This may occur when input text and speaker do not match."
+        logger.error(info)
         return info, None, None
     except Exception as e:
+        logger.exception("Error in TTS processing")
         return str(e), None, None
 voice_mapping = {