Spaces:

projecte-aina
/

matxa-alvocat-tts-ca

Running

App Files Community

Baybars committed on Mar 19, 2024

Commit

40b17fc

1 Parent(s): 1b8b2e7

hifigan removed, frontend changed

Browse files

Files changed (2) hide show

infer_onnx.py +15 -22
spk_to_id.json +49 -0

infer_onnx.py CHANGED Viewed

@@ -8,6 +8,7 @@ import gradio as gr
 import soundfile as sf
 import tempfile
 import yaml
 from time import perf_counter
@@ -33,12 +34,15 @@ MODEL_PATH_MATCHA_MEL="matcha_multispeaker_cat_opset_15_10_steps.onnx"
 MODEL_PATH_MATCHA="matcha_hifigan_multispeaker_cat.onnx"
 MODEL_PATH_VOCOS="mel_spec_22khz_v2.onnx"
 CONFIG_PATH="config_22khz.yaml"
 sess_options = onnxruntime.SessionOptions()
 model_matcha_mel= onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL), sess_options=sess_options, providers=["CPUExecutionProvider"])
 model_vocos = onnxruntime.InferenceSession(str(MODEL_PATH_VOCOS), sess_options=sess_options, providers=["CPUExecutionProvider"])
 model_matcha = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA), sess_options=sess_options, providers=["CPUExecutionProvider"])
 def vocos_inference(mel,denoise):
@@ -123,7 +127,8 @@ def vocos_inference(mel,denoise):
     return y
-def tts(text:str, spk_id:int, temperature:float, length_scale:float, denoise:bool):
     sid = np.array([int(spk_id)]) if spk_id is not None else None
     text_matcha , text_lengths = process_text(0,text,"cpu")
@@ -158,17 +163,8 @@ def tts(text:str, spk_id:int, temperature:float, length_scale:float, denoise:boo
         "spks": sid
     }
     hifigan_t0 = perf_counter()
-    # matcha hifigan inference
-    wavs, wav_lengths = model_matcha.run(None, inputs)
-    hifigan_infer_secs = perf_counter() - hifigan_t0
-    print("Matcha + Hifigan",hifigan_infer_secs)
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/home/user/app") as fp_matcha:
-        sf.write(fp_matcha.name, wavs.squeeze(0), 22050, "PCM_24")
-    print(f"RTF matcha + hifigan { hifigan_infer_secs/ (wavs.shape[1]/22050) }")
-    print(f"RTF matcha + vocos { (mel_infer_secs + vocos_infer_secs) / (wavs.shape[1]/22050) }")
-    return fp_matcha_vocos.name, fp_matcha.name
 ## GUI space
@@ -201,13 +197,11 @@ vits2_inference = gr.Interface(
             max_lines=1,
             label="Input text",
         ),
-        gr.Slider(
-            1,
-            47,
-            value=10,
-            step=1,
             label="Speaker id",
-            info=f"Models are trained on 47 speakers. You can prompt the model using one of these speaker ids.",
         ),
         gr.Slider(
             0.1,
@@ -225,10 +219,9 @@ vits2_inference = gr.Interface(
             label="Length scale",
             info=f"Controls speech pace, larger values for slower pace and smaller values for faster pace",
         ),
-        gr.Checkbox(label="Denoise", info="Removes model bias from vocos"),
     ],
-    outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
-             gr.Audio(label="Matcha hifigan", interactive=False, type="filepath")]
 )
 demo = gr.Blocks()

 import soundfile as sf
 import tempfile
 import yaml
+import json
 from time import perf_counter
 MODEL_PATH_MATCHA="matcha_hifigan_multispeaker_cat.onnx"
 MODEL_PATH_VOCOS="mel_spec_22khz_v2.onnx"
 CONFIG_PATH="config_22khz.yaml"
+SPEAKER_ID_DICT="spk_to_id.json"
 sess_options = onnxruntime.SessionOptions()
 model_matcha_mel= onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL), sess_options=sess_options, providers=["CPUExecutionProvider"])
 model_vocos = onnxruntime.InferenceSession(str(MODEL_PATH_VOCOS), sess_options=sess_options, providers=["CPUExecutionProvider"])
 model_matcha = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA), sess_options=sess_options, providers=["CPUExecutionProvider"])
+speaker_id_dict = json.load(open(SPEAKER_ID_DICT))
+speakers = [sp for sp in speaker_id_dict.keys()]
+speakers.sort()
 def vocos_inference(mel,denoise):
     return y
+def tts(text:str, spk_name:str, temperature:float, length_scale:float, denoise:bool):
+    spk_id = speaker_id_dict[spk_name]
     sid = np.array([int(spk_id)]) if spk_id is not None else None
     text_matcha , text_lengths = process_text(0,text,"cpu")
         "spks": sid
     }
     hifigan_t0 = perf_counter()
+    print(f"RTF matcha + vocos { (mel_infer_secs + vocos_infer_secs) / (wavs_vocos.shape[1]/22050) }")
+    return fp_matcha_vocos.name
 ## GUI space
             max_lines=1,
             label="Input text",
         ),
+        gr.Dropdown(
+            choices=speakers,
             label="Speaker id",
+            value='caf_09204',
+            info=f"Models are trained on 47 speakers. You can prompt the model using one of these speaker ids."
         ),
         gr.Slider(
             0.1,
             label="Length scale",
             info=f"Controls speech pace, larger values for slower pace and smaller values for faster pace",
         ),
+        gr.Checkbox(label="Denoise", info="Removes model bias from vocos", value=True),
     ],
+    outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath")]
 )
 demo = gr.Blocks()

spk_to_id.json ADDED Viewed

	@@ -0,0 +1,49 @@

+{
+  "cam_03115": 0,
+  "caf_04247": 1,
+  "caf_05450": 2,
+  "cam_08935": 3,
+  "caf_09901": 4,
+  "ona": 5,
+  "pol": 6,
+  "cam_02689": 7,
+  "caf_06042": 8,
+  "jan": 9,
+  "caf_08106": 10,
+  "cam_04910": 11,
+  "cam_08664": 12,
+  "caf_07803": 13,
+  "cam_06582": 14,
+  "caf_06311": 15,
+  "caf_07245": 16,
+  "cam_06279": 17,
+  "caf_09598": 18,
+  "caf_09796": 19,
+  "eva": 20,
+  "cam_00762": 21,
+  "caf_09204": 22,
+  "caf_03944": 23,
+  "caf_05147": 24,
+  "uri": 25,
+  "mar": 26,
+  "cam_00459": 27,
+  "teo": 28,
+  "caf_03655": 29,
+  "bet": 30,
+  "cam_06705": 31,
+  "caf_05739": 32,
+  "caf_06008": 33,
+  "cam_04484": 34,
+  "cam_03386": 35,
+  "cam_08967": 36,
+  "caf_06942": 37,
+  "cam_07140": 38,
+  "pau": 39,
+  "caf_08001": 40,
+  "pep": 41,
+  "cam_04787": 42,
+  "eli": 43,
+  "caf_01591": 44,
+  "caf_02452": 45,
+  "cam_02992": 46
+}