xtts-v2

Runtime error

App Files Files Community

JacobLinCool committed on Nov 24, 2023

Commit

ecd3224

1 Parent(s): bd8dcd1

perf: model lazy load

Browse files

Files changed (1) hide show

app.py +49 -31

app.py CHANGED Viewed

@@ -27,38 +27,48 @@ from huggingface_hub import HfApi
 # will use api to restart space on an unrecoverable error
 api = HfApi(token=HF_TOKEN)
-repo_id = "coqui/xtts"
-print("loading model")
-model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
-model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
-config = XttsConfig()
-config.load_json(os.path.join(model_path, "config.json"))
-model = Xtts.init_from_config(config)
-model.load_checkpoint(
-    config,
-    checkpoint_path=os.path.join(model_path, "model.pth"),
-    vocab_path=os.path.join(model_path, "vocab.json"),
-    eval=True,
-    use_deepspeed=False,
-)
-if torch.cuda.is_available():
-    model.cuda()
-else:
-    model.cpu()
-print("Model loaded")
 # This is for debugging purposes only
 DEVICE_ASSERT_DETECTED = 0
 DEVICE_ASSERT_PROMPT = None
 DEVICE_ASSERT_LANG = None
-supported_languages = config.languages
 def predict(
     prompt,
@@ -68,6 +78,9 @@ def predict(
     no_lang_auto_detect,
     agree,
 ):
     if agree == True:
         if language not in supported_languages:
             gr.Warning(
@@ -184,7 +197,7 @@ def predict(
             # HF Space specific.. This error is unrecoverable need to restart space
             space = api.get_space_runtime(repo_id=repo_id)
-            if space.stage!="BUILDING":
                 api.restart_space(repo_id=repo_id)
             else:
                 print("TRIED TO RESTART but space is building")
@@ -198,7 +211,9 @@ def predict(
                 (
                     gpt_cond_latent,
                     speaker_embedding,
-                ) = model.get_conditioning_latents(audio_path=speaker_wav, gpt_cond_len=30, max_ref_length=60)
             except Exception as e:
                 print("Speaker encoding error", str(e))
                 gr.Warning(
@@ -215,7 +230,7 @@ def predict(
             # metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
             # temporary comma fix
-            prompt= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)",r"\1 \2\2",prompt)
             wav_chunks = []
             ## Direct mode
@@ -260,9 +275,9 @@ def predict(
             print(
                 f"I: Time to generate audio: {round(inference_time*1000)} milliseconds"
             )
-            #metrics_text += (
             #    f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
-            #)
             wav = torch.cat(wav_chunks, dim=0)
             print(wav.shape)
@@ -330,11 +345,11 @@ def predict(
                 # HF Space specific.. This error is unrecoverable need to restart space
                 space = api.get_space_runtime(repo_id=repo_id)
-                if space.stage!="BUILDING":
                     api.restart_space(repo_id=repo_id)
                 else:
                     print("TRIED TO RESTART but space is building")
             else:
                 if "Failed to decode" in str(e):
                     print("Speaker encoding error", str(e))
@@ -459,7 +474,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
                     "zh-cn",
                     "ja",
                     "ko",
-                    "hu"
                 ],
                 value="en",
             )
@@ -487,14 +502,17 @@ with gr.Blocks(analytics_enabled=False) as demo:
             tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
         with gr.Column():
             video_gr = gr.Video(label="Waveform Visual")
             audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
             out_text_gr = gr.Text(label="Metrics")
             ref_audio_gr = gr.Audio(label="Reference Audio Used")
-    tts_button.click(predict, [input_text_gr, language_gr, ref_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr])
 print("Starting server")
 demo.queue().launch(debug=True, show_api=True)

 # will use api to restart space on an unrecoverable error
 api = HfApi(token=HF_TOKEN)
+repo_id = "JacobLinCool/xtts-v2"
+model = None
+supported_languages = None
+def load_model():  # Load the XTTS v2 model and publish it through the `model` / `supported_languages` globals; predict() calls this when `model` is still None.
+    global model
+    global supported_languages
+    print("loading model")
+    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
+    model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))  # "/" in the model name becomes "--" in the directory name
+    config = XttsConfig()
+    config.load_json(os.path.join(model_path, "config.json"))
+    model = Xtts.init_from_config(config)
+    model.load_checkpoint(  # config.json, model.pth and vocab.json are all read from model_path
+        config,
+        checkpoint_path=os.path.join(model_path, "model.pth"),
+        vocab_path=os.path.join(model_path, "vocab.json"),
+        eval=True,
+        use_deepspeed=False,
+    )
+    if torch.cuda.is_available():  # move to the GPU when CUDA is available, otherwise keep the model on the CPU
+        model.cuda()
+    else:
+        model.cpu()
+    supported_languages = config.languages  # predict() checks the requested language against this list
+    print("Model loaded")
 # This is for debugging purposes only
 DEVICE_ASSERT_DETECTED = 0
 DEVICE_ASSERT_PROMPT = None
 DEVICE_ASSERT_LANG = None
 def predict(
     prompt,
     no_lang_auto_detect,
     agree,
 ):
+    if model is None:
+        load_model()
     if agree == True:
         if language not in supported_languages:
             gr.Warning(
             # HF Space specific.. This error is unrecoverable need to restart space
             space = api.get_space_runtime(repo_id=repo_id)
+            if space.stage != "BUILDING":
                 api.restart_space(repo_id=repo_id)
             else:
                 print("TRIED TO RESTART but space is building")
                 (
                     gpt_cond_latent,
                     speaker_embedding,
+                ) = model.get_conditioning_latents(
+                    audio_path=speaker_wav, gpt_cond_len=30, max_ref_length=60
+                )
             except Exception as e:
                 print("Speaker encoding error", str(e))
                 gr.Warning(
             # metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
             # temporary comma fix
+            prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
             wav_chunks = []
             ## Direct mode
             print(
                 f"I: Time to generate audio: {round(inference_time*1000)} milliseconds"
             )
+            # metrics_text += (
             #    f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
+            # )
             wav = torch.cat(wav_chunks, dim=0)
             print(wav.shape)
                 # HF Space specific.. This error is unrecoverable need to restart space
                 space = api.get_space_runtime(repo_id=repo_id)
+                if space.stage != "BUILDING":
                     api.restart_space(repo_id=repo_id)
                 else:
                     print("TRIED TO RESTART but space is building")
             else:
                 if "Failed to decode" in str(e):
                     print("Speaker encoding error", str(e))
                     "zh-cn",
                     "ja",
                     "ko",
+                    "hu",
                 ],
                 value="en",
             )
             tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
         with gr.Column():
             video_gr = gr.Video(label="Waveform Visual")
             audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
             out_text_gr = gr.Text(label="Metrics")
             ref_audio_gr = gr.Audio(label="Reference Audio Used")
+    tts_button.click(
+        predict,
+        [input_text_gr, language_gr, ref_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
+        outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
+    )
 print("Starting server")
 demo.queue().launch(debug=True, show_api=True)