arible
/

narrator

Inference Endpoints

Model card Files Files and versions Community

simdi committed on Apr 19

Commit

cbf8a35

•

1 Parent(s): 41a54f3

Create handler.py

Browse files

Files changed (1) hide show

handler.py +61 -0

handler.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import time
+from TTS.api import TTS
+from TTS.utils.manage import ModelManager
+from TTS.utils.generic_utils import get_user_data_dir
+import torch
+import os
+from TTS.tts.configs.xtts_config import XttsConfig
+import torchaudio
+from TTS.tts.models.xtts import Xtts
+import io
+import base64
+class EndpointHandler:
+    def __init__(self, path=""):
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        config = XttsConfig()
+        config.load_json("./model/config.json")
+        model = Xtts.init_from_config(config)
+        model.load_checkpoint(
+            config,
+            checkpoint_path="./model/model.pth",
+            vocab_path="./model/vocab.json",
+            speaker_file_path="./model/speakers_xtts.pth",
+            eval=True,
+            use_deepspeed=device == "cuda",
+        )
+        model.to(device)
+        self.model = model
+    def __call__(self, model_input):
+        (
+            gpt_cond_latent,
+            speaker_embedding,
+        ) = self.model.get_conditioning_latents(
+            audio_path="attenborough.mp3",
+            gpt_cond_len=30,
+            gpt_cond_chunk_len=4,
+            max_ref_length=60,
+        )
+        print("Generating audio")
+        t0 = time.time()
+        out = self.model.inference(
+            text=model_input["text"],
+            speaker_embedding=speaker_embedding,
+            gpt_cond_latent=gpt_cond_latent,
+            temperature=0.75,
+            repetition_penalty=2.5,
+            language="en",
+            enable_text_splitting=True,
+        )
+        print(f"I: Time to generate audio: {inference_time} seconds")
+        audio_file = io.BytesIO()
+        torchaudio.save(audio_file, torch.tensor(out["wav"]).unsqueeze(0), 24000)
+        inference_time = time.time() - t0
+        audio_str = base64.b64encode(audio_file.getvalue()).decode("utf-8")
+        return {"data": audio_str, "format": "wav"}