import base64
import io
import os
import time

# Accept the Coqui model license (CPML) up front so TTS model downloads do not
# prompt interactively. This must be set before any TTS code runs.
os.environ["COQUI_TOS_AGREED"] = "1"

import torch
import torchaudio

from TTS.api import TTS
from TTS.utils.manage import ModelManager
from TTS.utils.generic_utils import get_user_data_dir
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

|
class EndpointHandler:
    def __init__(self, path=""):
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load the XTTS config and weights bundled with the endpoint repository.
        config = XttsConfig()
        config.load_json("/repository/model/config.json")

        model = Xtts.init_from_config(config)
        model.load_checkpoint(
            config,
            checkpoint_path="/repository/model/model.pth",
            vocab_path="/repository/model/vocab.json",
            speaker_file_path="/repository/model/speakers_xtts.pth",
            eval=True,
            use_deepspeed=device == "cuda",  # DeepSpeed inference only applies on GPU
        )
        model.to(device)

        self.model = model
|
    def __call__(self, model_input):
        # Compute speaker conditioning latents from the bundled reference clip.
        (
            gpt_cond_latent,
            speaker_embedding,
        ) = self.model.get_conditioning_latents(
            audio_path="/repository/attenborough.mp3",
            gpt_cond_len=30,
            gpt_cond_chunk_len=4,
            max_ref_length=60,
        )

        print("Generating audio")
        t0 = time.time()
        out = self.model.inference(
            text=model_input["text"],
            speaker_embedding=speaker_embedding,
            gpt_cond_latent=gpt_cond_latent,
            temperature=0.75,
            repetition_penalty=2.5,
            language="en",
            enable_text_splitting=True,
        )
        inference_time = time.time() - t0
        print(f"I: Time to generate audio: {inference_time} seconds")

        # Serialize the 24 kHz waveform to an in-memory WAV file and return it
        # base64-encoded so the response is plain JSON.
        audio_file = io.BytesIO()
        torchaudio.save(
            audio_file, torch.tensor(out["wav"]).unsqueeze(0), 24000, format="wav"
        )
        audio_str = base64.b64encode(audio_file.getvalue()).decode("utf-8")
        return {"data": audio_str, "format": "wav"}
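

# Minimal local smoke-test sketch (not part of the deployed handler). It assumes
# the /repository paths above exist on the local machine; the payload key "text"
# matches what __call__ reads, and the output file name is illustrative only.
if __name__ == "__main__":
    handler = EndpointHandler()
    response = handler({"text": "Hello from a local test of the XTTS endpoint handler."})
    # The handler returns base64-encoded WAV bytes; decode and write them to disk.
    with open("test_output.wav", "wb") as f:
        f.write(base64.b64decode(response["data"]))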