"""Inference endpoint handler for a SpeechT5 text-to-speech model.

Input text is converted to Wylie transliteration, cleaned up, has its digits
expanded with ``num2tib``, and is then synthesised with SpeechT5 + HiFi-GAN.
"""

import re
from typing import Any, Dict

import librosa
import noisereduce as nr
import numpy as np
import pyewts
import torch
from num2tib.core import convert
from num2tib.core import convert2text
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Sample rate (Hz) handed to the noise reducer.
SAMPLE_RATE = 16000

# Hub identifiers of the acoustic model (also used for the processor) and vocoder.
MODEL_ID = "TenzinGayche/TTS_run3_ep20_174k_b"
VOCODER_ID = "microsoft/speecht5_hifigan"

# Key into ``speaker_embeddings`` used for every request.
DEFAULT_SPEAKER = "Lhasa(female)"

converter = pyewts.pyewts()

# Integer or decimal number, e.g. "12" or "3.14"; compiled once at import time.
_NUMBER_PATTERN = re.compile(r'\d+(\.\d+)?')

# Speaker name -> path of the .npy file holding that speaker's embedding.
speaker_embeddings = {
    "Lhasa(female)": "female_2.npy",
}

# (source, destination) substitutions applied in order by ``cleanup_text``.
replacements = [
    ('_', '_'),
    ('*', 'v'),
    ('`', ';'),
    ('~', ','),
    ('+', ','),
    ('\\', ';'),
    ('|', ';'),
    ('╚', ''),
    ('╗', ''),
]


def replace_numbers_with_convert(sentence, wylie=True):
    """Replace every number in *sentence* with the output of ``num2tib`` ``convert``.

    Args:
        sentence: Text that may contain integer or decimal numbers.
        wylie: Forwarded unchanged as the second argument of ``convert``.

    Returns:
        The sentence with each numeric match substituted.
    """

    def _replace(match):
        return convert(match.group(), wylie)

    return _NUMBER_PATTERN.sub(_replace, sentence)


def cleanup_text(inputs):
    """Apply every ``(src, dst)`` pair in ``replacements`` to *inputs*, in order.

    Args:
        inputs: Text to clean.

    Returns:
        The text after all substitutions.
    """
    for src, dst in replacements:
        inputs = inputs.replace(src, dst)
    return inputs


class EndpointHandler:
    """Callable handler that turns request text into synthesised speech bytes."""

    def __init__(self, path=""):
        """Load the processor, model, vocoder and speaker embedding.

        Args:
            path: Accepted for interface compatibility; not used by this
                handler (the models are fetched by their hub identifiers).
        """
        # Use the GPU when one is present; otherwise fall back to the CPU
        # instead of failing at load time.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.processor = SpeechT5Processor.from_pretrained(MODEL_ID)
        self.model = SpeechT5ForTextToSpeech.from_pretrained(MODEL_ID).to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained(VOCODER_ID).to(self.device)

        # Read the embedding once instead of on every request.
        # NOTE(review): the array is passed to generate_speech as stored on
        # disk; presumably it already carries a leading batch dimension —
        # confirm against the .npy file.
        embedding = np.load(speaker_embeddings[DEFAULT_SPEAKER])
        self.speaker_embedding = torch.tensor(embedding).to(self.device)

    def __call__(self, data: Dict[str, Any]) -> bytes:
        """Synthesise speech for the text found under ``data["inputs"]``.

        Args:
            data: Request payload; the text to speak is read from the
                ``"inputs"`` key.

        Returns:
            The raw bytes (``ndarray.tobytes()``) of the noise-reduced
            waveform, or ``b""`` when the input is missing, ``None`` or
            contains only whitespace.
        """
        # A missing key or explicit None is treated like empty text rather
        # than crashing on ``None.strip()``.
        text = data.get("inputs") or ""
        if not text.strip():
            # Keep the declared ``bytes`` return type on the empty path too.
            return b""

        text = converter.toWylie(text)
        text = cleanup_text(text)
        text = replace_numbers_with_convert(text)

        inputs = self.processor(text=text, return_tensors="pt")

        # Truncate to the longest sequence the model's config allows.
        input_ids = inputs["input_ids"][..., : self.model.config.max_text_positions]

        speech = self.model.generate_speech(
            input_ids.to(self.device),
            self.speaker_embedding,
            vocoder=self.vocoder,
        )

        # The noise reducer works on NumPy data, so leave torch explicitly.
        waveform = speech.detach().cpu().numpy()
        denoised = nr.reduce_noise(y=waveform, sr=SAMPLE_RATE)
        return denoised.tobytes()