"""Inference endpoint handler that synthesizes speech with a SpeechT5 model.

Incoming text is transliterated with ``pyewts`` (``toWylie``), cleaned of a
few special characters, has its numbers expanded through ``num2tib``, and is
then fed to the text-to-speech model.  The generated waveform is
noise-reduced and returned base64-encoded.
"""
import base64
import re
from typing import Any, Dict, Union

import librosa
import numpy as np
import torch
import pyewts
import noisereduce as nr
import requests
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from num2tib.core import convert
from num2tib.core import convert2text

# Sample rate reported to callers and handed to the noise reducer.
SAMPLE_RATE = 16000

converter = pyewts.pyewts()

# Maps a speaker label to the local file holding its embedding vector.
speaker_embeddings = {
    "Lhasa(female)": "female_2.npy",
}

# (source, replacement) pairs applied in order by ``cleanup_text``.
# NOTE: the first pair maps '_' to itself and is therefore a no-op.
replacements = [
    ('_', '_'),
    ('*', 'v'),
    ('`', ';'),
    ('~', ','),
    ('+', ','),
    ('\\', ';'),
    ('|', ';'),
    ('╚', ''),
    ('╗', ''),
]


def download_file(url, destination, timeout=60):
    """Download ``url`` and write the response body to ``destination``.

    Args:
        url: Address fetched with an HTTP GET.
        destination: Path of the local file to create or overwrite.
        timeout: Seconds to wait on the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here rather than saving an error page that would only blow up
    # later when the file is parsed.
    response.raise_for_status()
    with open(destination, 'wb') as file:
        file.write(response.content)


# The speaker embedding must exist locally before a handler is constructed,
# so it is fetched at import time.
download_file('https://huggingface.co/openpecha/speecht5-tts-01/resolve/main/female_2.npy', 'female_2.npy')


def replace_numbers_with_convert(sentence, wylie=True):
    """Replace every integer or decimal number in ``sentence`` via ``convert``.

    Args:
        sentence: Text that may contain digit sequences such as ``12`` or
            ``3.5``.
        wylie: Forwarded unchanged as the second argument of ``convert``.

    Returns:
        ``sentence`` with each matched number substituted by whatever
        ``convert`` returns for it.
    """
    pattern = r'\d+(\.\d+)?'

    def replace(match):
        return convert(match.group(), wylie)

    return re.sub(pattern, replace, sentence)


def cleanup_text(inputs):
    """Apply every substitution listed in ``replacements`` to ``inputs``.

    The pairs are applied sequentially, so a later pair sees the output of
    the earlier ones.
    """
    for src, dst in replacements:
        inputs = inputs.replace(src, dst)
    return inputs


class EndpointHandler():
    """Callable wrapper that turns request text into base64-encoded audio."""

    def __init__(self, path=""):
        """Load the processor, acoustic model, vocoder and speaker embedding.

        Args:
            path: Accepted for interface compatibility; not used, the model
                identifiers are fixed below.
        """
        # Prefer the GPU but keep the handler usable on CPU-only hosts.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # load the model
        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model.to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        self.vocoder.to(self.device)

        # The embedding never changes between requests, so read it from disk
        # and move it to the device once instead of on every call.
        embedding = np.load(speaker_embeddings['Lhasa(female)'])
        self.speaker_embedding = torch.tensor(embedding).to(self.device)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Union[int, str]]:
        """Synthesize speech for the text carried by ``data``.

        Args:
            data: Request payload. The text is read (and removed) from the
                ``"inputs"`` key.

        Returns:
            A dict with ``"sample_rate"`` (int) and ``"audio"``, the
            base64-encoded raw bytes of the noise-reduced waveform array
            (presumably float32 samples -- confirm against the consumer).
            Blank input yields an empty ``"audio"`` string.

        Raises:
            TypeError: If the payload does not provide a string to speak.
        """
        text = data.pop("inputs", data)
        if not isinstance(text, str):
            raise TypeError(
                'expected the text to synthesize as a string under "inputs", '
                f'got {type(text).__name__}'
            )

        # Nothing to say: answer with the same response shape, empty audio.
        if not text.strip():
            return {
                "sample_rate": SAMPLE_RATE,
                "audio": "",
            }

        # process input
        text = converter.toWylie(text)
        text = cleanup_text(text)
        text = replace_numbers_with_convert(text)
        inputs = self.processor(text=text, return_tensors="pt")

        # limit input length
        input_ids = inputs["input_ids"]
        input_ids = input_ids[..., :self.model.config.max_text_positions]

        speech = self.model.generate_speech(
            input_ids.to(self.device),
            self.speaker_embedding,
            vocoder=self.vocoder,
        )

        # Hand the noise reducer a plain NumPy array rather than a tensor.
        waveform = speech.detach().cpu().numpy()
        waveform = nr.reduce_noise(y=waveform, sr=SAMPLE_RATE)

        return {
            "sample_rate": SAMPLE_RATE,
            # ``tobytes`` replaces ``tostring``, which NumPy 2.0 removed.
            "audio": base64.b64encode(waveform.tobytes()).decode("utf-8"),
        }