speecht5-tts-01 / handler.py
TenzinGayche's picture
Update handler.py
75700af
raw
history blame
No virus
3.1 kB
import base64
import io
import os
import re
import tempfile
from typing import Any, Dict, Union

import librosa
import noisereduce as nr
import numpy as np
import pyewts
import requests
import torch
from num2tib.core import convert
from num2tib.core import convert2text
from scipy.io import wavfile
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Module-level pyewts converter shared by all requests; the handler calls
# converter.toWylie() on the incoming text before any other processing.
converter = pyewts.pyewts()
def download_file(url, destination, timeout=60):
    """Download *url* and write the response body to *destination*.

    The HTTP status is checked before the destination is opened, so a failed
    request (404, 500, ...) never creates or truncates the target file and an
    error page is never saved in place of the expected payload.

    Args:
        url: Address of the resource to fetch.
        destination: Path of the local file to (over)write.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of persisting an HTML/JSON error body to disk.
    response.raise_for_status()
    with open(destination, 'wb') as file:
        file.write(response.content)
# Runs at import time: fetches the speaker-embedding file into the working
# directory under the name 'female_2.npy', which is the filename that
# speaker_embeddings refers to and that the handler loads with np.load.
download_file('https://huggingface.co/openpecha/speecht5-tts-01/resolve/main/female_2.npy', 'female_2.npy')
def replace_numbers_with_convert(sentence, wylie=True):
    """Return *sentence* with each number substituted by ``convert``'s output.

    A number is a run of digits, optionally followed by a dot and further
    digits. Every matched substring is handed to ``convert`` together with
    the *wylie* flag, and whatever ``convert`` returns takes its place.

    Args:
        sentence: Text that may contain numbers.
        wylie: Passed through unchanged as the second argument of ``convert``.

    Returns:
        The text with all matched numbers replaced.
    """
    number_re = re.compile(r'\d+(\.\d+)?')
    return number_re.sub(lambda found: convert(found.group(), wylie), sentence)
def cleanup_text(inputs):
    """Apply every substitution listed in ``replacements`` to *inputs*.

    The pairs are applied in list order, each one operating on the result of
    the previous substitution.

    Args:
        inputs: Text to clean.

    Returns:
        The text after all substitutions.
    """
    cleaned = inputs
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    return cleaned
# Maps a voice label to the .npy file the handler loads as that voice's
# speaker embedding. 'female_2.npy' is the file written by the module-level
# download_file(...) call.
speaker_embeddings = {
"Lhasa(female)": "female_2.npy",
}
# (source, target) substitutions applied in order by cleanup_text().
replacements = [
    ('_', '_'),  # maps '_' to itself, i.e. a no-op
    ('*', 'v'),
    ('`', ';'),
    ('~', ','),
    ('+', ','),
    ('\\', ';'),
    ('|', ';'),
    # The next two keys had been stored as mojibake (the UTF-8 bytes of the
    # box-drawing characters read as cp1252), so they could never match.
    # Written as escapes so the literals survive any future re-encoding.
    ('\u255a', ''),  # BOX DRAWINGS DOUBLE UP AND RIGHT
    ('\u2557', ''),  # BOX DRAWINGS DOUBLE DOWN AND LEFT
]
class EndpointHandler():
    """Callable handler that turns Tibetan text into base64-encoded WAV audio.

    The text is converted to Wylie, cleaned up, has its numbers spelled out,
    and is then synthesized with a SpeechT5 model and HiFi-GAN vocoder using
    the "Lhasa(female)" speaker embedding. The waveform is noise-reduced and
    returned as a WAV file encoded in base64.
    """

    # Sample rate (Hz) used for noise reduction, the WAV header and the response.
    SAMPLE_RATE = 16000

    def __init__(self, path=""):
        """Load the processor, models and speaker embedding once.

        Args:
            path: Not used by this handler; all checkpoints are loaded from
                fixed hub ids.
        """
        # Use the GPU when there is one, otherwise fall back to CPU instead of
        # crashing on a hard-coded 'cuda'.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model.to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        self.vocoder.to(self.device)
        # Load the embedding a single time rather than reading the .npy file
        # and copying it to the device on every request.
        embedding = np.load(speaker_embeddings['Lhasa(female)'])
        self.speaker_embedding = torch.tensor(embedding).to(self.device)

    @staticmethod
    def _encode_wav(samples, sample_rate):
        """Return *samples* as a float32 WAV file, base64-encoded to ``str``."""
        buffer = io.BytesIO()
        # Written to memory: no temporary file to create, reopen or clean up.
        wavfile.write(buffer, sample_rate, np.asarray(samples, dtype=np.float32))
        return base64.b64encode(buffer.getvalue()).decode("utf-8")

    def _response(self, samples):
        """Build the response dictionary for the waveform *samples*."""
        return {
            "sample_rate": self.SAMPLE_RATE,
            "audio_base64": self._encode_wav(samples, self.SAMPLE_RATE),
        }

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Union[int, str]]:
        """Synthesize speech for the text found under ``data["inputs"]``.

        Args:
            data: Request payload. The ``"inputs"`` entry is popped from it
                and must be a string.

        Returns:
            A dictionary with ``"sample_rate"`` (int) and ``"audio_base64"``
            (a base64-encoded WAV file). Blank input yields a WAV file that
            contains no samples, so the response shape is always the same.

        Raises:
            ValueError: If the payload carries no string under ``"inputs"``.
        """
        text = data.pop("inputs", data)
        if not isinstance(text, str):
            raise ValueError('request payload must contain a string under the "inputs" key')
        if not text.strip():
            return self._response(np.zeros(0, dtype=np.float32))

        # Text normalization: Tibetan script -> Wylie -> symbol cleanup -> numbers.
        text = converter.toWylie(text)
        text = cleanup_text(text)
        text = replace_numbers_with_convert(text)

        inputs = self.processor(text=text, return_tensors="pt")
        # Truncate to the longest sequence the model's config allows.
        input_ids = inputs["input_ids"][..., :self.model.config.max_text_positions]

        speech = self.model.generate_speech(
            input_ids.to(self.device),
            self.speaker_embedding,
            vocoder=self.vocoder,
        )
        # noisereduce works on NumPy data, so bring the waveform to host
        # memory first; its result is converted with np.asarray when encoding.
        waveform = speech.detach().cpu().numpy()
        denoised = nr.reduce_noise(y=waveform, sr=self.SAMPLE_RATE)
        return self._response(denoised)