openpecha
/

speecht5-tts-01

Inference Endpoints

Model card Files Files and versions Community

speecht5-tts-01 / handler.py

TenzinGayche's picture

Update handler.py

75700af 10 months ago

No virus

3.1 kB

	import base64
	import io
	import os
	import re
	import tempfile
	from typing import Dict, Any, Union

	import librosa
	import noisereduce as nr
	import numpy as np
	import pyewts
	import requests
	import torch
	from num2tib.core import convert
	from num2tib.core import convert2text
	from scipy.io import wavfile
	from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
	# Module-wide pyewts converter instance; EndpointHandler.__call__ uses its
	# toWylie() method on the incoming text before synthesis.
	converter = pyewts.pyewts()
	def download_file(url, destination, timeout=60):
	    """Download ``url`` with an HTTP GET and write the body to ``destination``.

	    Args:
	        url: Address of the resource to fetch.
	        destination: Path of the file to create (or overwrite) with the
	            response body.
	        timeout: Seconds to wait on the server before giving up, so a stalled
	            connection cannot hang the caller forever.

	    Raises:
	        requests.HTTPError: If the server answers with a 4xx/5xx status.
	        requests.RequestException: On connection failures or timeouts.
	    """
	    response = requests.get(url, timeout=timeout)
	    # Fail loudly on an error status; otherwise the error page would be saved
	    # under `destination` as if it were the requested file.
	    response.raise_for_status()
	    with open(destination, 'wb') as file:
	        file.write(response.content)

	# Runs at import time (not just an example): fetches the file that
	# speaker_embeddings refers to and saves it as 'female_2.npy' in the current
	# working directory, where the handler later loads it with np.load.
	download_file('https://huggingface.co/openpecha/speecht5-tts-01/resolve/main/female_2.npy', 'female_2.npy')
	def replace_numbers_with_convert(sentence, wylie=True):
	    """Return ``sentence`` with every number swapped for ``convert(number, wylie)``.

	    A number is a run of digits optionally followed by a dot and more digits.
	    Text that contains no digits is returned unchanged.
	    """
	    number_re = re.compile(r'\d+(\.\d+)?')
	    return number_re.sub(lambda found: convert(found.group(), wylie), sentence)

	def cleanup_text(inputs):
	    """Apply every (old, new) pair of ``replacements`` to ``inputs``, in list order."""
	    cleaned = inputs
	    for pair in replacements:
	        old, new = pair
	        cleaned = cleaned.replace(old, new)
	    return cleaned

	# Voice name -> file name of the .npy array that is loaded and passed to the
	# model as the speaker embedding.
	speaker_embeddings = {"Lhasa(female)": "female_2.npy"}

	# Ordered (old, new) substitutions applied by cleanup_text(); order matters
	# because later entries see the output of earlier ones.
	replacements = [
	    ('_', '_'),  # maps underscore to itself, i.e. a no-op
	    ('*', 'v'),
	    ('`', ';'),
	    ('~', ','),
	    ('+', ','),
	    ('\\', ';'),
	    # Backslash followed by a pipe. Spelled with an escaped backslash because
	    # '\|' is an invalid escape sequence (a SyntaxWarning on Python 3.12+);
	    # the runtime value is unchanged.
	    # NOTE(review): the entry above has already replaced every backslash, so
	    # this pair can never match - confirm whether a bare '|' was intended.
	    ('\\|', ';'),
	    ('╚', ''),
	    ('╗', ''),
	]





	class EndpointHandler():
	    """Text-to-speech request handler built on SpeechT5 and a HiFi-GAN vocoder.

	    Each call converts the request text to Wylie with pyewts, applies
	    cleanup_text() and replace_numbers_with_convert(), synthesises a waveform,
	    runs noise reduction on it and returns the audio as a base64-encoded WAV.
	    """

	    SAMPLE_RATE = 16000
	    MODEL_ID = "TenzinGayche/TTS_run3_ep20_174k_b"
	    VOCODER_ID = "microsoft/speecht5_hifigan"
	    SPEAKER = "Lhasa(female)"

	    def __init__(self, path=""):
	        """Load the processor, model, vocoder and speaker embedding once.

	        Args:
	            path: Accepted for interface compatibility; not used, because the
	                checkpoints are loaded by their hub ids.
	        """
	        # Use the GPU when there is one instead of assuming 'cuda' exists.
	        self.device = "cuda" if torch.cuda.is_available() else "cpu"
	        self.processor = SpeechT5Processor.from_pretrained(self.MODEL_ID)
	        self.model = SpeechT5ForTextToSpeech.from_pretrained(self.MODEL_ID)
	        self.model.to(self.device)
	        self.vocoder = SpeechT5HifiGan.from_pretrained(self.VOCODER_ID)
	        self.vocoder.to(self.device)
	        # Read the embedding once here rather than from disk on every request.
	        embedding = np.load(speaker_embeddings[self.SPEAKER])
	        self.speaker_embedding = torch.tensor(embedding).to(self.device)

	    def __call__(self, data: Dict[str, Any]) -> Dict[str, Union[int, str]]:
	        """Synthesise speech for the text stored under ``data["inputs"]``.

	        Args:
	            data: Request payload. The text is read, and removed, from the
	                "inputs" key.

	        Returns:
	            A dict with "sample_rate" (16000) and "audio_base64", the
	            base64-encoded bytes of a WAV file. Blank text produces a WAV
	            that contains zero samples.

	        Raises:
	            ValueError: If the payload holds no string under "inputs".
	        """
	        text = data.pop("inputs", data)
	        if not isinstance(text, str):
	            raise ValueError(
	                'request payload must provide the text to speak as a string under "inputs"'
	            )

	        if text.strip():
	            samples = self._synthesize(text)
	        else:
	            # Nothing to say: keep the response shape identical to the normal
	            # path so clients can decode it the same way.
	            samples = np.zeros(0, dtype=np.float32)

	        return {
	            "sample_rate": self.SAMPLE_RATE,
	            "audio_base64": self._encode_wav_base64(samples, self.SAMPLE_RATE),
	        }

	    def _synthesize(self, text):
	        """Turn ``text`` into a denoised float32 NumPy waveform."""
	        text = converter.toWylie(text)
	        text = cleanup_text(text)
	        text = replace_numbers_with_convert(text)

	        inputs = self.processor(text=text, return_tensors="pt")
	        # Keep at most config.max_text_positions tokens.
	        input_ids = inputs["input_ids"][..., :self.model.config.max_text_positions]

	        # Inference only, so skip gradient tracking.
	        with torch.no_grad():
	            speech = self.model.generate_speech(
	                input_ids.to(self.device),
	                self.speaker_embedding,
	                vocoder=self.vocoder,
	            )

	        # noisereduce operates on NumPy arrays and returns one, so convert the
	        # tensor before the call rather than calling .numpy() on its result.
	        denoised = nr.reduce_noise(y=speech.detach().cpu().numpy(), sr=self.SAMPLE_RATE)
	        return np.asarray(denoised, dtype=np.float32)

	    @staticmethod
	    def _encode_wav_base64(samples, sample_rate):
	        """Encode ``samples`` as a WAV file in memory and return it as base64 text."""
	        # librosa.output.write_wav no longer exists in current librosa; writing
	        # with scipy into a BytesIO also avoids leaving temp files behind when
	        # something fails midway.
	        buffer = io.BytesIO()
	        wavfile.write(buffer, sample_rate, samples)
	        return base64.b64encode(buffer.getvalue()).decode("utf-8")