openpecha
/

speecht5-tts-01

Inference Endpoints

Model card | Files and versions | Community

speecht5-tts-01 / handler.py

TenzinGayche's picture

Update handler.py

4f18e92 about 1 year ago

3.31 kB

	from typing import Dict, Any,Union
	import tempfile
	import numpy as np
	import torch
	import pyewts
	import noisereduce as nr
	from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
	from num2tib.core import convert
	from num2tib.core import convert2text
	import soundfile as sf
	import base64
	import re
	import requests
	import os
	# Shared pyewts converter instance; EndpointHandler.__call__ passes the request
	# text through its toWylie() method before any further text processing.
	converter = pyewts.pyewts()
	def download_file(url, destination, timeout=60):
	    """Download ``url`` with an HTTP GET and write the body to ``destination``.

	    Args:
	        url: Address of the file to fetch.
	        destination: Local path the response body is written to (binary
	            mode, overwriting any existing file).
	        timeout: Seconds passed to ``requests.get`` as its timeout, so a
	            stalled server cannot block the caller forever.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	            The status is checked before ``destination`` is opened, so an
	            error page is never saved in place of the requested file.
	        requests.RequestException: For connection problems and timeouts.
	    """
	    with requests.get(url, stream=True, timeout=timeout) as response:
	        response.raise_for_status()
	        with open(destination, 'wb') as file:
	            # Stream in chunks instead of holding the whole body in memory.
	            for chunk in response.iter_content(chunk_size=1 << 16):
	                file.write(chunk)

	# Runs at import time: fetches the file that speaker_embeddings['Lhasa(female)']
	# points to, which EndpointHandler later reads with np.load.
	download_file('https://huggingface.co/openpecha/speecht5-tts-01/resolve/main/female_2.npy', 'female_2.npy')
	def replace_numbers_with_convert(sentence, wylie=True):
	    """Return ``sentence`` with each number replaced by ``convert``'s output.

	    A number is a run of digits with an optional decimal part (``12``,
	    ``3.5``). Every match is handed to ``convert`` from ``num2tib.core``
	    and the value it returns is substituted in place of the match.

	    Args:
	        sentence: Text to scan for numbers.
	        wylie: Forwarded unchanged as the second argument to ``convert``.

	    Returns:
	        The text with all numbers substituted; text without digits comes
	        back unchanged.
	    """
	    number_re = re.compile(r'\d+(\.\d+)?')
	    return number_re.sub(lambda found: convert(found.group(), wylie), sentence)

	def cleanup_text(inputs):
	    """Apply the module-level ``replacements`` pairs to ``inputs``.

	    The (old, new) pairs are applied one after another in list order, each
	    on the result of the previous one, and the final string is returned.
	    """
	    cleaned = inputs
	    for old, new in replacements:
	        cleaned = cleaned.replace(old, new)
	    return cleaned

	# Voice name -> path of the .npy file that is loaded with np.load as that
	# voice's speaker embedding.
	speaker_embeddings = {"Lhasa(female)": "female_2.npy"}

	# Ordered (old, new) substitutions that cleanup_text applies to the text
	# after Wylie conversion and before it is handed to the processor.
	replacements = [
	    ('_', '_'),  # identity pair: leaves the text unchanged
	    ('*', 'v'),
	    ('`', ';'),
	    ('~', ','),
	    ('+', ','),
	    ('\\', ';'),
	    # Backslash followed by a pipe. Spelled with an escaped backslash because
	    # '\|' is an invalid escape sequence that Python warns about; the runtime
	    # value (two characters) is unchanged.
	    # NOTE(review): the pair above already rewrites every backslash, so this
	    # entry can never match. If a bare '|' was intended, confirm against the
	    # preprocessing used at training time before changing it.
	    ('\\|', ';'),
	    ('╚', ''),
	    ('╗', ''),
	]

	class EndpointHandler():
	    """Text-to-speech request handler: text in, base64-encoded WAV out.

	    The instance holds the SpeechT5 processor, the text-to-speech model, the
	    HiFi-GAN vocoder and the speaker embedding, all loaded once in
	    ``__init__``. Calling the instance with a request payload synthesizes
	    speech for the payload's text.
	    """

	    # Sample rate used for noise reduction, for the WAV file and in the response.
	    SAMPLE_RATE = 16000

	    def __init__(self, path=""):
	        """Load the processor, model, vocoder and speaker embedding.

	        Args:
	            path: Accepted for interface compatibility; it is not used, the
	                checkpoints are loaded by their hard-coded hub names.
	        """
	        # Use the GPU when there is one; fall back to CPU instead of failing.
	        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	        # load the model
	        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
	        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
	        self.model.to(self.device)
	        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
	        self.vocoder.to(self.device)

	        # The embedding file never changes between requests, so read it once
	        # here rather than on every call.
	        embedding = np.load(speaker_embeddings['Lhasa(female)'])
	        self.speaker_embedding = torch.tensor(embedding).to(self.device)

	    def __call__(self, data: Dict[str, Any]) -> Dict[str, Union[int, str]]:
	        """Synthesize speech for the text in a request payload.

	        Args:
	            data: Request payload. The text is popped from its ``"inputs"``
	                entry; when that key is absent the payload itself is used
	                in its place.

	        Returns:
	            A dict with ``"sample_rate"`` (always 16000) and
	            ``"audio_base64"``, the base64 text of a WAV file written with
	            the PCM_24 subtype. For empty or whitespace-only text no audio
	            is generated and ``"audio_base64"`` is an empty string, so the
	            response has the same shape in every case.
	        """
	        text = data.pop("inputs", data)

	        # Nothing to synthesize: answer with the regular response shape.
	        if not text.strip():
	            return {"sample_rate": self.SAMPLE_RATE, "audio_base64": ""}

	        # process input
	        text = converter.toWylie(text)
	        text = cleanup_text(text)
	        text = replace_numbers_with_convert(text)

	        input_ids = self.processor(text=text, return_tensors="pt")["input_ids"]
	        # Drop tokens beyond the model's configured maximum text length.
	        input_ids = input_ids[..., :self.model.config.max_text_positions]

	        speech = self.model.generate_speech(
	            input_ids.to(self.device),
	            self.speaker_embedding,
	            vocoder=self.vocoder,
	        )

	        # Hand noisereduce a NumPy array and normalise its result to one, so
	        # the WAV writer below never depends on tensor-only methods.
	        waveform = speech.detach().cpu().numpy()
	        denoised = np.asarray(nr.reduce_noise(y=waveform, sr=self.SAMPLE_RATE))

	        return {
	            "sample_rate": self.SAMPLE_RATE,
	            "audio_base64": self._encode_wav(denoised),  # Base64-encoded audio data
	        }

	    def _encode_wav(self, samples):
	        """Write ``samples`` to a temporary WAV file and return it as base64 text."""
	        # Reserve a unique path; the file is reopened by path below.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav_file:
	            temp_wav_path = temp_wav_file.name
	        try:
	            sf.write(temp_wav_path, samples, self.SAMPLE_RATE, 'PCM_24')
	            with open(temp_wav_path, "rb") as wav_file:
	                return base64.b64encode(wav_file.read()).decode("utf-8")
	        finally:
	            # Remove the temporary file even when writing or encoding fails.
	            os.remove(temp_wav_path)