# whisper-small-3 / handler.py
import base64
import re
from typing import Any, Dict, Union

import noisereduce as nr
import numpy as np
import pyewts
import requests
import torch
from num2tib.core import convert
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Converter between Tibetan Unicode and Wylie transliteration
converter = pyewts.pyewts()


def download_file(url, destination):
    """Download `url` and write its contents to `destination`."""
    response = requests.get(url)
    with open(destination, 'wb') as file:
        file.write(response.content)


# Fetch the speaker embedding used for synthesis
download_file('https://huggingface.co/openpecha/speecht5-tts-01/resolve/main/female_2.npy', 'female_2.npy')


def replace_numbers_with_convert(sentence, wylie=True):
    """Replace Arabic numerals in `sentence` with their spelled-out Tibetan form."""
    pattern = r'\d+(\.\d+)?'

    def replace(match):
        return convert(match.group(), wylie)

    return re.sub(pattern, replace, sentence)


def cleanup_text(inputs):
    """Apply the character substitutions defined in `replacements` below."""
    for src, dst in replacements:
        inputs = inputs.replace(src, dst)
    return inputs


# Speaker embeddings available to the handler (downloaded above)
speaker_embeddings = {
    "Lhasa(female)": "female_2.npy",
}

# Character substitutions applied to the Wylie text before tokenization
replacements = [
    ('_', '_'),
    ('*', 'v'),
    ('`', ';'),
    ('~', ','),
    ('+', ','),
    ('\\', ';'),
    ('|', ';'),
    ('╚', ''),
    ('╗', ''),
]


class EndpointHandler:
    def __init__(self, path=""):
        # Load the SpeechT5 processor, the fine-tuned Tibetan TTS model, and the HiFi-GAN vocoder
        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model.to('cuda')
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Union[int, str]]:
        """Synthesize speech for the Tibetan text in `data["inputs"]`.

        Args:
            data (Dict[str, Any]): request payload; the input text is read from the "inputs" key.

        Returns:
            Dict[str, Union[int, str]]: the sample rate and the base64-encoded raw audio bytes.
        """
        text = data.pop("inputs", data)

        # Empty input: return silence in the same response format
        if len(text.strip()) == 0:
            return {
                "sample_rate": 16000,
                "audio": base64.b64encode(np.zeros(0, dtype=np.int16).tobytes()).decode("utf-8"),
            }
        # Convert Tibetan Unicode to Wylie, normalize punctuation, and spell out numbers
        text = converter.toWylie(text)
        text = cleanup_text(text)
        text = replace_numbers_with_convert(text)

        # Tokenize and truncate to the model's maximum input length
        inputs = self.processor(text=text, return_tensors="pt")
        input_ids = inputs["input_ids"]
        input_ids = input_ids[..., :self.model.config.max_text_positions]

        # Load the speaker embedding and synthesize the waveform
        speaker_embedding = np.load(speaker_embeddings['Lhasa(female)'])
        speaker_embedding = torch.tensor(speaker_embedding)
        speech = self.model.generate_speech(input_ids.to('cuda'), speaker_embedding.to('cuda'), vocoder=self.vocoder.to('cuda'))

        # Denoise on the CPU (noisereduce expects a NumPy array)
        speech = nr.reduce_noise(y=speech.cpu().numpy(), sr=16000)

        return {
            "sample_rate": 16000,
            "audio": base64.b64encode(speech.tobytes()).decode("utf-8"),
        }
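

# ---------------------------------------------------------------------------
# Minimal local-testing sketch (not part of the endpoint contract). It assumes
# a CUDA device is available and that the synthesized audio is float32 PCM:
# generate_speech outputs float32, and noisereduce is assumed to preserve that
# dtype. The sample sentence is an arbitrary placeholder greeting.
if __name__ == "__main__":
    handler = EndpointHandler()
    result = handler({"inputs": "བཀྲ་ཤིས་བདེ་ལེགས།"})
    waveform = np.frombuffer(base64.b64decode(result["audio"]), dtype=np.float32)
    print(f"sample_rate={result['sample_rate']}, samples={waveform.shape[0]}")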