openpecha
/

speecht5-tts-01

Inference Endpoints

Model card Files Files and versions Community

TenzinGayche committed on Sep 28, 2023

Commit

1b84f28

•

1 Parent(s): 0cba583

Upload handle.py

Files changed (1) hide show

handle.py +78 -0

handle.py ADDED Viewed

	@@ -0,0 +1,78 @@

+from typing import  Dict
+import librosa
+import numpy as np
+import torch
+import pyewts
+import noisereduce as nr
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from num2tib.core import convert
+from num2tib.core import convert2text
+import re
+# Shared pyewts converter instance; EndpointHandler.__call__ passes the request
+# text through its toWylie() method before any other text clean-up.
+converter = pyewts.pyewts()
+def replace_numbers_with_convert(sentence, wylie=True):
+    pattern = r'\d+(\.\d+)?'
+    def replace(match):
+        return convert(match.group(), wylie)
+    result = re.sub(pattern, replace, sentence)
+    return result
+def cleanup_text(inputs):
+    for src, dst in replacements:
+        inputs = inputs.replace(src, dst)
+    return inputs
+speaker_embeddings = {
+    "Lhasa(female)": "female_2.npy",
+}
+replacements = [
+    ('_', '_'),
+    ('*', 'v'),
+    ('`', ';'),
+    ('~', ','),
+    ('+', ','),
+    ('\\', ';'),
+    ('|', ';'),
+    ('╚',''),
+    ('╗','')
+]
+class EndpointHandler():
+    def __init__(self, path=""):
+        # load the model
+        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
+        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
+        self.model.to('cuda')
+        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+    def __call__(self, data: Dict[str]) -> Dict[str, str]:
+        """
+        Args:
+            data (:obj:):
+                includes the deserialized audio file as bytes
+        Return:
+            A :obj:`dict`:. base64 encoded image
+        """
+        # process input
+        if len(text.strip()) == 0:
+            return (16000, np.zeros(0).astype(np.int16))
+        text = converter.toWylie(text)
+        text=cleanup_text(text)
+        text=replace_numbers_with_convert(text)
+        inputs = self.processor(text=text, return_tensors="pt")
+        # limit input length
+        input_ids = inputs["input_ids"]
+        input_ids = input_ids[..., :self.model.config.max_text_positions]
+        speaker_embedding = np.load(speaker_embeddings['Lhasa(female)'])
+        speaker_embedding = torch.tensor(speaker_embedding)
+        speech = self.model.generate_speech(input_ids.to('cuda'), speaker_embedding.to('cuda'), vocoder=vocoder.to('cuda'))
+        speech = nr.reduce_noise(y=speech.to('cpu'), sr=16000)
+        return (16000, speech)