File size: 2,762 Bytes
a6157cf
 
 
ef9dd5a
 
a6157cf
 
 
 
 
 
 
ef9dd5a
a6157cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef9dd5a
 
 
 
 
 
 
a6157cf
 
 
 
ef9dd5a
 
a6157cf
 
 
ef9dd5a
a6157cf
 
 
 
ef9dd5a
a6157cf
 
ef9dd5a
a6157cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from typing import  Dict, Any,Union
import librosa
import numpy as np
import torch
import pyewts
import noisereduce as nr
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from num2tib.core import convert
from num2tib.core import convert2text
import base64
import re
import requests
converter = pyewts.pyewts()
def download_file(url, destination, timeout=60):
    """Download *url* and write the response body to *destination*.

    Args:
        url: Address of the file to fetch.
        destination: Local path the downloaded bytes are written to; an
            existing file is overwritten.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently saving an error page as the file.
    response.raise_for_status()
    with open(destination, 'wb') as file:
        file.write(response.content)

# Runs at import time: fetches the 'female_2.npy' file that the
# `speaker_embeddings` mapping refers to and that the handler reads with np.load.
download_file('https://huggingface.co/openpecha/speecht5-tts-01/resolve/main/female_2.npy', 'female_2.npy')
def replace_numbers_with_convert(sentence, wylie=True):
    """Return *sentence* with every number replaced by ``convert``'s output.

    A number is a run of digits, optionally followed by a decimal point and
    more digits. Each matched string is handed to ``convert`` together with
    ``wylie`` (as the second positional argument) and whatever ``convert``
    returns is substituted in its place.

    Args:
        sentence: Text to scan for numbers.
        wylie: Flag forwarded unchanged to ``convert``.

    Returns:
        The text with all matched numbers substituted; text without digits
        is returned unchanged.
    """
    def _converted(number_match):
        return convert(number_match.group(), wylie)

    return re.sub(r'\d+(\.\d+)?', _converted, sentence)

def cleanup_text(inputs):
    """Apply every substitution in the module-level ``replacements`` list.

    The pairs are applied in list order, each one to the result of the
    previous substitution.

    Args:
        inputs: Text to clean.

    Returns:
        The text after all substitutions have been applied.
    """
    cleaned = inputs
    for pair in replacements:
        old, new = pair
        cleaned = cleaned.replace(old, new)
    return cleaned

# Maps a voice label to the local .npy file that holds its speaker embedding.
speaker_embeddings = {"Lhasa(female)": "female_2.npy"}

# Ordered (source, target) substitutions that cleanup_text applies one after
# another to the text.
replacements = [
    ("_", "_"),  # NOTE(review): maps to itself, so this entry changes nothing
    ("*", "v"),
    ("`", ";"),
    ("~", ","),
    ("+", ","),
    ("\\", ";"),
    ("|", ";"),
    ("╚", ""),  # removed outright
    ("╗", ""),  # removed outright
]





class EndpointHandler():
    """Callable request handler that synthesizes speech from text.

    The text is converted to Wylie, cleaned, has its numbers rewritten, and is
    then fed through a SpeechT5 text-to-speech model and HiFi-GAN vocoder.
    The resulting waveform is noise-reduced and returned base64-encoded.
    """

    # Sample rate reported in every response and passed to noise reduction.
    SAMPLE_RATE = 16000

    def __init__(self, path=""):
        """Load the processor, model, vocoder and speaker embedding once.

        Args:
            path: Accepted for interface compatibility but not used; the
                checkpoints are loaded from hard-coded hub identifiers.
        """
        # Fall back to the CPU so the handler can still start without a GPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model.to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        self.vocoder.to(self.device)

        # Read the embedding file once here rather than on every request.
        # NOTE(review): the array is used with the shape stored in the file;
        # confirm it matches what generate_speech expects.
        embedding = np.load(speaker_embeddings['Lhasa(female)'])
        self.speaker_embedding = torch.tensor(embedding).to(self.device)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Union[int, str]]:
        """Synthesize speech for the text found under ``data["inputs"]``.

        Args:
            data: Request payload. The ``"inputs"`` entry is popped from it
                and must be the text to speak.

        Returns:
            A dict with ``"sample_rate"`` (always 16000) and ``"audio"``,
            the base64-encoded raw bytes of the noise-reduced waveform
            array (presumably float32 samples; confirm before decoding).
            Empty or whitespace-only text yields an empty ``"audio"``
            string.

        Raises:
            TypeError: If the payload does not carry a string under
                ``"inputs"``.
        """
        text = data.pop("inputs", data)
        if not isinstance(text, str):
            raise TypeError('expected a string under the "inputs" key')

        # Nothing to say: answer with the same schema as a real response
        # instead of a differently shaped value.
        if not text.strip():
            return {
                "sample_rate": self.SAMPLE_RATE,
                "audio": "",
            }

        # Normalize the text before tokenizing it.
        text = converter.toWylie(text)
        text = cleanup_text(text)
        text = replace_numbers_with_convert(text)

        inputs = self.processor(text=text, return_tensors="pt")
        # Limit the input length to what the model's config allows.
        input_ids = inputs["input_ids"][..., :self.model.config.max_text_positions]

        speech = self.model.generate_speech(
            input_ids.to(self.device),
            self.speaker_embedding,
            vocoder=self.vocoder,
        )
        # Hand noisereduce a plain NumPy array rather than a torch tensor.
        waveform = speech.detach().cpu().numpy()
        waveform = nr.reduce_noise(y=waveform, sr=self.SAMPLE_RATE)

        return {
            "sample_rate": self.SAMPLE_RATE,
            # tobytes() replaces the deprecated ndarray.tostring().
            "audio": base64.b64encode(waveform.tobytes()).decode("utf-8"),
        }