# NOTE(review): removed non-code residue (file-size banner, git-blame commit
# hashes and a line-number gutter) that leaked in from a blame/extraction view
# and made the file syntactically invalid.
from typing import Dict, List, Text, Any
import os
import re
from transformers import SpeechT5ForTextToSpeech
from transformers import SpeechT5Processor
from transformers import SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
import soundfile as sf
import torch
import numpy as np
# ---------------------------------------------------------------------------
# Device / precision selection (module level, runs once at import time).
# ---------------------------------------------------------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    # Native bfloat16 is available on compute capability >= 8 (Ampere and
    # newer, including Hopper's capability 9); older CUDA GPUs fall back to
    # float16.  The original `== 8` check wrongly excluded newer GPUs.
    dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
else:
    # CPU inference stays in full precision.
    dtype = torch.float32
class EndpointHandler():
    """HF Inference-Endpoints handler for a Dutch SpeechT5 text-to-speech model.

    Loads the fine-tuned SpeechT5 model, its processor, the HiFi-GAN vocoder
    and an x-vector speaker encoder once at startup, then synthesises speech
    for the text supplied with each request.
    """

    # Layout/punctuation characters stripped from the input text (compiled
    # once).  NOTE: '´' is deliberately NOT in this class — the original code
    # removed it here and then tried to map it to an apostrophe afterwards,
    # which was dead code; it is now normalised via _QUOTE_TABLE instead.
    _REMOVE_RE = re.compile(r"[=–“”…]")
    # Quote-like characters normalised to a plain ASCII apostrophe.
    _QUOTE_TABLE = str.maketrans({"‘": "'", "’": "'", "´": "'"})
    # (Ukrainian) Cyrillic -> Latin transliteration table; str.translate
    # supports multi-character replacement strings, so one C-level pass
    # replaces the original loop of 33 chained str.replace calls.
    _CYR2LAT = str.maketrans({
        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'h', 'д': 'd', 'е': 'e',
        'ж': 'zh', 'з': 'z', 'и': 'y', 'й': 'j', 'к': 'k', 'л': 'l',
        'м': 'm', 'н': 'n', 'о': 'o', 'п': 'p', 'р': 'r', 'с': 's',
        'т': 't', 'у': 'u', 'ф': 'f', 'х': 'h', 'ц': 'ts', 'ч': 'ch',
        'ш': 'sh', 'щ': 'sch', 'ь': "'", 'ю': 'ju', 'я': 'ja',
        'є': 'je', 'і': 'i', 'ї': 'ji', 'ґ': 'g',
    })

    def __init__(self, path=""):
        """Load all required models and pre-compute the default speaker voice.

        :param path: model directory supplied by the Inference-Endpoints
            runtime; unused because every checkpoint is pulled from the Hub.
        """
        self.model_id = "Oysiyl/speecht5_tts_common_voice_nl"
        self.spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
        # TTS model runs in the globally selected mixed-precision dtype.
        self.model = SpeechT5ForTextToSpeech.from_pretrained(self.model_id, torch_dtype=dtype).to(device)
        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        # Vocoder is left in its default float32 weights; see __call__ where
        # the spectrogram is cast to match.
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
        self.speaker_model = EncoderClassifier.from_hparams(
            source=self.spk_model_name,
            run_opts={"device": device},
            savedir=os.path.join("/tmp", self.spk_model_name)
        )
        # Default voice: x-vector computed from the bundled reference clip.
        waveform, _samplerate = sf.read("speaker.wav")
        self.speaker_embeddings = self.create_speaker_embedding(waveform)

    @staticmethod
    def remove_special_characters_s(text: Text) -> Text:
        """Normalise quote characters, drop layout punctuation, lowercase.

        :param text: raw input text.
        :return: cleaned, lowercased text.
        """
        # Map curly/acute quote variants to "'" FIRST, then strip the
        # remaining unwanted characters.  (Fixes the original ordering bug
        # where '´' was removed before it could be replaced.)
        text = text.translate(EndpointHandler._QUOTE_TABLE)
        text = EndpointHandler._REMOVE_RE.sub('', text)
        return text.lower()

    @staticmethod
    def cyrillic_to_latin(text: Text) -> Text:
        """Transliterate (Ukrainian) Cyrillic characters to Latin.

        :param text: text possibly containing Cyrillic characters.
        :return: text with every mapped Cyrillic character transliterated.
        """
        return text.translate(EndpointHandler._CYR2LAT)

    def create_speaker_embedding(self, waveform: np.ndarray) -> torch.Tensor:
        """Encode a raw waveform into an L2-normalised x-vector embedding.

        :param waveform: 1-D array of audio samples (mono).
        :return: embedding of shape (1, embed_dim) on ``device`` with the
            global inference ``dtype``.  (Original annotation said
            ``np.ndarray`` but a tensor was always returned.)
        """
        with torch.no_grad():
            emb = self.speaker_model.encode_batch(torch.tensor(waveform))
            emb = torch.nn.functional.normalize(emb, dim=2)
        # Collapse the singleton batch/channel dims and cast/move in one step
        # instead of the original device->numpy->tensor->device round trip
        # (same values, fewer copies).
        return emb.squeeze().unsqueeze(0).to(device=device, dtype=dtype)

    def __call__(self, data: Any) -> np.ndarray:
        """Synthesise speech for ``data["inputs"]``.

        :param data: dict with key ``inputs`` (text) and optionally
            ``speaker_embeddings`` (a raw waveform used to clone a voice).
        :return: mono audio as a 1-D numpy array, or an error dict when no
            text was provided (original error contract preserved).
        """
        text = data.pop("inputs", None)
        # Reject requests that carry no text.
        if text is None:
            return {"error": "Please provide a text."}
        waveform = data.pop("speaker_embeddings", None)
        # Fall back to the default voice when no waveform is supplied.
        if waveform is None:
            speaker_embeddings = self.speaker_embeddings
        else:
            speaker_embeddings = self.create_speaker_embedding(waveform)
        # Clean the text the same way the training data was normalised.
        text = self.remove_special_characters_s(text)
        text = self.cyrillic_to_latin(text)
        input_ids = self.processor(text=text, return_tensors="pt")['input_ids'].to(device)
        # Pure inference: no_grad now also covers generate_speech (the
        # original tracked gradients there, wasting memory).
        with torch.no_grad():
            spectrogram = self.model.generate_speech(input_ids, speaker_embeddings)
            # The vocoder weights are float32 while the model may run in
            # fp16/bf16 on GPU; cast the spectrogram to the vocoder's dtype
            # to avoid a dtype mismatch (no-op on CPU).
            speech = self.vocoder(spectrogram.to(self.vocoder.dtype))
        # .cpu() is a no-op for tensors already on the CPU, so one return
        # path covers both devices.
        return speech.cpu().numpy()
# NOTE(review): removed stray '|' residue left over from extraction.