|
import datasets.arrow_dataset |
|
import numpy as np |
|
import torch |
|
import transformers.pipelines.text_to_audio |
|
from datasets import load_dataset |
|
from transformers import pipeline |
|
|
|
|
|
def load_model() -> transformers.pipelines.text_to_audio.TextToAudioPipeline: |
|
""" |
|
Подгрузка модели преобразования текста в речь |
|
:return: class TextToAudioPipeline |
|
""" |
|
return pipeline("text-to-speech", "microsoft/speecht5_tts") |
|
|
|
|
|
def load_speaker_dataset() -> datasets.arrow_dataset.Dataset: |
|
""" |
|
Подгрузка датасета для озвучивания текста |
|
:return: class Dataset |
|
""" |
|
return load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") |
|
|
|
|
|
def text_to_speech( |
|
text: str, |
|
synthesiser: transformers.pipelines.text_to_audio.TextToAudioPipeline, |
|
embeddings_dataset: datasets.arrow_dataset.Dataset |
|
) -> (np.ndarray, int): |
|
""" |
|
Преобразование текста в речь |
|
:param text: Текст |
|
:param synthesiser: pipeline для озвучивания текста |
|
:param embeddings_dataset: dataset для озвучивания текста |
|
:return: tuple (audio data, sampling rate) |
|
""" |
|
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) |
|
|
|
speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding}) |
|
|
|
return speech['audio'], speech['sampling_rate'] |
|
|