File size: 4,279 Bytes
8c4d22a
 
64fcafd
6449e88
8c4d22a
 
6edda28
8c4d22a
6449e88
 
 
8c4d22a
2ffc7e7
8c4d22a
64fcafd
2ffc7e7
6449e88
 
 
 
 
8c4d22a
 
c49c056
64fcafd
 
 
2ffc7e7
8c4d22a
 
 
 
 
2ffc7e7
 
6449e88
64fcafd
 
 
 
6449e88
 
8c4d22a
6449e88
64fcafd
 
 
 
c49c056
64fcafd
 
8c4d22a
c49c056
2ffc7e7
 
 
c49c056
 
 
 
 
8c4d22a
2ffc7e7
 
 
8c4d22a
c49c056
6edda28
8c4d22a
6449e88
8c4d22a
6449e88
9d153e7
 
 
6449e88
 
 
 
 
 
 
 
 
 
 
8c4d22a
 
 
a575152
8c4d22a
6449e88
64fcafd
8c4d22a
6449e88
 
 
8c4d22a
64fcafd
 
 
 
6449e88
8c4d22a
 
 
 
a9c23eb
 
 
 
 
 
 
 
 
8c4d22a
64fcafd
8c4d22a
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from io import BytesIO
import requests
from os.path import exists, join
from espnet2.bin.tts_inference import Text2Speech
from enum import Enum
from .formatter import preprocess_text
from .stress import sentence_to_stress, stress_dict, stress_with_model
from torch import no_grad
import numpy as np
import time
import soundfile as sf


class Voices(Enum):
    """List of available voices for the model.

    `TTS.tts` validates its `voice` argument against the integer `value`
    of these members (pass `Voices.Olena.value`, not `Voices.Olena`) and
    forwards that integer to the synthesizer as the speaker id (`sids`).
    """

    # NOTE(review): the ids presumably mirror the speaker indices of the
    # released multi-speaker model -- confirm before renumbering.
    Olena = 4
    Mykyta = 3
    Lada = 2
    Dmytro = 1
    Olga = 5


class Stress(Enum):
    """Options for how to stress a sentence.

    - `dictionary` - performs lookup in dictionary, taking into account
      grammatical case of a word and its neighbors
    - `model` - stress using transformer model

    `TTS.tts` expects the string `value` of a member (e.g.
    `Stress.Dictionary.value`); `model` selects `stress_with_model`,
    any other accepted value selects `stress_dict`.
    """

    Dictionary = "dictionary"
    Model = "model"


class TTS:
    """Text-to-speech engine built on an ESPnet `Text2Speech` synthesizer.

    Creating an instance makes sure the model and config files are present
    in the cache folder (downloading them when missing) and loads the
    synthesizer from them. `tts` then converts text to WAV audio.
    """

    def __init__(self, cache_folder=None, device="cpu") -> None:
        """
        Set up the text-to-speech engine, from download to model creation.

        - `cache_folder` - directory where model files are looked up and
          downloaded to. Uses the current directory when `None`.
        - `device` - device string handed to `Text2Speech` (default `"cpu"`).
        """
        self.device = device
        self.__setup_cache(cache_folder)

    def tts(self, text: str, voice: int, stress: str, output_fp=None, speed=1.0):
        """
        Run a Text-to-Speech engine and output to `output_fp` BytesIO-like object.
        - `text` - your model input text.
        - `voice` - one of predefined voices from `Voices` enum (its integer value).
        - `stress` - stress method options, predefined in `Stress` enum (its string value).
        - `output_fp` - file-like object output. A new in-memory `BytesIO`
          is created for every call when omitted.
        - `speed` - positive number; the synthesizer is called with
          `alpha = 1 / speed`.

        Returns a tuple `(output_fp, text)`: the output object rewound to
        position 0 and the preprocessed, stressed text that was synthesized.

        Raises `ValueError` for an unknown `stress` or `voice` value or a
        non-positive `speed`.
        """
        stress_options = [option.value for option in Stress]
        if stress not in stress_options:
            raise ValueError(
                f"Invalid value for stress option selected! Please use one of the following values: {', '.join(stress_options)}."
            )

        voice_options = [option.value for option in Voices]
        if voice not in voice_options:
            # Voice ids are ints, so they must be converted before joining;
            # otherwise this error path itself fails with a TypeError.
            raise ValueError(
                f"Invalid value for voice selected! Please use one of the following values: {', '.join(str(option) for option in voice_options)}."
            )

        if speed <= 0:
            # Guards the `1 / speed` below against division by zero and
            # against a negative duration factor.
            raise ValueError(
                "Invalid value for speed selected! Please use a positive number."
            )

        if output_fp is None:
            # Created per call: a default evaluated in the signature would
            # be one buffer shared (and overwritten) by all callers.
            output_fp = BytesIO()

        use_model = stress == Stress.Model.value

        text = preprocess_text(text, use_model)
        text = sentence_to_stress(
            text, stress_with_model if use_model else stress_dict
        )

        # synthesis
        with no_grad():
            start = time.time()
            wav = self.synthesizer(
                text, sids=np.array(voice), decode_conf={"alpha": 1 / speed}
            )["wav"]

        # Real-time factor: processing time divided by audio duration.
        rtf = (time.time() - start) / (len(wav) / self.synthesizer.fs)
        print(f"RTF = {rtf:5f}")

        sf.write(
            output_fp,
            wav.view(-1).cpu().numpy(),
            self.synthesizer.fs,
            "PCM_16",
            format="wav",
        )

        # Rewind so the caller can read the audio from the beginning.
        output_fp.seek(0)

        return output_fp, text

    def __setup_cache(self, cache_folder=None):
        """Downloads models and stores them into `cache_folder`, then loads
        the synthesizer from those files. By default stores in current directory."""
        from os import makedirs

        print("downloading uk/mykyta/vits-tts")
        release_number = "v4.0.0"
        model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model.pth"
        config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.yaml"

        if cache_folder is None:
            cache_folder = "."

        # A not-yet-existing cache folder would make the download fail on open().
        makedirs(cache_folder, exist_ok=True)

        model_path = join(cache_folder, "model.pth")
        config_path = join(cache_folder, "config.yaml")

        self.__download(model_link, model_path)
        self.__download(config_link, config_path)

        # Load from the cache folder paths; bare file names would only work
        # when the cache folder happens to be the current directory.
        self.synthesizer = Text2Speech(
            train_config=config_path,
            model_file=model_path,
            device=self.device,
            # Only for VITS
            noise_scale=0.333,
            noise_scale_dur=0.333,
        )

    def __download(self, url, file_name):
        """Downloads file from `url` into local `file_name` file.

        Does nothing when `file_name` already exists. Raises
        `requests.HTTPError` when the server answers with an error status,
        in which case `file_name` is not created.
        """
        if exists(file_name):
            print(f"Found {file_name}. Skipping download...")
            return

        from os import replace

        print(f"Downloading {file_name}")
        # Write to a temporary name and rename at the end, so an interrupted
        # or failed download is never mistaken for a cached file later.
        # A leftover ".part" file is simply overwritten by the next attempt.
        partial_name = f"{file_name}.part"
        with requests.get(url, allow_redirects=True, stream=True) as r:
            r.raise_for_status()
            with open(partial_name, "wb") as file:
                # Stream in chunks instead of holding the whole model in memory.
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)
        replace(partial_name, file_name)