import re
import os
import nltk
import torch
import pickle
import torchaudio
import numpy as np
from TTS.tts.models.xtts import Xtts
from nltk.tokenize import sent_tokenize
from TTS.tts.configs.xtts_config import XttsConfig


def _load_array(filename):
    """ Opens a file a returns it, used with numpy files """
    with open(filename, 'rb') as f:
        return pickle.load(f)


os.environ['COQUI_TOS_AGREED'] = '1'

# Punkt tokenizer data, used by sent_tokenize to split the text into sentences
nltk.download('punkt')

# Directory with the XTTS model used to generate audio based on a voice sample
model_path = "tts_model"

config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))

model = Xtts.init_from_config(config)
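# Note: use_deepspeed=True requires the deepspeed package to be installed;
# set it to False to fall back to the plain PyTorch inference path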
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=True,
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
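
# Precomputed speaker conditioning data (assumption: generated beforehand with
# model.get_conditioning_latents() on a reference voice clip and pickled to disk)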

# Speaker latent
path_latents = 'assets/gpt_cond_latent.npy'
gpt_cond_latent = _load_array(path_latents)

# Speaker embedding
path_embedding = 'assets/speaker_embedding.npy'
speaker_embedding = _load_array(path_embedding)


def get_audio(text: str, language: str = 'es', saving_path: str = 'output') -> None:
    """
    Creates an audio file from the given text and saves it as '{saving_path}.wav'
    :param text: text to convert to audio
    :param language: 'es', 'en' or 'pt', language used for the audio file
    :param saving_path: path to save the audio (without the .wav extension)
    :return: None
    """
    # Create the audio for the answer and save it as '{saving_path}.wav'
    _save_audio(text, language, saving_path)
    return


def _save_audio(text: str, language: str, path_audio: str) -> None:
    """
    Splits the text into sentences, cleans them and creates an audio segment for each one,
    then concatenates all the segments and saves them into a single file.
    :param text: input text
    :param language: language used in the audio
    :param path_audio: saving path of the audio (without the .wav extension)
    :return: None
    """
    # Split the answer into sentences and clean it
    sentences = _get_clean_text(text, language)

    # Get the voice of each sentence
    audio_segments = []
    for sentence in sentences:
        audio_stream = _get_voice(sentence, language)
        audio_stream = torch.tensor(audio_stream)
        audio_segments.append(audio_stream)

    # Concatenate and save all audio segments
    concatenated_audio = torch.cat(audio_segments, dim=0)
    torchaudio.save(f'{path_audio}.wav', concatenated_audio.unsqueeze(0), 24000)
    return


def _get_voice(sentence: str, language: str) -> np.ndarray:
    """
    Generates the audio for the given sentence and returns it as a numpy array of wav samples
    :param sentence: input sentence
    :param language: language used in the audio
    :return: numpy array with the audio
    """
    out = model.inference(
        sentence,
        language=language,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=0.1
    )
    return out['wav']


def _get_clean_text(text: str, language: str) -> list[str]:
    """
    Splits the text into smaller sentences using nltk and removes links.
    :param text: input text for the audio
    :param language: language used for the audio ('es', 'en', 'pt')
    :return: list of sentences
    """
    # Replace links with a spoken phrase and set the maximum sentence length for the language
    if language == 'en':
        clean_answer = re.sub(r'http[s]?://\S+', 'the following link', text)
        max_characters = 250
    elif language == 'es':
        clean_answer = re.sub(r'http[s]?://\S+', 'el siguiente link', text)
        max_characters = 239
    else:
        clean_answer = re.sub(r'http[s]?://\S+', 'o seguinte link', text)
        max_characters = 203

    # Change the name from Bella to Bela
    clean_answer = clean_answer.replace('Bella', 'Bela')

    # Remove Florida and zipcode
    clean_answer = re.sub(r', FL \d+', "", clean_answer)

    # Split the answer into sentences with nltk and make sure none of them exceeds the
    # maximum number of characters
    split_sentences = sent_tokenize(clean_answer)
    sentences = []
    for sentence in split_sentences:
        if len(sentence) > max_characters:
            sentences.extend(_split_sentence(sentence, max_characters))
        else:
            sentences.append(sentence)

    return sentences


def _split_sentence(sentence: str, max_characters: int) -> list[str]:
    """
    Used when a sentence is still too long. The split point is the comma nearest to the middle
    of the sentence; if there is no comma, the nearest space is used, or simply the middle. If the
    resulting parts are still too long, the function recurses on them.
    :param sentence: sentence to be split
    :param max_characters: max number of characters a sentence can have
    :return: list of sentences
    """
    # Get index of each comma
    sentences = []
    commas = [i for i, c in enumerate(sentence) if c == ',']

    # No commas, search for spaces
    if len(commas) == 0:
        commas = [i for i, c in enumerate(sentence) if c == ' ']

    # No commas or spaces, split it in the middle
    if len(commas) == 0:
        sentences.append(sentence[:len(sentence) // 2])
        sentences.append(sentence[len(sentence) // 2:])
        return sentences

    # Choose the comma (or space) nearest to the middle of the sentence
    split_point = min(commas, key=lambda x: abs(x - (len(sentence) // 2)))

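    # If the split point is a comma, drop the comma and the character right after it
    # (normally the following space); if it is a space, drop only the space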
    if sentence[split_point] == ',':
        left = sentence[:split_point]
        right = sentence[split_point + 2:]
    else:
        left = sentence[:split_point]
        right = sentence[split_point + 1:]

    if len(left) > max_characters:
        sentences.extend(_split_sentence(left, max_characters))
    else:
        sentences.append(left)
    if len(right) > max_characters:
        sentences.extend(_split_sentence(right, max_characters))
    else:
        sentences.append(right)

    return sentences
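

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only: the text below is a placeholder, not part of
    # the original module); writes 'output.wav' with the cloned speaker's voice
    get_audio(
        text='Hola, puedes encontrar más información en https://example.com cuando quieras.',
        language='es',
        saving_path='output',
    )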