File size: 8,197 Bytes
b585c7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import textwrap
import re

from src.utils import flatten_list, have_emoji, have_langid


def setup_nltk():
    """
    Download the NLTK "punkt" data via ``nltk.download``.

    nltk is imported inside the function so that importing this module does not require it.
    """
    from nltk import download as nltk_download  # punkt is what we use to split into sentences
    nltk_download("punkt")


# If the installation steps were followed, the punkt data is already present; do not download at import time, so air-gapped setups keep working.
# setup_nltk()

# Keys of the sentence-state dict, in the positional order used by unpack_state() and pack_state()
sentence_keys = ['sentence_list', 'index']


def init_sentence_state():
    """
    Build a fresh sentence state: an empty list of emitted sentences and a cursor index of 0.

    A new dict (with a new list) is returned on every call.
    """
    return {'sentence_list': [], 'index': 0}


def unpack_state(sentence_state):
    """
    Return the values of ``sentence_state`` as a tuple, ordered as in ``sentence_keys``.

    Raises KeyError if the state is missing one of those keys.
    """
    return tuple(sentence_state[name] for name in sentence_keys)


def pack_state(sentence_state, *args):
    """
    Store ``args`` into ``sentence_state`` positionally, following ``sentence_keys``, and return it.

    The dict is updated in place (never replaced) so a caller holding a reference to it
    sees the new values; the values themselves (e.g. the sentence list) are simply rebound.

    :param sentence_state: state dict to update
    :param args: one value per entry of ``sentence_keys``, in the same order
    :return: the same ``sentence_state`` object that was passed in
    :raises IndexError: if fewer values than keys are supplied
    """
    # No per-type handling is needed: list and non-list values are stored the same way.
    for keyi, key in enumerate(sentence_keys):
        sentence_state[key] = args[keyi]
    return sentence_state


def split_sentences(sentence, n=250):
    """
    Split text at whitespace into chunks whose words fit within ``n`` characters.

    Behaviour details:
     * A whitespace run that is exactly one newline ("\\n") ends the current chunk and is dropped;
       if nothing is buffered at that point an empty chunk is emitted.
     * Any other whitespace run is kept verbatim in the current chunk (without a length check,
       so a chunk can exceed ``n`` by its trailing whitespace).
     * A single word longer than ``n`` is never cut; it is emitted whole.

    # 250 due to [!] Warning: The text length exceeds the character limit of 250 for language 'en', this might cause truncated audio.

    :param sentence: text to split
    :param n: maximum number of characters a chunk may hold before a new word starts a new chunk
    :return: list of chunk strings
    """
    # Raw string so that \s is a regex escape, not an (invalid) Python string escape.
    # The capturing group makes re.split return the whitespace runs as tokens too.
    words = re.split(r'(\s+)', sentence)
    sentences = []
    current_sentence = []
    current_length = 0

    for word in words:
        # re.split yields '' at the edges when the text starts/ends with whitespace
        if word == '':
            continue

        if word.isspace():
            if word == '\n':
                # A lone newline is a hard break: flush what we have and start over
                sentences.append("".join(current_sentence))
                current_sentence = []
                current_length = 0
            else:
                # Other whitespace stays attached to the current chunk
                current_sentence.append(word)
                current_length += len(word)
        elif current_length + len(word) > n:
            if current_sentence:
                # Word does not fit: flush the chunk and start a new one with this word
                sentences.append("".join(current_sentence))
                current_sentence = [word]
                current_length = len(word)
            else:
                # Nothing buffered and the word alone is too long: emit it unsplit
                sentences.append(word)
                current_length = 0
        else:
            current_sentence.append(word)
            current_length += len(word)

    # Flush whatever remains
    if current_sentence:
        sentences.append("".join(current_sentence))

    return sentences


def _get_sentences(response, verbose=False, min_start=15, max_length=250):
    """
    Break ``response`` into sentences without altering characters (only splitting or merging).

    The first ``min_start`` characters are withheld from the tokenizer and glued back onto the
    first resulting sentence, so the opening sentence is never shorter than that prefix
    (keeps language detection workable and the logic simpler).

    :param response: text to split
    :param verbose: currently unused
    :param min_start: number of leading characters that are not tokenized on their own
    :param max_length: passed to split_sentences as the per-chunk character limit
    :return: list of sentence strings
    """
    import nltk

    tokenized = nltk.sent_tokenize(response[min_start:])
    # break up over-long sentences, then discard pieces that are empty or whitespace-only
    pieces = flatten_list([split_sentences(piece, max_length) for piece in tokenized])
    sentences = [piece for piece in pieces if piece.strip()]

    if min_start > 0:
        # put the withheld prefix back: onto the first sentence, or as the only entry
        head = response[:min_start]
        if sentences:
            sentences[0] = head + sentences[0]
        else:
            sentences.append(head)

    return sentences


def get_sentence(response, sentence_state, is_final=False, verbose=False):
    """
    Try to pull the next sentence out of ``response``, starting at the index stored in the state.

    :param response: full response text so far
    :param sentence_state: state dict (see init_sentence_state); updated in place
    :param is_final: if True and no complete sentence is detected, return whatever text remains
    :param verbose: passed on to the helpers
    :return: tuple ``(sentence, sentence_state, flag)`` where ``sentence`` is the cleaned text or
             None, and ``flag`` is False only when a sentence was split off ahead of further text
    """
    sentence_list, index = unpack_state(sentence_state)
    remaining = response[index:]
    # only the very start of the response gets the protected 15-character prefix
    min_start = 0 if index != 0 else 15
    sentences = _get_sentences(remaining, min_start=min_start, verbose=verbose)

    if len(sentences) < 2:
        if not is_final:
            # nothing complete yet; leave the state as it was
            return None, pack_state(sentence_state, sentence_list, index), True
        # final call: hand back whatever is left as one sentence
        leftover = ' '.join(sentences)
        cleaned_sentence = clean_sentence(leftover, verbose=verbose)
        sentence_list.append(leftover)
        return cleaned_sentence, pack_state(sentence_state, sentence_list, index), True

    # at least two sentences: the first one is complete, so advance the cursor past it
    first = sentences[0]
    index += remaining.index(first) + len(first)
    sentence_list.append(first)
    # clean only the returned text; the raw sentence is what index bookkeeping relies on
    cleaned_sentence = clean_sentence(first, verbose=verbose)
    return cleaned_sentence, pack_state(sentence_state, sentence_list, index), False


# Spoken form for a sentence that consists solely of a list number such as "3."
_SPOKEN_LIST_NUMBERS = {
    '1.': 'One',
    '2.': 'Two',
    '3.': 'Three',
    '4.': 'Four',
    '5.': 'Five',
    '6.': 'Six',
    '7.': 'Seven',
    '8.': 'Eight',
    '9.': 'Nine',
    '10.': 'Ten',
}


def clean_sentence(sentence, verbose=False):
    """
    Prepare a sentence for speech synthesis.

    Steps, in order: drop fenced/inline code and parenthesised text, remove leftover
    fence/ellipsis/parenthesis marks, expand a few abbreviations, spell out "h2o.ai"
    (any capitalisation), remove emojis when the emoji package is available, read decimal
    points as "dot", strip surrounding whitespace and leading punctuation, and replace a bare
    list number ("1." .. "10.") by its word.

    :param sentence: text to clean; None or empty yields ''
    :param verbose: print diagnostics about empty input/output and the final sentence
    :return: cleaned sentence, or '' if nothing is left
    """
    if not sentence:
        if verbose:
            print("empty sentence")
        return ''

    # Remove code blocks, inline code and parenthesised asides (non-greedy, may span lines)
    sentence = re.sub(r"```.*?```", "", sentence, flags=re.DOTALL)
    sentence = re.sub(r"`.*?`", "", sentence, flags=re.DOTALL)
    sentence = re.sub(r"\(.*?\)", "", sentence, flags=re.DOTALL)

    # remove marks left over from unbalanced fences/parentheses, and ellipses
    sentence = sentence.replace("```", "")
    sentence = sentence.replace("...", " ")
    sentence = sentence.replace("(", " ")
    sentence = sentence.replace(")", " ")

    sentence = sentence.replace("Dr. ", "Doctor ")
    sentence = sentence.replace(" w/ ", " with ")

    # one case-insensitive pass covers H2O.ai, H2O.AI, h2o.ai and any other capitalisation
    sentence = re.sub(r"h2o\.ai", "aych two oh ae eye.", sentence, flags=re.IGNORECASE)

    # filter out emojis
    if have_emoji:
        import emoji
        sentence = ''.join([x for x in sentence if not emoji.is_emoji(x)])

    # fix floating expressions
    sentence = re.sub(r'(\d+)\.(\d+)', r"\1 dot \2", sentence)

    sentence = sentence.strip()

    # drop leading punctuation: first a "mark + space" pair, then a single remaining mark
    if sentence.startswith(('. ', '? ', '! ', ', ')):
        sentence = sentence[2:]
    if sentence.startswith(('.', '?', '!', ',')):
        sentence = sentence[1:]

    # a sentence that is only a list number is spoken as the number word
    sentence = _SPOKEN_LIST_NUMBERS.get(sentence, sentence)

    if len(sentence) == 0:
        if verbose:
            print("EMPTY SENTENCE after processing")
        return ''

    if verbose:
        print("Sentence for speech: %s" % sentence)

    return sentence


def detect_language(prompt, supported_languages, verbose=False):
    """
    Pick the language code to use for ``prompt``.

    Returns "en" when langid is unavailable, when the prompt is 15 characters or shorter,
    or when the detected language is not in ``supported_languages``; otherwise returns the
    detected code (with "zh" mapped to "zh-cn").

    :param prompt: text whose language should be detected
    :param supported_languages: collection of accepted language codes
    :param verbose: print which language was predicted and which one is used
    :return: language code string
    """
    if not have_langid:
        # if no package, just return english
        return "en"

    import langid

    if len(prompt) <= 15:
        # Hard to detect language fast in short sentence, use english default
        if verbose:
            print("Language: Prompt is short or autodetect language disabled using english for xtts")
        return "en"

    # Fast language autodetection
    language_predicted = langid.classify(prompt)[0].strip()  # strip need as there is space at end!
    if language_predicted == "zh":
        # we use zh-cn on xtts
        language_predicted = "zh-cn"

    if language_predicted in supported_languages:
        language = language_predicted
    else:
        print(f"Detected a language not supported by xtts :{language_predicted}, switching to english for now")
        language = "en"

    if verbose:
        print(f"Language: Predicted sentence language:{language_predicted} , using language for xtts:{language}")

    return language