Spaces:

iblfe
/

test

Runtime error

File size: 8,197 Bytes

b585c7f

import textwrap
import re

from src.utils import flatten_list, have_emoji, have_langid


def setup_nltk():
    """Download the NLTK "punkt" data used for splitting text into sentences."""
    from nltk import download as fetch_nltk_resource

    fetch_nltk_resource("punkt")


# If the installation steps were followed, the punkt data is already present; do not download here so air-gapped setups keep working.
# setup_nltk()

# Keys of the sentence-state dict, in the positional order that
# unpack_state() returns them and pack_state() expects them.
sentence_keys = ['sentence_list', 'index']


def init_sentence_state():
    """Create a fresh sentence-tracking state.

    Returns:
        A new dict with an empty ``sentence_list`` and ``index`` set to 0.
    """
    return {'sentence_list': [], 'index': 0}


def unpack_state(sentence_state):
    """Return the state's values as a tuple, ordered like ``sentence_keys``."""
    return tuple(sentence_state[name] for name in sentence_keys)


def pack_state(sentence_state, *args):
    """Write positional values back into the state dict and return it.

    Values are matched to ``sentence_keys`` by position.  The dict itself is
    updated in place so a caller holding a reference to it sees the new
    values; the stored values (including the list) are simply rebound.

    Args:
        sentence_state: The state dict to update.
        *args: New values, one per entry of ``sentence_keys`` and in the same
            order.  Extra values are ignored.

    Returns:
        The same ``sentence_state`` object that was passed in.

    Raises:
        IndexError: If fewer values than ``sentence_keys`` entries are given.
    """
    # don't change dict reference so parent can reuse.  Ok to lose reference for list
    for position, key in enumerate(sentence_keys):
        sentence_state[key] = args[position]
    return sentence_state


def split_sentences(sentence, n=250):
    """
    Split text at whitespace into chunks of roughly at most n characters.

    Words are never cut or altered.  Whitespace runs are kept inside the chunk
    they follow, except that a whitespace run consisting of exactly one newline
    ends the current chunk and is itself dropped (this can yield an empty chunk).
    Only words are checked against the limit, so a chunk may exceed n by its
    trailing whitespace, and a single word longer than n is emitted unsplit.

    # 250 due to [!] Warning: The text length exceeds the character limit of 250 for language 'en', this might cause truncated audio.

    Args:
        sentence: The text to split.
        n: Target maximum chunk length in characters.

    Returns:
        List of chunk strings in their original order.
    """
    # The capturing group makes re.split return the whitespace runs as tokens too
    tokens = re.split(r'(\s+)', sentence)
    chunks = []
    current = []
    current_length = 0

    for token in tokens:
        # Empty strings appear when the text starts or ends with whitespace
        if not token:
            continue

        if token == '\n':
            # A lone newline closes the current chunk and is not kept
            chunks.append("".join(current))
            current, current_length = [], 0
            continue

        if not token.isspace() and current_length + len(token) > n:
            # The word does not fit into the current chunk
            if current:
                chunks.append("".join(current))
                current, current_length = [token], len(token)
            else:
                # Nothing accumulated, so the word alone exceeds n: emit it as is
                chunks.append(token)
                current_length = 0
            continue

        # Whitespace (other than a lone newline) or a word that still fits
        current.append(token)
        current_length += len(token)

    # Add the last chunk if it exists
    if current:
        chunks.append("".join(current))

    return chunks


def _get_sentences(response, verbose=False, min_start=15, max_length=250):
    """Break ``response`` into sentence strings without altering any characters.

    The first ``min_start`` characters are never tokenized on their own; they
    are glued to the front of the first sentence found in the remainder.

    Args:
        response: Text to split.
        verbose: Accepted for interface compatibility; not used here.
        min_start: Number of leading characters that stay attached to the
            first sentence.
        max_length: Passed to ``split_sentences`` as the chunk length limit.

    Returns:
        List of non-blank sentence strings.  If the remainder yields no
        sentence and ``min_start > 0``, the list holds just the leading
        ``min_start`` characters of ``response``.
    """
    # no mutations of characters allowed here, only breaking apart or merging
    import nltk

    head = response[:min_start]
    tail = response[min_start:]
    # refuse to tokenize first min_start characters into sentence, so language detection works and logic simpler
    sentences = nltk.sent_tokenize(tail)
    # split any long sentences
    sentences = flatten_list([split_sentences(x, max_length) for x in sentences])
    # drop empty sentences
    sentences = [x for x in sentences if x.strip()]
    # restore first min_start if set
    if min_start > 0:
        if sentences:
            # Slice the first sentence out of the response itself, so characters dropped
            # at the start of the tail (e.g. a lone newline) are kept and the result
            # is still a literal substring of the response.
            offset = tail.find(sentences[0])
            if offset >= 0:
                sentences[0] = head + tail[:offset + len(sentences[0])]
            else:
                sentences[0] = head + sentences[0]
        else:
            sentences.append(head)

    return sentences


def get_sentence(response, sentence_state, is_final=False, verbose=False):
    """Pull the next sentence out of ``response``, starting at the saved index.

    Args:
        response: The full text received so far.
        sentence_state: State dict holding ``sentence_list`` and ``index``.
        is_final: When true, whatever text remains is returned even if it was
            not recognised as a completed sentence.
        verbose: Forwarded to the helper functions.

    Returns:
        A 3-tuple ``(sentence, sentence_state, flag)``:
        - at least two sentences found: the cleaned first one, with ``index``
          moved past it, and ``False``;
        - otherwise with ``is_final``: the cleaned remaining text and ``True``;
        - otherwise: ``None`` and ``True``.

    Raises:
        ValueError: If the first sentence cannot be located in the remaining text.
    """
    # get state items
    sentence_list, index = unpack_state(sentence_state)
    remaining = response[index:]
    sentences = _get_sentences(remaining, min_start=15 if index == 0 else 0, verbose=verbose)

    if len(sentences) < 2:
        if not is_final:
            return None, pack_state(sentence_state, sentence_list, index), True
        # then just return last sentence
        leftover = ' '.join(sentences)
        cleaned_sentence = clean_sentence(leftover, verbose=verbose)
        sentence_list.append(leftover)
        return cleaned_sentence, pack_state(sentence_state, sentence_list, index), True

    # detected new completed sentence: advance index past it
    first = sentences[0]
    index += remaining.index(first) + len(first)
    sentence_list.append(first)
    # only clean for result, to avoid mis-handling of sentences index
    cleaned_sentence = clean_sentence(first, verbose=verbose)
    return cleaned_sentence, pack_state(sentence_state, sentence_list, index), False


def clean_sentence(sentence, verbose=False):
    """Prepare a sentence for speech synthesis.

    Removes code spans and parenthesised text, rewrites a few abbreviations
    and the H2O.ai name into speakable words, drops emojis when the emoji
    package is available, spells decimal points as "dot", trims leading
    punctuation, and turns a bare list marker ("1." to "10.") into a word.

    Args:
        sentence: Text to clean; ``None`` or empty yields an empty string.
        verbose: When true, print diagnostics.

    Returns:
        The cleaned sentence, or ``''`` if nothing is left.
    """
    if not sentence:
        if verbose:
            print("empty sentence")
        return ''

    # Remove code blocks, inline code and parenthesised text
    sentence = re.sub(r"```.*?```", "", sentence, flags=re.DOTALL)
    sentence = re.sub(r"`.*?`", "", sentence, flags=re.DOTALL)
    sentence = re.sub(r"\(.*?\)", "", sentence, flags=re.DOTALL)

    # remove leftover (unpaired) marks
    sentence = sentence.replace("```", "")
    sentence = sentence.replace("...", " ")
    sentence = sentence.replace("(", " ")
    sentence = sentence.replace(")", " ")

    sentence = sentence.replace("Dr. ", "Doctor ")
    sentence = sentence.replace(" w/ ", " with ")

    # Spell the name out phonetically, whatever its capitalisation
    sentence = re.sub(r"h2o\.ai", "aych two oh ae eye.", sentence, flags=re.IGNORECASE)

    # filter out emojis
    if have_emoji:
        import emoji
        sentence = ''.join(ch for ch in sentence if not emoji.is_emoji(ch))

    # fix floating expressions
    sentence = re.sub(r'(\d+)\.(\d+)', r"\1 dot \2", sentence)

    sentence = sentence.strip()

    # Both checks run in sequence, so ". ." at the start loses all three characters
    if sentence.startswith(('. ', '? ', '! ', ', ')):
        sentence = sentence[2:]
    if sentence.startswith(('.', '?', '!', ',')):
        sentence = sentence[1:]

    # A sentence that is only a list marker is spoken as the number word
    spoken_markers = {
        '1.': 'One',
        '2.': 'Two',
        '3.': 'Three',
        '4.': 'Four',
        '5.': 'Five',
        '6.': 'Six',
        '7.': 'Seven',
        '8.': 'Eight',
        '9.': 'Nine',
        '10.': 'Ten',
    }
    sentence = spoken_markers.get(sentence, sentence)

    if not sentence:
        if verbose:
            print("EMPTY SENTENCE after processing")
        return ''

    if verbose:
        print("Sentence for speech: %s" % sentence)

    return sentence


def detect_language(prompt, supported_languages, verbose=False):
    """Pick the language code to use for ``prompt``.

    Args:
        prompt: Text whose language should be detected.
        supported_languages: Collection of accepted language codes.
        verbose: When true, print which language was chosen.

    Returns:
        ``"en"`` when langid is unavailable, when the prompt has 15 characters
        or fewer, or when the predicted language is not supported; otherwise
        the predicted code (with ``"zh"`` mapped to ``"zh-cn"``).
    """
    if not have_langid:
        # if no package, just return english
        return "en"

    import langid

    if len(prompt) <= 15:
        # Hard to detect language fast in short sentence, use english default
        if verbose:
            print(f"Language: Prompt is short or autodetect language disabled using english for xtts")
        return "en"

    # Fast language autodetection
    language_predicted = langid.classify(prompt)[0].strip()  # strip need as there is space at end!
    if language_predicted == "zh":
        # we use zh-cn on xtts
        language_predicted = "zh-cn"

    if language_predicted in supported_languages:
        language = language_predicted
    else:
        print(f"Detected a language not supported by xtts :{language_predicted}, switching to english for now")
        language = "en"

    if verbose:
        print(f"Language: Predicted sentence language:{language_predicted} , using language for xtts:{language}")

    return language