File size: 3,336 Bytes
c5ed230
 
 
d94ccbe
c5ed230
35c29ec
 
 
c5ed230
5854014
c5ed230
 
 
 
 
 
 
35c29ec
 
 
 
 
 
 
 
 
 
 
 
c5ed230
 
 
5854014
35c29ec
5854014
c5ed230
 
 
 
 
 
 
 
 
 
 
 
 
 
5854014
c5ed230
5854014
 
 
 
 
 
 
c5ed230
5854014
c5ed230
 
5854014
 
c5ed230
 
5854014
c5ed230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5854014
 
c5ed230
 
 
 
5854014
 
c5ed230
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import regex as re
import config
from .utils import check_is_none
from logger import logger

# Read the configured language-identification library (defaults to fastlid)
clf = getattr(config, "LANGUAGE_IDENTIFICATION_LIBRARY", "fastlid")


def clasify_lang(text, speaker_lang):
    pattern = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
              r'\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」' \
              r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
    words = re.split(pattern, text)

    pre = ""
    p = 0

    if clf.upper() == "FASTLID" or clf.upper() == "FASTTEXT":
        from fastlid import fastlid
        detect = fastlid
        if speaker_lang != None: fastlid.set_languages = speaker_lang
    elif clf.upper() == "LANGID":
        import langid
        detect = langid.classify
        if speaker_lang != None: langid.set_languages(speaker_lang)
    else:
        raise ValueError(f"Wrong LANGUAGE_IDENTIFICATION_LIBRARY in config.py")

    for word in words:

        if check_is_none(word): continue

        lang = detect(word)[0]

        if pre == "":
            text = text[:p] + text[p:].replace(word, f'[{lang.upper()}]' + word, 1)
            p += len(f'[{lang.upper()}]')
        elif pre != lang:
            text = text[:p] + text[p:].replace(word, f'[{pre.upper()}][{lang.upper()}]' + word, 1)
            p += len(f'[{pre.upper()}][{lang.upper()}]')
        pre = lang
        p += text[p:].index(word) + len(word)
    text += f"[{pre.upper()}]"

    return text


def cut(text, max):
    """Split text into chunks of roughly `max` characters.

    Chunks are only cut at punctuation boundaries, so each emitted chunk ends
    at a separator; a chunk may exceed `max` when no separator occurs earlier.

    :param text: input string
    :param max: target maximum chunk length (the loop flushes once the
                accumulated length reaches it)
    :return: list of chunks; full chunks are whitespace-stripped, the trailing
             remainder (if any) is appended as-is
    """
    pattern = r'[!(),—+\-.:;??。,、;:]+'
    sentences = re.split(pattern, text)
    separators = re.findall(pattern, text)

    sentence_list, count, p = [], 0, 0

    # Walk the (sentence, separator) pairs in order; `sentences` has one more
    # element than `separators`, so zip stops before the trailing remainder.
    for sentence, separator in zip(sentences, separators):
        count += len(sentence) + len(separator)
        if count >= max:
            sentence_list.append(text[p:p + count].strip())
            p += count
            count = 0

    # Append whatever remains after the last flushed chunk.
    if p < len(text):
        sentence_list.append(text[p:])

    return sentence_list


def sentence_split(text, max=50, lang="auto", speaker_lang=None):
    """Split text into language-tagged sentences ready for synthesis.

    :param text: input text
    :param max: target chunk length passed to cut(); <= 0 disables splitting
    :param lang: language code, "auto" for per-word detection, or "mix" when
                 the caller already embedded language tags
    :param speaker_lang: languages supported by the speaker; a single-element
                         list forces that language
    :return: list of tagged sentence strings
    """
    # A speaker that supports exactly one language forces that language.
    if speaker_lang is not None and len(speaker_lang) == 1:
        if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]:
            logger.debug(
                f"lang \"{lang}\" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}")
        lang = speaker_lang[0]

    def tag(segment):
        # Wrap a segment in explicit tags, or auto-detect language per word.
        if lang.upper() == "AUTO":
            return clasify_lang(segment, speaker_lang)
        return f"[{lang.upper()}]{segment}[{lang.upper()}]"

    if lang.upper() == "MIX":
        # MIX: the text already carries its own language tags.
        sentence_list = [text]
    elif max <= 0:
        # Splitting disabled: tag the whole text as one sentence.
        sentence_list = [tag(text)]
    else:
        sentence_list = [tag(chunk) for chunk in cut(text, max) if not check_is_none(chunk)]

    for sentence in sentence_list:
        logger.debug(sentence)

    return sentence_list