import re
from typing import Callable

from tools.i18n.i18n import I18nAuto

i18n = I18nAuto()

METHODS = dict()


def get_method(name: str) -> Callable:
    method = METHODS.get(name, None)
    if method is None:
        raise ValueError(f"Method {name} not found")
    return method


def register_method(name):
    def decorator(func):
        METHODS[name] = func
        return func

    return decorator


splits = {"，", "。", "？", "！", ",", ".", "?", "!", "~", ":", "：", "—", "…", }


def split_big_text(text, max_len=510):
    # Collect the full-width and half-width punctuation marks
    punctuation = "".join(splits)

    # Split the text on punctuation, keeping the delimiters
    segments = re.split('([' + punctuation + '])', text)

    # Initialize the result list and the current segment
    result = []
    current_segment = ''

    for segment in segments:
        # If appending the new piece would push the current segment past max_len,
        # flush the current segment to the result list and start a new one
        if len(current_segment + segment) > max_len:
            result.append(current_segment)
            current_segment = segment
        else:
            current_segment += segment

    # Append the final segment
    if current_segment:
        result.append(current_segment)

    return result


def split(todo_text):
    todo_text = todo_text.replace("……", "。").replace("——", "，")
    if todo_text[-1] not in splits:
        todo_text += "。"
    i_split_head = i_split_tail = 0
    len_text = len(todo_text)
    todo_texts = []
    while 1:
        if i_split_head >= len_text:
            break  # The text is guaranteed to end with punctuation, so the last chunk was already appended; just exit
        if todo_text[i_split_head] in splits:
            i_split_head += 1
            todo_texts.append(todo_text[i_split_tail:i_split_head])
            i_split_tail = i_split_head
        else:
            i_split_head += 1
    return todo_texts


def cut_sentence_multilang(text, max_length=30):
    # Initialize the counters
    word_count = 0
    in_word = False

    for index, char in enumerate(text):
        if char.isspace():  # whitespace ends any English word in progress
            in_word = False
        elif char.isascii() and not in_word:  # an ASCII (English) character starts a new word
            word_count += 1
            in_word = True
        elif not char.isascii():  # every non-ASCII (non-English) character counts as one word
            word_count += 1
        if word_count > max_length:
            return text[:index], text[index:]

    return text, ""


# contributed by XTer
# Simple length-based splitting so that no overly long sentence survives
def split_long_sentence(text, max_length=510):
    opts = []
    sentences = text.split('\n')
    for sentence in sentences:
        prev_text, sentence = cut_sentence_multilang(sentence, max_length)
        while sentence.strip() != "":
            opts.append(prev_text)
            prev_text, sentence = cut_sentence_multilang(sentence, max_length)
        opts.append(prev_text)
    return "\n".join(opts)


# No splitting
@register_method("cut0")
def cut0(inp):
    return inp


# Split every four sentences
@register_method("cut1")
def cut1(inp):
    inp = inp.strip("\n")
    inps = split(inp)
    split_idx = list(range(0, len(inps), 4))
    split_idx[-1] = None
    if len(split_idx) > 1:
        opts = []
        for idx in range(len(split_idx) - 1):
            opts.append("".join(inps[split_idx[idx]:split_idx[idx + 1]]))
    else:
        opts = [inp]
    return "\n".join(opts)


# Split roughly every 50 characters
@register_method("cut2")
def cut2(inp, max_length=50):
    inp = split_long_sentence(inp).strip("\n")
    inps = split(inp)
    if len(inps) < 2:
        return inp
    opts = []
    summ = 0
    tmp_str = ""
    for i in range(len(inps)):
        summ += len(inps[i])
        tmp_str += inps[i]
        if summ > max_length:
            summ = 0
            opts.append(tmp_str)
            tmp_str = ""
    if tmp_str != "":
        opts.append(tmp_str)
    # print(opts)
    if len(opts) > 1 and len(opts[-1]) < 50:  # if the last chunk is too short, merge it into the previous one
        opts[-2] = opts[-2] + opts[-1]
        opts = opts[:-1]
    return "\n".join(opts)


# Split on the Chinese full stop 。
@register_method("cut3")
def cut3(inp):
    inp = split_long_sentence(inp).strip("\n")
    return "\n".join(["%s" % item for item in inp.strip("。").split("。")])


# Split on the English period .
@register_method("cut4")
def cut4(inp):
    inp = inp.strip("\n")
    return "\n".join(["%s" % item for item in inp.strip(".").split(".")])


# Split on any punctuation mark
# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
@register_method("cut5")
def cut5(inp):
    # if not re.search(r'[^\w\s]', inp[-1]):
    #     inp += '。'
    inp = inp.strip("\n")
    punds = r'[,.;?!、，。？！;：…]'
    items = re.split(f'({punds})', inp)
    mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
    # Keep the text intact when there is no punctuation at all, or none at the very end
    if len(items) % 2 == 1:
        mergeitems.append(items[-1])
    opt = "\n".join(mergeitems)
    return opt


def count_words_multilang(text):
    # Initialize the counters
    word_count = 0
    in_word = False

    for char in text:
        if char.isspace():  # whitespace ends any English word in progress
            in_word = False
        elif char.isascii() and not in_word:  # an ASCII (English) character starts a new word
            word_count += 1
            in_word = True
        elif not char.isascii():  # every non-ASCII (non-English) character counts as one word
            word_count += 1

    return word_count


# contributed by https://github.com/X-T-E-R/GPT-SoVITS-Inference/blob/main/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
@register_method("auto_cut")
def auto_cut(inp, max_length=30):
    # if not re.search(r'[^\w\s]', inp[-1]):
    #     inp += '。'
    inp = inp.strip("\n")
    inp = inp.replace(". ", "。")

    erase_punds = r'[“”"‘’\'()（）【】[\]{}<>《》〈〉〔〕〖〗〘〙〚〛〝〞〟]'
    inp = re.sub(erase_punds, '', inp)
    split_punds = r'[?!。？！~：]'
    if inp[-1] not in split_punds:
        inp += "。"
    items = re.split(f'({split_punds})', inp)
    items = ["".join(group) for group in zip(items[::2], items[1::2])]

    def process_commas(text, max_length):
        # Define the separators and the regular expression for splitting
        separators = [',', '，', '、', '——', '…']
        # The parentheses around the alternation form a capturing group, so the
        # separators are preserved in the split result
        regex_pattern = '(' + '|'.join(map(re.escape, separators)) + ')'
        # Because of the capturing group, re.split returns the separators as
        # elements of the result list as well
        sentences = re.split(regex_pattern, text)

        processed_text = ""
        current_line = ""

        final_sentences = []

        for sentence in sentences:
            if count_words_multilang(sentence) > max_length:
                final_sentences += split_long_sentence(sentence, max_length=max_length).split("\n")
            else:
                final_sentences.append(sentence)

        for sentence in final_sentences:
            if count_words_multilang(current_line + sentence) <= max_length:
                # If adding the next sentence does not exceed max_length, append it to the current line
                current_line += sentence
            else:
                # Otherwise the current line is full: flush it and start a new one
                processed_text += current_line.strip() + '\n'
                current_line = sentence + " "  # Start the new line with the current sentence

        # Add any remaining text in current_line to processed_text
        processed_text += current_line.strip()

        return processed_text

    final_items = []
    for item in items:
        final_items += process_commas(item, max_length=max_length).split("\n")

    final_items = [item for item in final_items if item.strip() and not (len(item.strip()) == 1 and item.strip() in "?!，,。？！~：")]

    return "\n".join(final_items)


if __name__ == '__main__':
    str1 = """我 有i一个j k 1"""
    print(count_words_multilang(str1))
    print(cut_sentence_multilang(str1, 20))
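
    # A minimal sketch of the registry API: look up a registered splitter by
    # name with get_method and apply it. The sample text here is illustrative
    # only and is not part of the original test data.
    demo_text = "这是第一句。这是第二句!And one English sentence, with a comma."
    for method_name in ("cut5", "auto_cut"):
        print(f"--- {method_name} ---")
        print(get_method(method_name)(demo_text))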