|
import re
from functools import lru_cache

import emojiswitch

from modules import models
from modules.utils.markdown import markdown_to_text

# NOTE: "text_normlization" is the module's actual (misspelled) filename.
from modules.utils.zh_normalization.text_normlization import TextNormalizer

# When True, skip the tokenizer-vocabulary check in replace_unk_tokens.
DISABLE_UNK_TOKEN_CHECK = False


@lru_cache(maxsize=64)
def is_chinese(text):
    # True if the text contains at least one CJK unified ideograph.
    chinese_pattern = re.compile(r"[\u4e00-\u9fff]")
    return bool(chinese_pattern.search(text))


@lru_cache(maxsize=64)
def is_eng(text):
    # True if the text contains at least one ASCII letter.
    eng_pattern = re.compile(r"[a-zA-Z]")
    return bool(eng_pattern.search(text))


@lru_cache(maxsize=64)
def guess_lang(text):
    if is_chinese(text):
        return "zh"
    if is_eng(text):
        return "en"
    # Default to Chinese when neither script is detected.
    return "zh"


# post steps run on each sentence after the Chinese TextNormalizer pass;
# pre steps run on the raw input text before any other processing.
post_normalize_pipeline = []
pre_normalize_pipeline = []


def post_normalize():
    """Register the decorated function as a post-normalization step."""

    def decorator(func):
        post_normalize_pipeline.append(func)
        return func

    return decorator


def pre_normalize():
    """Register the decorated function as a pre-normalization step."""

    def decorator(func):
        pre_normalize_pipeline.append(func)
        return func

    return decorator


def apply_pre_normalize(text):
    for func in pre_normalize_pipeline:
        text = func(text)
    return text


def apply_post_normalize(text):
    for func in post_normalize_pipeline:
        text = func(text)
    return text
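

# A minimal sketch of extending the pipeline (strip_bom is a hypothetical
# step; the built-in steps are registered further down in this module):
#
#   @pre_normalize()
#   def strip_bom(text):
#       return text.lstrip("\ufeff")
#
#   apply_pre_normalize("\ufeffhello")  # runs strip_bom plus any other steps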


def is_markdown(text):
    # Heuristic: treat the text as Markdown if any common construct matches.
    markdown_patterns = [
        r"(^|\s)#[^#]",  # heading
        r"\*\*.*?\*\*",  # bold
        r"\*.*?\*",  # italic
        r"!\[.*?\]\(.*?\)",  # image
        r"\[.*?\]\(.*?\)",  # link
        r"`[^`]+`",  # inline code
        r"```[\s\S]*?```",  # fenced code block
        r"(^|\s)\* ",  # unordered list
        r"(^|\s)\d+\. ",  # ordered list
        r"(^|\s)> ",  # blockquote
        r"(^|\s)---",  # horizontal rule
    ]

    for pattern in markdown_patterns:
        if re.search(pattern, text, re.MULTILINE):
            return True

    return False
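

# For example, is_markdown("# title") and is_markdown("`code`") are True,
# while is_markdown("plain prose") is False.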


# Punctuation remapping: bracketing and stop marks become pause-like commas
# or periods; quotes become plain spaces.
character_map = {
    # full-width punctuation
    ":": ",",
    ";": ",",
    "!": "。",
    "(": ",",
    ")": ",",
    "【": ",",
    "】": ",",
    "『": ",",
    "』": ",",
    "「": ",",
    "」": ",",
    "《": ",",
    "》": ",",
    "-": ",",  # full-width hyphen (U+FF0D)
    # quotes (full- and half-width)
    "‘": " ",
    "“": " ",
    "’": " ",
    "”": " ",
    '"': " ",
    "'": " ",
    # half-width punctuation
    ":": ",",
    ";": ",",
    "!": ".",
    "(": ",",
    ")": ",",
    "[": ",",
    "]": ",",
    ">": ",",
    "<": ",",
    "-": ",",
    "~": " ",  # full-width tilde (U+FF5E)
    "~": " ",
    "/": " ",
}


# Multi-character replacements, applied before the single-character map.
character_to_word = {
    " & ": " and ",
}


@post_normalize()
def apply_character_to_word(text):
    for k, v in character_to_word.items():
        text = text.replace(k, v)
    return text


@post_normalize()
def apply_character_map(text):
    # str.maketrans requires single-character keys, which character_map has.
    translation_table = str.maketrans(character_map)
    return text.translate(translation_table)
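

# For example:
#
#   apply_character_map("你好!(世界)")  # -> "你好。,世界,"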


@post_normalize()
def apply_emoji_map(text):
    lang = guess_lang(text)
    # Spell out emoji in the detected language, with no delimiters around
    # the inserted words.
    return emojiswitch.demojize(text, delimiters=("", ""), lang=lang)
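

# For example, apply_emoji_map("大🍌") inlines the Chinese name of 🍌 (the
# exact wording comes from emojiswitch's lookup tables).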


@post_normalize()
def insert_spaces_between_uppercase(s):
    # Insert a space between consecutive capitals, between a lowercase letter
    # and a capital, and at CJK/capital boundaries. Note the narrower CJK
    # range here ([\u4e00-\u9fa5]) versus the one used in is_chinese.
    return re.sub(
        r"(?<=[A-Z])(?=[A-Z])|(?<=[a-z])(?=[A-Z])|(?<=[\u4e00-\u9fa5])(?=[A-Z])|(?<=[A-Z])(?=[\u4e00-\u9fa5])",
        " ",
        s,
    )
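

# For example:
#
#   insert_spaces_between_uppercase("ChatTTS")   # -> "Chat T T S"
#   insert_spaces_between_uppercase("微软Azure")  # -> "微软 Azure"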


@post_normalize()
def replace_unk_tokens(text):
    """
    Replace characters missing from the tokenizer vocabulary with " , ".
    """
    if DISABLE_UNK_TOKEN_CHECK:
        return text
    chat_tts = models.load_chat_tts()
    if "tokenizer" not in chat_tts.pretrain_models:
        # No tokenizer available (e.g. the model is not fully loaded yet),
        # so pass the text through unchanged.
        return text
    tokenizer = chat_tts.pretrain_models["tokenizer"]
    vocab = tokenizer.get_vocab()
    vocab_set = set(vocab.keys())
    # Latin letters and whitespace always count as known tokens.
    vocab_set.update(set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"))
    vocab_set.update(set(" \n\r\t"))
    replaced_chars = [char if char in vocab_set else " , " for char in text]
    output_text = "".join(replaced_chars)
    return output_text


@pre_normalize()
def apply_markdown_to_text(text):
    if is_markdown(text):
        text = markdown_to_text(text)
    return text


@pre_normalize()
def replace_quotes(text):
    # Put each quoted span on its own line; text_normalize later splits on
    # newlines, so quoted speech is normalized in isolation.
    repl = r"\n\1\n"
    patterns = [
        ['"', '"'],
        ["'", "'"],
        ["“", "”"],
        ["‘", "’"],
    ]
    for p in patterns:
        text = re.sub(rf"({p[0]}[^{p[0]}{p[1]}]+?{p[1]})", repl, text)
    return text
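

# For example, replace_quotes("他说“你好”。") returns "他说\n“你好”\n。".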


def ensure_suffix(a: str, b: str, c: str):
    """Strip a, then append c unless a already ends with b."""
    a = a.strip()
    if not a.endswith(b):
        a += c
    return a
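

# For example, ensure_suffix("你好", "。", "。") returns "你好。", while
# ensure_suffix("你好。", "。", "。") returns the text unchanged.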


# Spoken forms for common email domains ("雅虎" is Yahoo's Chinese name).
email_domain_map = {
    "outlook.com": "Out look",
    "hotmail.com": "Hot mail",
    "yahoo.com": "雅虎",
}


def email_detect(text):
    email_pattern = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")

    def replace(match):
        email = match.group(1)
        name, domain = email.split("@")
        # Spell the local part letter by letter.
        name = " ".join(name)
        if domain in email_domain_map:
            domain = email_domain_map[domain]
        domain = domain.replace(".", " dot ")
        return f"{name} at {domain}"

    return email_pattern.sub(replace, text)
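

# For example:
#
#   email_detect("bob@yahoo.com")  # -> "b o b at 雅虎"
#   email_detect("a@foo.org")      # -> "a at foo dot org"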


def sentence_normalize(sentence_text: str):
    tx = TextNormalizer()

    # Match either a control token such as "[lbreak]" (group 1) or a run of
    # ordinary text (group 2).
    pattern = re.compile(r"(\[.+?\])|([^[]+)")

    def normalize_part(part):
        # TextNormalizer targets Chinese; other text passes through as-is.
        sentences = tx.normalize(part) if guess_lang(part) == "zh" else [part]
        dest_text = ""
        for sentence in sentences:
            sentence = apply_post_normalize(sentence)
            dest_text += sentence
        return dest_text

    def replace(match):
        if match.group(1):
            # Preserve control tokens verbatim, padded with spaces.
            return f" {match.group(1)} "
        else:
            return normalize_part(match.group(2))

    result = pattern.sub(replace, sentence_text)
    return result
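

# For example, in sentence_normalize("编号27149 [lbreak]") the digits are
# verbalized by TextNormalizer and the post steps above, while " [lbreak] "
# is passed through verbatim.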


def text_normalize(text, is_end=False):
    # NOTE: is_end is currently unused.
    text = apply_pre_normalize(text)
    lines = text.split("\n")
    lines = [line.strip() for line in lines]
    lines = [line for line in lines if line]
    lines = [sentence_normalize(line) for line in lines]
    content = "\n".join(lines)
    return content


if __name__ == "__main__":
    test_cases = [
        "ChatTTS是专门为对话场景设计的文本转语音模型,例如LLM助手对话任务。它支持英文和中文两种语言。最大的模型使用了10万小时以上的中英文数据进行训练。在HuggingFace中开源的版本为4万小时训练且未SFT的版本.",
        " [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149",
        " 明天有62%的概率降雨",
        "大🍌,一条大🍌,嘿,你的感觉真的很奇妙 [lbreak]",
        """
# 你好,世界
```js
console.log('1')
```
**加粗**

*一条文本*
""",
        """
在沙漠、岩石、雪地上行走了很长的时间以后,小王子终于发现了一条大路。所有的大路都是通往人住的地方的。
“你们好。”小王子说。
这是一个玫瑰盛开的花园。
“你好。”玫瑰花说道。
小王子瞅着这些花,它们全都和他的那朵花一样。
“你们是什么花?”小王子惊奇地问。
“我们是玫瑰花。”花儿们说道。
“啊!”小王子说……。
""",
        """
State-of-the-art Machine Learning for PyTorch, TensorFlow, and JAX.

🤗 Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. These models support common tasks in different modalities, such as:

📝 Natural Language Processing: text classification, named entity recognition, question answering, language modeling, summarization, translation, multiple choice, and text generation.
🖼️ Computer Vision: image classification, object detection, and segmentation.
🗣️ Audio: automatic speech recognition and audio classification.
🐙 Multimodal: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.
""",
        """
120米
有12%的概率会下雨
""",
    ]

    for i, test_case in enumerate(test_cases):
        print(f"case {i}:\n", {"x": text_normalize(test_case, is_end=True)})
|