Spaces:
Sleeping
Sleeping
from functools import lru_cache | |
from modules.utils.zh_normalization.text_normlization import * | |
import emojiswitch | |
from modules.utils.markdown import markdown_to_text | |
from modules import models | |
import re | |
# 是否关闭 unk token 检查 | |
# NOTE: 单测的时候用于跳过模型加载 | |
DISABLE_UNK_TOKEN_CHECK = False | |
def is_chinese(text): | |
# 中文字符的 Unicode 范围是 \u4e00-\u9fff | |
chinese_pattern = re.compile(r"[\u4e00-\u9fff]") | |
return bool(chinese_pattern.search(text)) | |
def is_eng(text): | |
eng_pattern = re.compile(r"[a-zA-Z]") | |
return bool(eng_pattern.search(text)) | |
def guess_lang(text): | |
if is_chinese(text): | |
return "zh" | |
if is_eng(text): | |
return "en" | |
return "zh" | |
post_normalize_pipeline = [] | |
pre_normalize_pipeline = [] | |
def post_normalize(): | |
def decorator(func): | |
post_normalize_pipeline.append(func) | |
return func | |
return decorator | |
def pre_normalize(): | |
def decorator(func): | |
pre_normalize_pipeline.append(func) | |
return func | |
return decorator | |
def apply_pre_normalize(text): | |
for func in pre_normalize_pipeline: | |
text = func(text) | |
return text | |
def apply_post_normalize(text): | |
for func in post_normalize_pipeline: | |
text = func(text) | |
return text | |
def is_markdown(text): | |
markdown_patterns = [ | |
r"(^|\s)#[^#]", # 标题 | |
r"\*\*.*?\*\*", # 加粗 | |
r"\*.*?\*", # 斜体 | |
r"!\[.*?\]\(.*?\)", # 图片 | |
r"\[.*?\]\(.*?\)", # 链接 | |
r"`[^`]+`", # 行内代码 | |
r"```[\s\S]*?```", # 代码块 | |
r"(^|\s)\* ", # 无序列表 | |
r"(^|\s)\d+\. ", # 有序列表 | |
r"(^|\s)> ", # 引用 | |
r"(^|\s)---", # 分隔线 | |
] | |
for pattern in markdown_patterns: | |
if re.search(pattern, text, re.MULTILINE): | |
return True | |
return False | |
character_map = { | |
":": ",", | |
";": ",", | |
"!": "。", | |
"(": ",", | |
")": ",", | |
"【": ",", | |
"】": ",", | |
"『": ",", | |
"』": ",", | |
"「": ",", | |
"」": ",", | |
"《": ",", | |
"》": ",", | |
"-": ",", | |
"‘": " ", | |
"“": " ", | |
"’": " ", | |
"”": " ", | |
'"': " ", | |
"'": " ", | |
":": ",", | |
";": ",", | |
"!": ".", | |
"(": ",", | |
")": ",", | |
"[": ",", | |
"]": ",", | |
">": ",", | |
"<": ",", | |
"-": ",", | |
"~": " ", | |
"~": " ", | |
"/": " ", | |
} | |
character_to_word = { | |
" & ": " and ", | |
} | |
## ---------- post normalize ---------- | |
def apply_character_to_word(text): | |
for k, v in character_to_word.items(): | |
text = text.replace(k, v) | |
return text | |
def apply_character_map(text): | |
translation_table = str.maketrans(character_map) | |
return text.translate(translation_table) | |
def apply_emoji_map(text): | |
lang = guess_lang(text) | |
return emojiswitch.demojize(text, delimiters=("", ""), lang=lang) | |
def insert_spaces_between_uppercase(s): | |
# 使用正则表达式在每个相邻的大写字母之间插入空格 | |
return re.sub( | |
r"(?<=[A-Z])(?=[A-Z])|(?<=[a-z])(?=[A-Z])|(?<=[\u4e00-\u9fa5])(?=[A-Z])|(?<=[A-Z])(?=[\u4e00-\u9fa5])", | |
" ", | |
s, | |
) | |
def replace_unk_tokens(text): | |
""" | |
把不在字典里的字符替换为 " , " | |
""" | |
if DISABLE_UNK_TOKEN_CHECK: | |
return text | |
chat_tts = models.load_chat_tts() | |
if "tokenizer" not in chat_tts.pretrain_models: | |
# 这个地方只有在 huggingface spaces 中才会触发 | |
# 因为 hugggingface 自动处理模型卸载加载,所以如果拿不到就算了... | |
return text | |
tokenizer = chat_tts.pretrain_models["tokenizer"] | |
vocab = tokenizer.get_vocab() | |
vocab_set = set(vocab.keys()) | |
# 添加所有英语字符 | |
vocab_set.update(set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")) | |
vocab_set.update(set(" \n\r\t")) | |
replaced_chars = [char if char in vocab_set else " , " for char in text] | |
output_text = "".join(replaced_chars) | |
return output_text | |
## ---------- pre normalize ---------- | |
def apply_markdown_to_text(text): | |
if is_markdown(text): | |
text = markdown_to_text(text) | |
return text | |
# 将 "xxx" => \nxxx\n | |
# 将 'xxx' => \nxxx\n | |
def replace_quotes(text): | |
repl = r"\n\1\n" | |
patterns = [ | |
['"', '"'], | |
["'", "'"], | |
["“", "”"], | |
["‘", "’"], | |
] | |
for p in patterns: | |
text = re.sub(rf"({p[0]}[^{p[0]}{p[1]}]+?{p[1]})", repl, text) | |
return text | |
def ensure_suffix(a: str, b: str, c: str): | |
a = a.strip() | |
if not a.endswith(b): | |
a += c | |
return a | |
email_domain_map = { | |
"outlook.com": "Out look", | |
"hotmail.com": "Hot mail", | |
"yahoo.com": "雅虎", | |
} | |
# 找到所有 email 并将 name 分割为单个字母,@替换为 at ,. 替换为 dot,常见域名替换为单词 | |
# | |
# 例如: | |
# zhzluke96@outlook.com => z h z l u k e 9 6 at out look dot com | |
def email_detect(text): | |
email_pattern = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})") | |
def replace(match): | |
email = match.group(1) | |
name, domain = email.split("@") | |
name = " ".join(name) | |
if domain in email_domain_map: | |
domain = email_domain_map[domain] | |
domain = domain.replace(".", " dot ") | |
return f"{name} at {domain}" | |
return email_pattern.sub(replace, text) | |
def sentence_normalize(sentence_text: str): | |
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization | |
tx = TextNormalizer() | |
# 匹配 \[.+?\] 的部分 | |
pattern = re.compile(r"(\[.+?\])|([^[]+)") | |
def normalize_part(part): | |
sentences = tx.normalize(part) if guess_lang(part) == "zh" else [part] | |
dest_text = "" | |
for sentence in sentences: | |
sentence = apply_post_normalize(sentence) | |
dest_text += sentence | |
return dest_text | |
def replace(match): | |
if match.group(1): | |
return f" {match.group(1)} " | |
else: | |
return normalize_part(match.group(2)) | |
result = pattern.sub(replace, sentence_text) | |
# NOTE: 加了会有杂音... | |
# if is_end: | |
# 加这个是为了防止吞字 | |
# result = ensure_suffix(result, "[uv_break]", "。。。[uv_break]。。。") | |
return result | |
def text_normalize(text, is_end=False): | |
text = apply_pre_normalize(text) | |
lines = text.split("\n") | |
lines = [line.strip() for line in lines] | |
lines = [line for line in lines if line] | |
lines = [sentence_normalize(line) for line in lines] | |
content = "\n".join(lines) | |
return content | |
if __name__ == "__main__": | |
test_cases = [ | |
"ChatTTS是专门为对话场景设计的文本转语音模型,例如LLM助手对话任务。它支持英文和中文两种语言。最大的模型使用了10万小时以上的中英文数据进行训练。在HuggingFace中开源的版本为4万小时训练且未SFT的版本.", | |
" [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149", | |
" 明天有62%的概率降雨", | |
"大🍌,一条大🍌,嘿,你的感觉真的很奇妙 [lbreak]", | |
""" | |
# 你好,世界 | |
```js | |
console.log('1') | |
``` | |
**加粗** | |
*一条文本* | |
""", | |
""" | |
在沙漠、岩石、雪地上行走了很长的时间以后,小王子终于发现了一条大路。所有的大路都是通往人住的地方的。 | |
“你们好。”小王子说。 | |
这是一个玫瑰盛开的花园。 | |
“你好。”玫瑰花说道。 | |
小王子瞅着这些花,它们全都和他的那朵花一样。 | |
“你们是什么花?”小王子惊奇地问。 | |
“我们是玫瑰花。”花儿们说道。 | |
“啊!”小王子说……。 | |
""", | |
""" | |
State-of-the-art Machine Learning for PyTorch, TensorFlow, and JAX. | |
🤗 Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. These models support common tasks in different modalities, such as: | |
📝 Natural Language Processing: text classification, named entity recognition, question answering, language modeling, summarization, translation, multiple choice, and text generation. | |
🖼️ Computer Vision: image classification, object detection, and segmentation. | |
🗣️ Audio: automatic speech recognition and audio classification. | |
🐙 Multimodal: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering. | |
""", | |
""" | |
120米 | |
有12%的概率会下雨 | |
""", | |
] | |
for i, test_case in enumerate(test_cases): | |
print(f"case {i}:\n", {"x": text_normalize(test_case, is_end=True)}) | |