Spaces:
Running
Running
| from modules.utils.zh_normalization.text_normlization import * | |
| character_map = { | |
| ":": ",", | |
| ";": ",", | |
| "!": "。", | |
| "(": ",", | |
| ")": ",", | |
| "【": ",", | |
| "】": ",", | |
| "『": ",", | |
| "』": ",", | |
| "「": ",", | |
| "」": ",", | |
| "《": ",", | |
| "》": ",", | |
| "-": ",", | |
| "‘": " ", | |
| "“": " ", | |
| "’": " ", | |
| "”": " ", | |
| ":": ",", | |
| ";": ",", | |
| "!": ".", | |
| "(": ",", | |
| ")": ",", | |
| # '[': ',', | |
| # ']': ',', | |
| ">": ",", | |
| "<": ",", | |
| "-": ",", | |
| } | |
# Multi-character symbol sequences that are read aloud as words.
# Keys are matched literally, including their surrounding spaces.
character_to_word = {" & ": " and "}


def apply_character_to_word(text):
    """Replace each key of ``character_to_word`` found in *text* by its value.

    Replacements are applied one after another in the dict's insertion order,
    each on the result of the previous one.
    """
    spoken = text
    for symbol, word in character_to_word.items():
        spoken = spoken.replace(symbol, word)
    return spoken
def apply_character_map(text):
    """Translate every character of *text* that is a key of ``character_map``.

    The translation table is rebuilt from ``character_map`` on each call, so
    the current contents of the dict are always used.
    """
    return text.translate(str.maketrans(character_map))
def insert_spaces_between_uppercase(s):
    """Insert a space at every boundary that involves an ASCII uppercase letter.

    A space is added between two adjacent uppercase letters, between a
    lowercase letter and a following uppercase letter, and between an
    uppercase letter and an adjacent CJK ideograph (U+4E00..U+9FA5) on
    either side. For example, "ChatTTS" becomes "Chat T T S".
    """
    # Each alternative is a zero-width match, so only spaces are inserted and
    # no existing character is consumed.
    boundary = (
        r"(?<=[A-Z])(?=[A-Z])"  # uppercase | uppercase
        r"|(?<=[a-z])(?=[A-Z])"  # lowercase | uppercase
        r"|(?<=[\u4e00-\u9fa5])(?=[A-Z])"  # CJK | uppercase
        r"|(?<=[A-Z])(?=[\u4e00-\u9fa5])"  # uppercase | CJK
    )
    return re.sub(boundary, " ", s)
def ensure_suffix(a: str, b: str, c: str):
    """Return *a* stripped, with *c* appended unless it already ends with *b*.

    Args:
        a: Text to check; leading and trailing whitespace is removed first.
        b: Suffix whose presence is tested.
        c: Text appended when the stripped text does not end with *b*.
    """
    stripped = a.strip()
    return stripped if stripped.endswith(b) else stripped + c
# Spoken replacements for well-known e-mail domains, keyed by the lower-case
# domain. The value replaces the whole domain, including its TLD.
email_domain_map = {
    "outlook.com": "Out look",
    "hotmail.com": "Hot mail",
    "yahoo.com": "雅虎",
}

# One capture group holding the complete address; it contains exactly one "@".
_EMAIL_PATTERN = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")


def _spell_out_email(match):
    """Return the spoken form of the e-mail address captured by *match*."""
    name, domain = match.group(1).split("@")
    # Separate the local part into single characters so each one is read out.
    name = " ".join(name)
    # Domain names are case-insensitive, so look them up in lower case.
    known_domain = email_domain_map.get(domain.lower())
    if known_domain is not None:
        domain = known_domain
    # Applied after the lookup, exactly as before: only matters for values
    # (or unknown domains) that still contain a ".".
    domain = domain.replace(".", " dot ")
    return f"{name} at {domain}"


def email_detect(text):
    """Rewrite every e-mail address in *text* into a form that can be read aloud.

    The local part is split into single characters, "@" becomes " at ", and
    the domain is either replaced by its entry in ``email_domain_map`` or has
    each "." replaced by " dot ". Dots in the local part are left as they are.

    Examples:
        zhzluke96@outlook.com => z h z l u k e 9 6 at Out look
        ab@example.org        => a b at example dot org
    """
    return _EMAIL_PATTERN.sub(_spell_out_email, text)
def pre_normalize(text):
    """Hook applied to a plain-text segment before normalization.

    Currently returns *text* unchanged.
    """
    # NOTE: spelling out e-mail addresses gave mediocre results, so the call
    # below stays disabled.
    # text = email_detect(text)
    return text
def post_normalize(text):
    """Apply the post-processing steps to one normalized sentence.

    The steps run in this order: spacing around uppercase letters,
    single-character punctuation mapping, then symbol-to-word replacement.
    """
    for step in (
        insert_spaces_between_uppercase,
        apply_character_map,
        apply_character_to_word,
    ):
        text = step(text)
    return text
def text_normalize(text, is_end=False):
    """Normalize *text* while passing ``[...]`` tokens through untouched.

    The input is split into bracketed tokens (e.g. "[uv_break]") and runs of
    text that contain no "[". Bracketed tokens are kept verbatim and padded
    with one space on each side; every other run goes through
    ``pre_normalize``, ``TextNormalizer.normalize`` and ``post_normalize``.

    Args:
        text: Raw input text.
        is_end: Currently unused; see the disabled code at the end.

    Returns:
        The normalized text.
    """
    # See: https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    normalizer = TextNormalizer()
    # Group 1: a bracketed token matching \[.+?\]; group 2: a run without "[".
    segment_re = re.compile(r"(\[.+?\])|([^[]+)")

    def normalize_plain(segment):
        sentences = normalizer.normalize(pre_normalize(segment))
        return "".join(post_normalize(sentence) for sentence in sentences)

    def substitute(match):
        token = match.group(1)
        if token:
            return f" {token} "
        return normalize_plain(match.group(2))

    normalized = segment_re.sub(substitute, text)
    # NOTE: the suffix below was meant to keep the final characters from being
    # swallowed, but adding it produced noise, so it stays disabled.
    # if is_end:
    #     normalized = ensure_suffix(normalized, "[uv_break]", "。。。[uv_break]。。。")
    return normalized
if __name__ == "__main__":
    # Manual smoke test: print the normalized form of a few sample inputs
    # (mixed Chinese/English text, bracketed control tokens, a percentage).
    demo_inputs = (
        "ChatTTS是专门为对话场景设计的文本转语音模型,例如LLM助手对话任务。它支持英文和中文两种语言。最大的模型使用了10万小时以上的中英文数据进行训练。在HuggingFace中开源的版本为4万小时训练且未SFT的版本.",
        " [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149",
        " 明天有62%的概率降雨",
    )
    for demo in demo_inputs:
        print(text_normalize(demo))