chattts

Running

chattts / modules /utils /zh_normalization /text_normlization.py

zhzluke96

update

d2b7e94 6 months ago

6.47 kB

	# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	import re
	from typing import List

	from .char_convert import tranditional_to_simplified
	from .chronology import (
	RE_DATE,
	RE_DATE2,
	RE_TIME,
	RE_TIME_RANGE,
	replace_date,
	replace_date2,
	replace_time,
	)
	from .constants import F2H_ASCII_LETTERS, F2H_DIGITS, F2H_SPACE
	from .num import (
	RE_DECIMAL_NUM,
	RE_DEFAULT_NUM,
	RE_FRAC,
	RE_INTEGER,
	RE_NUMBER,
	RE_PERCENTAGE,
	RE_POSITIVE_QUANTIFIERS,
	RE_RANGE,
	replace_default_num,
	replace_frac,
	replace_negative_num,
	replace_number,
	replace_percentage,
	replace_positive_quantifier,
	replace_range,
	)
	from .phonecode import (
	RE_MOBILE_PHONE,
	RE_NATIONAL_UNIFORM_NUMBER,
	RE_TELEPHONE,
	replace_mobile,
	replace_phone,
	)
	from .quantifier import RE_TEMPERATURE, replace_measure, replace_temperature


	class TextNormalizer:
	    """Split text into sentences and verbalize non-standard words (NSW).

	    Typical use is ``TextNormalizer().normalize(text, lang)``, which cuts the
	    text into sentences and passes each one through ``normalize_sentence``.
	    """

	    # Characters removed from the text before sentence splitting (only
	    # applied when ``lang == "zh"``).
	    _SPECIAL_CHARS = re.compile(r"[——《》【】<=>{}()（）#&@“”^_\|…\\]")

	    # Runs of line breaks; sentences are cut at these after the splitter has
	    # appended a line break to every sentence-ending mark.
	    _LINE_BREAKS = re.compile(r"\n+")

	    # Single-character substitutions applied at the end of
	    # ``normalize_sentence``.  Every key is one character and no replacement
	    # contains a key, so one ``str.translate`` pass yields the same result as
	    # replacing the characters one after another.
	    _POST_REPLACE_TABLE = str.maketrans(
	        {
	            # Circled digits -> Chinese numerals.
	            "①": "一",
	            "②": "二",
	            "③": "三",
	            "④": "四",
	            "⑤": "五",
	            "⑥": "六",
	            "⑦": "七",
	            "⑧": "八",
	            "⑨": "九",
	            "⑩": "十",
	            # Greek letters -> Chinese transliteration of the letter name.
	            "α": "阿尔法",
	            "β": "贝塔",
	            "γ": "伽玛",
	            "Γ": "伽玛",
	            "δ": "德尔塔",
	            "Δ": "德尔塔",
	            "ε": "艾普西龙",
	            "ζ": "捷塔",
	            "η": "依塔",
	            "θ": "西塔",
	            "Θ": "西塔",
	            "ι": "艾欧塔",
	            "κ": "喀帕",
	            "λ": "拉姆达",
	            "Λ": "拉姆达",
	            "μ": "缪",
	            "ν": "拗",
	            "ξ": "克西",
	            "Ξ": "克西",
	            "ο": "欧米克伦",
	            "π": "派",
	            "Π": "派",
	            "ρ": "肉",
	            "ς": "西格玛",
	            "Σ": "西格玛",
	            "σ": "西格玛",
	            "τ": "套",
	            "υ": "宇普西龙",
	            "φ": "服艾",
	            "Φ": "服艾",
	            "χ": "器",
	            "ψ": "普赛",
	            "Ψ": "普赛",
	            "ω": "欧米伽",
	            "Ω": "欧米伽",
	        }
	    )

	    def __init__(self):
	        # A sentence-ending punctuation mark, optionally followed by a
	        # closing quotation mark, captured so it can be kept in the output.
	        self.SENTENCE_SPLITOR = re.compile(r"([：、，；。？！,;?!][”’]?)")

	    def _split(self, text: str, lang="zh") -> List[str]:
	        """Split long text into sentences at sentence-splitting punctuation.

	        Args:
	            text (str): The input text.
	            lang (str): Language tag.  Only ``"zh"`` triggers the extra
	                cleanup (removing spaces and special characters); any other
	                value leaves the text untouched before splitting.

	        Returns:
	            List[str]: The sentences, each stripped of surrounding
	            whitespace and still ending with its punctuation mark.  Empty
	            or whitespace-only input yields ``[""]``.
	        """
	        # Only for pure Chinese here: spaces carry no meaning, and the
	        # special characters are filtered out.
	        if lang == "zh":
	            text = text.replace(" ", "")
	            text = self._SPECIAL_CHARS.sub("", text)

	        # Append a line break to every sentence-ending mark, then cut there.
	        text = self.SENTENCE_SPLITOR.sub(r"\1\n", text).strip()
	        return [piece.strip() for piece in self._LINE_BREAKS.split(text)]

	    def _post_replace(self, sentence: str) -> str:
	        """Replace circled digits and Greek letters with Chinese text.

	        Replacements for '/', '~' and '～' and a trailing special-character
	        filter (including '-') existed only as commented-out code and are
	        not applied.

	        Args:
	            sentence (str): A sentence, normally already number-verbalized.

	        Returns:
	            str: The sentence with every mapped character substituted.
	        """
	        return sentence.translate(self._POST_REPLACE_TABLE)

	    def normalize_sentence(self, sentence: str) -> str:
	        """Normalize one sentence.

	        The substitutions run in a fixed order; the order matters because
	        each pattern works on the output of the previous ones.

	        Args:
	            sentence (str): A single sentence, e.g. one item from ``_split``.

	        Returns:
	            str: The normalized sentence.
	        """
	        # Basic character conversions: traditional -> simplified Chinese,
	        # then the F2H_* translation tables (full-width to half-width,
	        # judging by their names).
	        sentence = tranditional_to_simplified(sentence)
	        sentence = (
	            sentence.translate(F2H_ASCII_LETTERS)
	            .translate(F2H_DIGITS)
	            .translate(F2H_SPACE)
	        )

	        # Number-related NSW verbalization: dates first.
	        sentence = RE_DATE.sub(replace_date, sentence)
	        sentence = RE_DATE2.sub(replace_date2, sentence)

	        # Time ranges must be handled before single times.
	        sentence = RE_TIME_RANGE.sub(replace_time, sentence)
	        sentence = RE_TIME.sub(replace_time, sentence)

	        sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
	        sentence = replace_measure(sentence)
	        sentence = RE_FRAC.sub(replace_frac, sentence)
	        sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)

	        # Phone numbers.
	        sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence)
	        sentence = RE_TELEPHONE.sub(replace_phone, sentence)
	        sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence)

	        # Remaining numeric forms; RE_NUMBER runs last of the number rules.
	        sentence = RE_RANGE.sub(replace_range, sentence)
	        sentence = RE_INTEGER.sub(replace_negative_num, sentence)
	        sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
	        sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, sentence)
	        sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
	        sentence = RE_NUMBER.sub(replace_number, sentence)

	        # Finally map circled digits and Greek letters.
	        return self._post_replace(sentence)

	    def normalize(self, text: str, lang="") -> List[str]:
	        """Split ``text`` into sentences and normalize each of them.

	        Args:
	            text (str): The input text.
	            lang (str): Language tag forwarded to ``_split``.  Note that the
	                default here is ``""``, not ``"zh"``, so spaces and special
	                characters are kept unless ``lang="zh"`` is passed.

	        Returns:
	            List[str]: The normalized sentences, in input order.
	        """
	        return [self.normalize_sentence(part) for part in self._split(text, lang)]