|
import re |
|
|
|
import cn2an |
|
|
|
from style_bert_vits2.nlp.symbols import PUNCTUATIONS |
|
|
|
|
|
# Maps full-width / CJK punctuation (plus a few ASCII symbols) onto the small
# set of marks kept after normalization. replace_punctuation() builds a regex
# alternation from the keys and substitutes each match with its value.
# Every key must be a distinct, correctly encoded character: duplicate keys in
# a dict literal silently overwrite one another.
__REPLACE_MAP = {
    # Clause and sentence separators.
    "：": ",",
    "；": ",",
    "，": ",",
    "。": ".",
    "！": "!",
    "？": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    # Quotation marks all collapse to an ASCII apostrophe.
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    # Brackets of every flavour also collapse to an ASCII apostrophe.
    "（": "'",
    "）": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    # Dashes and tildes collapse to an ASCII hyphen.
    "—": "-",
    "～": "-",
    "~": "-",
    # Corner brackets.
    "「": "'",
    "」": "'",
}
|
|
|
|
|
def normalize_text(text: str) -> str:
    """Spell out numbers as Chinese numerals, then normalize punctuation.

    Every integer or decimal literal found in ``text`` is replaced by the
    result of ``cn2an.an2cn`` and the outcome is passed through
    ``replace_punctuation``.

    Args:
        text: The raw input text.

    Returns:
        The normalized text.
    """
    converted = text
    # Matches are processed left to right and only the first remaining
    # occurrence is replaced each time, so each match is rewritten in place.
    for digits in re.findall(r"\d+(?:\.?\d+)?", text):
        converted = converted.replace(digits, cn2an.an2cn(digits), 1)
    return replace_punctuation(converted)
|
|
|
|
|
def replace_punctuation(text: str) -> str:
    """Map punctuation onto the supported set and drop everything else.

    Steps, in order:
      1. Replace the interjections "嗯" and "呣" with "恩" and "母".
      2. Substitute every key of ``__REPLACE_MAP`` found in the text with its
         mapped value.
      3. Remove every character that is neither in the CJK range
         U+4E00..U+9FA5 nor one of ``PUNCTUATIONS``.

    Args:
        text: The text to clean up.

    Returns:
        The text containing only CJK ideographs and supported punctuation.
    """
    text = text.replace("嗯", "恩").replace("呣", "母")

    # One alternation over all map keys; keys are escaped because several of
    # them ("$", "(", "[", "...") are regex metacharacters.
    pattern = re.compile("|".join(re.escape(p) for p in __REPLACE_MAP))
    replaced_text = pattern.sub(lambda x: __REPLACE_MAP[x.group()], text)

    # Escape each punctuation mark before splicing it into the character
    # class, so that "-", "]", "^" or "\\" can never be read as class syntax
    # regardless of their position in PUNCTUATIONS.
    allowed_punctuation = "".join(re.escape(p) for p in PUNCTUATIONS)
    replaced_text = re.sub(
        r"[^\u4e00-\u9fa5" + allowed_punctuation + r"]+", "", replaced_text
    )

    return replaced_text
|
|