# Page header captured together with this file (not part of the module):
# Spaces: Runtime error | File size: 2,477 Bytes | d358e26
import re
from pypinyin import lazy_pinyin, Style
from .custom_pypinyin_dict import phrase_pinyin_data
phrase_pinyin_data.load()
import jieba
from .cn2an import an2cn
# Punctuation normalization: map full-width / CJK punctuation (and a few ASCII
# characters) onto plain ASCII punctuation.
# NOTE(review): the non-ASCII keys below were restored from mis-encoded
# (mojibake) text; confirm them against the upstream copy of this table.
punc_map = {
    "：": ",",
    "；": ",",
    "，": ",",
    "。": ".",
    "！": "!",
    "？": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "$": ".",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "（": "'",
    "）": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "～": "~",
    "「": "'",
    "」": "'",
    "『": "'",
    "』": "'",
}
# str.maketrans requires every string key to be exactly one character long,
# which is why each key above must be a single (correctly decoded) character.
punc_table = str.maketrans(punc_map)
# Number normalization: matches a run of digits, optionally followed by a
# fractional part (an optional "." and more digits).
number_pattern = re.compile(r'\d+(?:\.?\d+)?')
def replace_number(match):
    """Substitution callback: convert the matched number text with ``an2cn``.

    Args:
        match: Regex match object; its full matched text is passed to ``an2cn``.

    Returns:
        Whatever ``an2cn`` returns for the matched text.
    """
    matched_text = match.group(0)
    return an2cn(matched_text)
def normalize_number(text):
    """Replace every match of ``number_pattern`` in ``text`` via ``replace_number``.

    Args:
        text: Input string to scan for numbers.

    Returns:
        The string with each matched number substituted.
    """
    normalized = re.sub(number_pattern, replace_number, text)
    return normalized
# get symbols of phones
def load_pinyin_symbols(path):
    """Load a pinyin-to-phones table and print every toned phone symbol.

    Each non-blank line of the file is read as comma-separated fields: the
    first field is the pinyin key and the second field, split on single
    spaces, is its list of phones.

    As a side effect, prints the list of all distinct phones suffixed with
    each tone digit 1-5, ordered by length and then alphabetically so the
    output is the same on every run.

    Args:
        path: Path of the UTF-8 encoded table file.

    Returns:
        A dict mapping each pinyin key to its list of phone strings.

    Raises:
        IndexError: If a non-blank line contains no comma.
    """
    pinyin_dict = {}
    unique_phones = set()
    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no entry;
                # indexing the second field on them would raise IndexError.
                continue
            cuts = line.split(',')
            phones = cuts[1].split(' ')
            pinyin_dict[cuts[0]] = phones
            unique_phones.update(phones)
    toned = [phone + str(tone) for phone in unique_phones for tone in range(1, 6)]
    # Sort on (length, text): sorting on length alone leaves ties in set
    # iteration order, which varies between runs with string hash randomization.
    print(sorted(toned, key=lambda symbol: (len(symbol), symbol)))
    return pinyin_dict
def load_pinyin_dict(path):
    """Load a pinyin-to-phones table from a comma-separated text file.

    Each non-blank line is split at its first comma: the text before it is
    the key, and the text after it, split on whitespace, is the phone list.

    Args:
        path: Path of the UTF-8 encoded table file.

    Returns:
        A dict mapping each key to its list of phone strings.

    Raises:
        ValueError: If a non-blank line contains no comma.
    """
    pinyin_dict = {}
    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) would otherwise
                # fail the two-value unpacking below with ValueError.
                continue
            key, value = line.split(',', 1)
            pinyin_dict[key] = value.split()
    return pinyin_dict
# Syllable -> phone-list table, loaded once when this module is imported.
# NOTE(review): the path is relative, so it is resolved against the current
# working directory at import time — confirm callers run from the expected
# directory.
pinyin_dict = load_pinyin_dict('text/cnm3/ds_CNM3.txt')
def chinese_to_cnm3(text: str):
    """Convert text into a flat list of tone-suffixed phone symbols.

    Steps: translate punctuation through ``punc_table``, rewrite numbers with
    ``normalize_number``, segment the text with ``jieba.lcut``, convert each
    word with ``lazy_pinyin`` (TONE3 style, neutral tone written as 5), then
    expand every syllable through the module-level ``pinyin_dict``.

    Args:
        text: Input text.

    Returns:
        A list of strings. An item ending in a digit (e.g. ``"ni3"``) is
        treated as syllable + tone and contributes each of the syllable's
        phones with the tone digit appended. An item ending in a letter is
        dropped. Any other item (presumably punctuation or whitespace passed
        through by ``lazy_pinyin``) contributes its characters one by one.

    Raises:
        KeyError: If a syllable is missing from ``pinyin_dict``.
    """
    text = text.translate(punc_table)
    text = normalize_number(text)
    phones = []
    for word in jieba.lcut(text, cut_all=False):
        for pinyin in lazy_pinyin(word, style=Style.TONE3, neutral_tone_with_five=True):
            if not pinyin:
                # Nothing to classify; indexing pinyin[-1] would raise IndexError.
                continue
            last_char = pinyin[-1]
            if last_char.isdigit():
                # Trailing digit is the tone; the rest is the syllable key.
                syllable = pinyin[:-1]
                phones.extend([ph + last_char for ph in pinyin_dict[syllable]])
            elif not last_char.isalpha():
                # Extending a list with a string appends its characters individually.
                phones.extend(pinyin)
    return phones