Spaces:
Runtime error
Runtime error
import os | |
import re | |
import sys | |
import jieba | |
import cn2an | |
import logging | |
from pypinyin import lazy_pinyin, BOPOMOFO | |
# logging.getLogger('jieba').setLevel(logging.WARNING) | |
# jieba.set_dictionary(os.path.dirname(sys.argv[0]) + '/jieba/dict.txt') | |
# Ordered (compiled pattern, bopomofo spelling) pairs, one per Latin letter.
# Each pattern is the bare letter compiled case-insensitively, so upper- and
# lower-case input receive the same bopomofo spelling.
_latin_to_bopomofo = [
    (re.compile(letter, re.IGNORECASE), spelling)
    for letter, spelling in (
        ('a', 'ㄟˉ'),
        ('b', 'ㄅㄧˋ'),
        ('c', 'ㄙㄧˉ'),
        ('d', 'ㄉㄧˋ'),
        ('e', 'ㄧˋ'),
        ('f', 'ㄝˊㄈㄨˋ'),
        ('g', 'ㄐㄧˋ'),
        ('h', 'ㄝˇㄑㄩˋ'),
        ('i', 'ㄞˋ'),
        ('j', 'ㄐㄟˋ'),
        ('k', 'ㄎㄟˋ'),
        ('l', 'ㄝˊㄛˋ'),
        ('m', 'ㄝˊㄇㄨˋ'),
        ('n', 'ㄣˉ'),
        ('o', 'ㄡˉ'),
        ('p', 'ㄆㄧˉ'),
        ('q', 'ㄎㄧㄡˉ'),
        ('r', 'ㄚˋ'),
        ('s', 'ㄝˊㄙˋ'),
        ('t', 'ㄊㄧˋ'),
        ('u', 'ㄧㄡˉ'),
        ('v', 'ㄨㄧˉ'),
        ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
        ('x', 'ㄝˉㄎㄨˋㄙˋ'),
        ('y', 'ㄨㄞˋ'),
        ('z', 'ㄗㄟˋ'),
    )
]
# Ordered (compiled pattern, romaji replacement) pairs.
#
# Order is significant: multi-symbol sequences (e.g. 'ㄅㄛ', 'ㄧㄢ', 'ㄨㄥ')
# are listed before the single symbols they contain so that they are
# rewritten first when the pairs are applied one after another.
#
# Keys are literal text, not regex syntax, so they are passed through
# re.escape() before compiling.  The punctuation keys are the full-width CJK
# forms: an ASCII '?' key is not a valid regular expression (re.compile('?')
# raises re.error), and ASCII ',' / '!' keys would map to themselves.
_bopomofo_to_romaji = [(re.compile(re.escape(x[0])), x[1]) for x in [
    ('ㄅㄛ', 'p⁼wo'),
    ('ㄆㄛ', 'pʰwo'),
    ('ㄇㄛ', 'mwo'),
    ('ㄈㄛ', 'fwo'),
    ('ㄅ', 'p⁼'),
    ('ㄆ', 'pʰ'),
    ('ㄇ', 'm'),
    ('ㄈ', 'f'),
    ('ㄉ', 't⁼'),
    ('ㄊ', 'tʰ'),
    ('ㄋ', 'n'),
    ('ㄌ', 'l'),
    ('ㄍ', 'k⁼'),
    ('ㄎ', 'kʰ'),
    ('ㄏ', 'h'),
    ('ㄐ', 'ʧ⁼'),
    ('ㄑ', 'ʧʰ'),
    ('ㄒ', 'ʃ'),
    ('ㄓ', 'ʦ`⁼'),
    ('ㄔ', 'ʦ`ʰ'),
    ('ㄕ', 's`'),
    ('ㄖ', 'ɹ`'),
    ('ㄗ', 'ʦ⁼'),
    ('ㄘ', 'ʦʰ'),
    ('ㄙ', 's'),
    ('ㄚ', 'a'),
    ('ㄛ', 'o'),
    ('ㄜ', 'ə'),
    ('ㄝ', 'e'),
    ('ㄞ', 'ai'),
    ('ㄟ', 'ei'),
    ('ㄠ', 'au'),
    ('ㄡ', 'ou'),
    ('ㄧㄢ', 'yeNN'),
    ('ㄢ', 'aNN'),
    ('ㄧㄣ', 'iNN'),
    ('ㄣ', 'əNN'),
    ('ㄤ', 'aNg'),
    ('ㄧㄥ', 'iNg'),
    ('ㄨㄥ', 'uNg'),
    ('ㄩㄥ', 'yuNg'),
    ('ㄥ', 'əNg'),
    ('ㄦ', 'əɻ'),
    ('ㄧ', 'i'),
    ('ㄨ', 'u'),
    ('ㄩ', 'ɥ'),
    # Tone marks become arrow symbols; '˙' is dropped.
    ('ˉ', '→'),
    ('ˊ', '↑'),
    ('ˇ', '↓↑'),
    ('ˋ', '↓'),
    ('˙', ''),
    # Full-width / CJK punctuation becomes its ASCII counterpart.
    ('，', ','),
    ('。', '.'),
    ('！', '!'),
    ('？', '?'),
    ('—', '-'),
]]
# Ordered (compiled pattern, IPA replacement) pairs.  Patterns are compiled
# case-insensitively.  The three '...y' entries come before the bare 'y'
# entry so they are rewritten first when the pairs are applied in order.
_romaji_to_ipa = [
    (re.compile(romaji, re.IGNORECASE), ipa)
    for romaji, ipa in (
        ('ʃy', 'ʃ'),
        ('ʧʰy', 'ʧʰ'),
        ('ʧ⁼y', 'ʧ⁼'),
        ('NN', 'n'),
        ('Ng', 'ŋ'),
        ('y', 'j'),
        ('h', 'x'),
    )
]
def number_to_chinese(text):
    """Return ``text`` with each Arabic number rewritten by ``cn2an.an2cn``.

    A number is a run of digits, optionally followed by more digits with an
    optional '.' in between (so both ``12`` and ``3.14`` are single numbers).
    Text containing no digits is returned unchanged.
    """
    def _spell_out(match):
        # Hand the matched numeral text straight to cn2an.
        return cn2an.an2cn(match.group(0))

    return re.sub(r'\d+(?:\.?\d+)?', _spell_out, text)
def chinese_to_bopomofo(text):
    """Convert the Chinese words in ``text`` to bopomofo.

    The text is first normalised by replacing '、', ';' and ':' with ','.
    It is then segmented with ``jieba.lcut``.  Tokens that contain no
    character in U+4E00-U+9FFF are copied to the output unchanged.  Every
    other token is converted with ``lazy_pinyin(..., BOPOMOFO)``; a single
    space is inserted before it unless the output is still empty.

    A syllable whose last character is a bopomofo letter (U+3105-U+3129),
    i.e. one that carries no trailing tone mark, gets 'ˉ' appended
    (presumably the first tone, which pypinyin leaves unmarked -- confirm
    against the pypinyin docs).

    Returns the converted string.
    """
    text = text.replace('、', ',').replace(';', ',').replace(':', ',')
    result = ''
    for word in jieba.lcut(text, cut_all=False):
        # Check first: tokens without Chinese characters never need a
        # pinyin conversion.
        if not re.search('[\u4e00-\u9fff]', word):
            result += word
            continue
        # ``[-1:]`` rather than ``[-1]`` so an empty syllable cannot raise
        # IndexError; it simply does not match and is left as is.
        syllables = [
            syllable + 'ˉ'
            if re.match('[\u3105-\u3129]', syllable[-1:])
            else syllable
            for syllable in lazy_pinyin(word, BOPOMOFO)
        ]
        if result:
            result += ' '
        result += ''.join(syllables)
    return result
def latin_to_bopomofo(text):
    """Spell out Latin letters in ``text`` as bopomofo.

    Every (pattern, spelling) pair of ``_latin_to_bopomofo`` is applied in
    table order, each one to the result of the previous substitution.
    """
    converted = text
    for pattern, spelling in _latin_to_bopomofo:
        converted = pattern.sub(spelling, converted)
    return converted
def bopomofo_to_romaji(text):
    """Rewrite bopomofo symbols in ``text`` as romaji.

    Every (pattern, replacement) pair of ``_bopomofo_to_romaji`` is applied
    in table order, each one to the result of the previous substitution.
    """
    converted = text
    for pattern, romaji in _bopomofo_to_romaji:
        converted = pattern.sub(romaji, converted)
    return converted
def chinese_to_romaji(text):
    """Convert Chinese ``text`` to the romaji notation used by this module.

    Pipeline: numbers -> Chinese numerals, Chinese -> bopomofo, Latin
    letters -> bopomofo, bopomofo -> romaji, followed by four clean-up
    passes on the romaji string (described inline).
    """
    romaji = bopomofo_to_romaji(
        latin_to_bopomofo(chinese_to_bopomofo(number_to_chinese(text))))

    def _swap_first(letter):
        # Build a callback that replaces the first character of a match
        # with ``letter`` and keeps the rest.
        return lambda match: letter + match.group(0)[1:]

    # 'i' directly before a/o/e becomes 'y'.
    romaji = re.sub('i[aoe]', _swap_first('y'), romaji)
    # 'u' directly before a/o/ə/e becomes 'w'.
    romaji = re.sub('u[aoəe]', _swap_first('w'), romaji)
    # A backtick-marked ʦ/s/ɹ (optionally followed by ⁼ or ʰ) that runs
    # straight into tone arrows/spaces or the end of the string gets 'ɹ`'
    # inserted after it.
    romaji = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\g<1>ɹ`\g<2>', romaji)
    # Every 'ɻ' is then rewritten as 'ɹ`'.
    romaji = romaji.replace('ɻ', 'ɹ`')
    # Same idea for plain ʦ/s (no backtick): insert 'ɹ'.
    return re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\g<1>ɹ\g<2>', romaji)
def chinese_to_lazy_ipa(text):
    """Convert Chinese ``text`` to romaji, then apply ``_romaji_to_ipa``.

    The (pattern, replacement) pairs are applied in table order, each one to
    the result of the previous substitution.
    """
    ipa = chinese_to_romaji(text)
    for pattern, symbol in _romaji_to_ipa:
        ipa = pattern.sub(symbol, ipa)
    return ipa