"""Convert Chinese text to bopomofo, a romaji-like notation and a rough IPA.

The conversion is a pipeline of plain string rewrites:

    digits -> Chinese numerals -> bopomofo -> romaji -> "lazy" IPA

Each stage is exposed as its own function so callers can stop wherever
they need to.
"""

import logging
import os
import re
import sys

import cn2an
import jieba
from pypinyin import lazy_pinyin, BOPOMOFO

# Optional jieba configuration, left disabled:
# logging.getLogger('jieba').setLevel(logging.WARNING)
# jieba.set_dictionary(os.path.dirname(sys.argv[0]) + '/jieba/dict.txt')

# NOTE: every table below is applied in order, one entry after another, so
# multi-character keys must stay ahead of the single characters they contain.
# Keys are passed through re.escape() because they are literal text, not
# patterns; a bare metacharacter key such as '?' would otherwise make
# re.compile() raise "nothing to repeat" at import time.

# List of (Latin alphabet, bopomofo) pairs:
_latin_to_bopomofo = [
    (re.compile(re.escape(latin), re.IGNORECASE), bopomofo)
    for latin, bopomofo in [
        ('a', 'ㄟˉ'),
        ('b', 'ㄅㄧˋ'),
        ('c', 'ㄙㄧˉ'),
        ('d', 'ㄉㄧˋ'),
        ('e', 'ㄧˋ'),
        ('f', 'ㄝˊㄈㄨˋ'),
        ('g', 'ㄐㄧˋ'),
        ('h', 'ㄝˇㄑㄩˋ'),
        ('i', 'ㄞˋ'),
        ('j', 'ㄐㄟˋ'),
        ('k', 'ㄎㄟˋ'),
        ('l', 'ㄝˊㄛˋ'),
        ('m', 'ㄝˊㄇㄨˋ'),
        ('n', 'ㄣˉ'),
        ('o', 'ㄡˉ'),
        ('p', 'ㄆㄧˉ'),
        ('q', 'ㄎㄧㄡˉ'),
        ('r', 'ㄚˋ'),
        ('s', 'ㄝˊㄙˋ'),
        ('t', 'ㄊㄧˋ'),
        ('u', 'ㄧㄡˉ'),
        ('v', 'ㄨㄧˉ'),
        ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
        ('x', 'ㄝˉㄎㄨˋㄙˋ'),
        ('y', 'ㄨㄞˋ'),
        ('z', 'ㄗㄟˋ'),
    ]
]

# List of (bopomofo, romaji) pairs:
_bopomofo_to_romaji = [
    (re.compile(re.escape(bopomofo)), romaji)
    for bopomofo, romaji in [
        ('ㄅㄛ', 'p⁼wo'),
        ('ㄆㄛ', 'pʰwo'),
        ('ㄇㄛ', 'mwo'),
        ('ㄈㄛ', 'fwo'),
        ('ㄅ', 'p⁼'),
        ('ㄆ', 'pʰ'),
        ('ㄇ', 'm'),
        ('ㄈ', 'f'),
        ('ㄉ', 't⁼'),
        ('ㄊ', 'tʰ'),
        ('ㄋ', 'n'),
        ('ㄌ', 'l'),
        ('ㄍ', 'k⁼'),
        ('ㄎ', 'kʰ'),
        ('ㄏ', 'h'),
        ('ㄐ', 'ʧ⁼'),
        ('ㄑ', 'ʧʰ'),
        ('ㄒ', 'ʃ'),
        ('ㄓ', 'ʦ`⁼'),
        ('ㄔ', 'ʦ`ʰ'),
        ('ㄕ', 's`'),
        ('ㄖ', 'ɹ`'),
        ('ㄗ', 'ʦ⁼'),
        ('ㄘ', 'ʦʰ'),
        ('ㄙ', 's'),
        ('ㄚ', 'a'),
        ('ㄛ', 'o'),
        ('ㄜ', 'ə'),
        ('ㄝ', 'e'),
        ('ㄞ', 'ai'),
        ('ㄟ', 'ei'),
        ('ㄠ', 'au'),
        ('ㄡ', 'ou'),
        ('ㄧㄢ', 'yeNN'),
        ('ㄢ', 'aNN'),
        ('ㄧㄣ', 'iNN'),
        ('ㄣ', 'əNN'),
        ('ㄤ', 'aNg'),
        ('ㄧㄥ', 'iNg'),
        ('ㄨㄥ', 'uNg'),
        ('ㄩㄥ', 'yuNg'),
        ('ㄥ', 'əNg'),
        ('ㄦ', 'əɻ'),
        ('ㄧ', 'i'),
        ('ㄨ', 'u'),
        ('ㄩ', 'ɥ'),
        # Tone marks become arrows; the neutral-tone dot is dropped.
        ('ˉ', '→'),
        ('ˊ', '↑'),
        ('ˇ', '↓↑'),
        ('ˋ', '↓'),
        ('˙', ''),
        # Punctuation is folded to ASCII.  ASCII ',', '!' and '?' are already
        # in their target form and need no entry; the full-width forms are
        # spelled as escapes so they cannot be silently normalised away.
        ('\uff0c', ','),  # full-width comma
        ('。', '.'),
        ('\uff01', '!'),  # full-width exclamation mark
        ('\uff1f', '?'),  # full-width question mark
        ('—', '-'),
    ]
]

# List of (romaji, ipa) pairs:
_romaji_to_ipa = [
    (re.compile(re.escape(romaji), re.IGNORECASE), ipa)
    for romaji, ipa in [
        ('ʃy', 'ʃ'),
        ('ʧʰy', 'ʧʰ'),
        ('ʧ⁼y', 'ʧ⁼'),
        ('NN', 'n'),
        ('Ng', 'ŋ'),
        ('y', 'j'),
        ('h', 'x'),
    ]
]

# An integer, or a decimal with digits on both sides of the point.
_NUMBER_RE = re.compile(r'\d+(?:\.?\d+)?')
# Tokens without any character in this range are copied through untouched.
_CJK_RE = re.compile('[\u4e00-\u9fff]')
# A bopomofo letter (as opposed to a tone mark) at the end of a syllable.
_BOPOMOFO_LETTER_RE = re.compile('[\u3105-\u3129]')
# Enumeration/clause separators that are all rewritten to a plain comma.
_SEPARATORS_TO_COMMA = str.maketrans({
    '、': ',',
    ';': ',',
    ':': ',',
    '\uff1b': ',',  # full-width semicolon
    '\uff1a': ',',  # full-width colon
})


def number_to_chinese(text: str) -> str:
    """Replace every Arabic number in *text* with ``cn2an.an2cn`` of it.

    Args:
        text: Arbitrary input text.

    Returns:
        The text with each run matched by ``\\d+(?:\\.?\\d+)?`` converted;
        everything else is left as it was.
    """
    # Substituting match-by-match keeps each replacement tied to the exact
    # span that was found, instead of searching the text again for it.
    return _NUMBER_RE.sub(lambda match: cn2an.an2cn(match.group(0)), text)


def chinese_to_bopomofo(text: str) -> str:
    """Segment *text* with jieba and transcribe the Chinese words to bopomofo.

    Words containing no character in U+4E00..U+9FFF are appended verbatim.
    Transcribed words are separated from whatever precedes them by a single
    space, and any syllable that ends in a bopomofo letter (i.e. carries no
    tone mark) gets an explicit 'ˉ' appended.

    Args:
        text: Input text; '、', ';' and ':' (ASCII or full-width) are turned
            into ',' before segmentation.

    Returns:
        The transcribed string.
    """
    text = text.translate(_SEPARATORS_TO_COMMA)
    pieces = []
    for word in jieba.lcut(text, cut_all=False):
        if not word:
            continue
        if not _CJK_RE.search(word):
            pieces.append(word)
            continue
        syllables = [
            # syllable[-1:] (not [-1]) so an empty syllable cannot raise.
            syllable + 'ˉ'
            if _BOPOMOFO_LETTER_RE.match(syllable[-1:])
            else syllable
            for syllable in lazy_pinyin(word, BOPOMOFO)
        ]
        if pieces:
            pieces.append(' ')
        pieces.append(''.join(syllables))
    return ''.join(pieces)


def latin_to_bopomofo(text: str) -> str:
    """Spell out every Latin letter in *text* (case-insensitively) in bopomofo."""
    for pattern, replacement in _latin_to_bopomofo:
        text = pattern.sub(replacement, text)
    return text


def bopomofo_to_romaji(text: str) -> str:
    """Rewrite bopomofo letters, tone marks and punctuation as romaji."""
    for pattern, replacement in _bopomofo_to_romaji:
        text = pattern.sub(replacement, text)
    return text


def chinese_to_romaji(text: str) -> str:
    """Run the full pipeline from raw Chinese text to romaji.

    Numbers are converted to Chinese numerals, the text is transcribed to
    bopomofo, Latin letters are spelled out, and the result is romanised.
    Four clean-up rewrites are then applied to the romaji.

    Args:
        text: Raw input text.

    Returns:
        The romaji transcription.
    """
    text = number_to_chinese(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    text = bopomofo_to_romaji(text)
    # 'i' / 'u' directly before a vowel are written as the glides 'y' / 'w'.
    text = re.sub('i([aoe])', r'y\1', text)
    text = re.sub('u([aoəe])', r'w\1', text)
    # A "`"-marked initial followed straight away by a tone arrow, a space or
    # the end of the text has no vowel of its own: give it 'ɹ`'.
    text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', text)
    text = text.replace('ɻ', 'ɹ`')
    # Same for the unmarked ʦ / s initials, which get a plain 'ɹ'.
    text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
    return text


def chinese_to_lazy_ipa(text: str) -> str:
    """Convert *text* to romaji, then apply the romaji-to-IPA substitutions."""
    text = chinese_to_romaji(text)
    for pattern, replacement in _romaji_to_ipa:
        text = pattern.sub(replacement, text)
    return text