import re ############################################################### # X-SAMPA _xsampa2ipa = { k:re.sub(r'◌','',v) for (k,v) in { '#':'#', '=':'◌̩', '>':'◌ʼ', '`':'◌˞', '~':'◌̃', 'a':'a', 'b':'b', 'b_<':'ɓ', 'c':'c', 'd':'d', 'd`':'ɖ', 'd_<':'ɗ', 'e':'e', 'f':'f', 'g':'ɡ', 'g_<':'ɠ', 'h':'h', 'h\\':'ɦ', 'i':'i', 'j':'j', 'j\\':'ʝ', 'k':'k', 'l':'l', 'l`':'ɭ', 'l\\':'ɺ', 'm':'m', 'n':'n', 'n_d':'nd', 'n`':'ɳ', 'o':'o', 'p':'p', 'p\\':'ɸ', 'p_<':'ɓ̥', 'q':'q', 'r':'r', 'r`':'ɽ', 'r\\':'ɹ', 'r\\`':'ɻ', 's':'s', 's`':'ʂ', 's\\':'ɕ', 't':'t', 't`':'ʈ', 'u':'u', 'v':'v', 'v\\':'ʋ', 'w':'w', 'x':'x', 'x\\':'ɧ', 'y':'y', 'z':'z', 'z`':'ʐ', 'z\\':'ʑ', 'A':'ɑ', 'B':'β', 'B\\':'ʙ', 'C':'ç', 'D':'ð', 'E':'ɛ', 'F':'ɱ', 'G':'ɣ', 'G\\':'ɢ', 'G\\_<':'ʛ', 'H':'ɥ', 'H\\':'ʜ', 'I':'ɪ', 'I\\':'ɪ̈ ', 'J':'ɲ', 'J\\':'ɟ', 'J\\_<':'ʄ', 'K':'ɬ', 'K\\':'ɮ', 'L':'ʎ', 'L\\':'ʟ', 'M':'ɯ', 'M\\':'ɰ', 'N':'ŋ', 'N_g':'ŋɡ', 'N\\':'ɴ', 'O':'ɔ', 'O\\':'ʘ', 'P':'ʋ', 'Q':'ɒ', 'R':'ʁ', 'R\\':'ʀ', 'S':'ʃ', 'T':'θ', 'U':'ʊ', 'U\\':'ʊ̈ ', 'V':'ʌ', 'W':'ʍ', 'X':'χ', 'X\\':'ħ', 'Y':'ʏ', 'Z':'ʒ', '.':'.', '"':'ˈ', '%':'ˌ', '\'':'ʲ', ':':'ː', ':\\':'ˑ', '-':'', '@':'ə', '@\\':'ɘ', '{':'æ', '}':'ʉ', '1':'ɨ', '2':'ø', '3':'ɜ', '3\\':'ɞ', '4':'ɾ', '5':'ɫ', '6':'ɐ', '7':'ɤ', '8':'ɵ', '9':'œ', '&':'ɶ', '?':'ʔ', '?\\':'ʕ', '*':'', '/':'', '<\\':'ʢ', '>\\':'ʡ', '^':'ꜛ', '!':'ꜜ', '!\\':'ǃ', '|':'|', '|\\':'ǀ', '||':'‖', '|\\|\\':'ǁ', '=\\':'ǂ', '-\\':'‿' }.items() } _xsampa_vowels=set('aeiouyAEIOUYQV@123}{6789&')|set(('I\\','U\\','@\\','3\\')) _xdiacritics2ipa = { k:re.sub(r'◌','',v) for (k,v) in { '"':'◌̈', '+':'◌̟', '-':'◌̠', '/':'◌̌', '0':'◌̥', '=':'◌̩', '>':'◌ʼ', '?\\':'◌ˤ', '\\':'◌̂', '^':'◌̯', '}':'◌̚', '`':'◌˞', '~':'◌̃', 'A':'◌̘', 'a':'◌̺', 'B':'◌̏', 'B_L':'◌᷅', 'c':'◌̜', 'd':'◌̪', 'e':'◌̴', 'F':'◌̂', 'G':'◌ˠ', 'H':'◌́', 'H_T':'◌᷄', 'h':'◌ʰ', 'j':'◌ʲ', 'k':'◌̰', 'L':'◌̀', 'l':'◌ˡ', 'M':'◌̄', 'm':'◌̻', 'N':'◌̼', 'n':'◌ⁿ', 'O':'◌̹', 'o':'◌̞', 'q':'◌̙', 'R':'◌̌', 'R_F':'◌᷈', 'r':'◌̝', 'T':'◌̋', 't':'◌̤', 'v':'◌̬', 'w':'◌ʷ', 'X':'◌̆', 'x':'◌̽', '1':'˥', '2':'˦', '3':'˧', '4':'˨', '5':'˩', }.items() } # Create and _xsampa2ipa with '_'+k for each diacritic _xsampa_and_diac2ipa = _xsampa2ipa.copy() _xsampa_and_diac2ipa.update({ ('_'+k):v for (k,v) in _xdiacritics2ipa.items() }) _ipa2xsampa = { v:k for (k,v) in _xsampa_and_diac2ipa.items() } ################################################################## # Language-dependent tone numbers _tone2ipa = { 'arz' : { '0':'', '1':'ˈ', '2':'ˌ' }, 'eng' : { '0':'', '1':'ˈ', '2':'ˌ' }, 'yue' : { '0':'', '1':'˥', '2':'˧˥', '3':'˧', '4':'˨˩', '5':'˩˧', '6':'˨' }, 'lao' : { '0':'', '1':'˧', '2':'˥˧', '3':'˧˩', '4':'˥', '5':'˩˧', '6':'˩' }, 'cmn' : { '0':'', '1':'˥', '2':'˧˥', '3':'˨˩˦', '4':'˥˩', '5':'' }, 'spa' : { '0':'', '1':'ˈ', '2':'ˌ' }, 'vie' : { '0':'', '1':'˧', '2':'˨˩h', '3':'˧˥', '4':'˨˩˨', '5':'˧ʔ˥', '6':'˧˨ʔ' }, } ##################################################################### # DISC, the code used by CELEX, # is a kind of modified X-SAMPA, # modified to include a lot of one-character shortcuts for phones # that would require two characters in X-SAMPA. # Some of the one-character shortcuts are language-dependent, # in the sense that the same ASCII character is re-used for different IPA # symbols in different languages. # The language-independent table, below, includes only the symbols that # are not part of X-SAMPA. _disc2ipa = { k:re.sub(r'◌','',v) for (k,v) in { '_':'dʒ', 'a':'aː', 'b':'b', 'c':'æ◌̃', 'd':'d', 'e':'eː', 'f':'f', 'g':'ɡ', 'h':'h', 'i':'iː', 'j':'j', 'k':'k', 'l':'l', 'm':'m', 'n':'n', 'o':'oː', 'p':'p', 'q':'ɑ◌̃ː', 'r':'r', 's':'s', 't':'t', 'u':'uː', 'v':'v', 'w':'w', 'x':'x', 'y':'y', 'y':'yː', 'A':'ɑ', 'B':'au', 'C':'ŋ◌̩', 'D':'ð', 'E':'ɛ', 'F':'m◌̩', 'G':'ɣ', 'H':'n◌̩', 'I':'ɪ', 'J':'ɲ', 'K':'ɛɪ', 'L':'œɪ', 'M':'ɯ', 'N':'ŋ', 'O':'ɔ', 'P':'ʋ', 'P':'l◌̩', 'Q':'ɒ', 'R':'ɜ◌˞', 'S':'ʃ', 'T':'θ', 'U':'ʊ', 'V':'ʌ', 'W':'ai', 'X':'ɔy', 'Y':'ʏ', 'Z':'ʒ', '0':'æ◌̃ː', '1':'eɪ', '2':'aɪ', '3':'ɜː', '4':'ɔɪ', '5':'əʊ', '6':'aʊ', '7':'ɪə', '8':'ɛə', '9':'ʊə', '|':'øː', '!':'iːː', '(':'yːː', ')':'ɛː', '*':'œː', '<':'ɒː', '+':'pf', '=':'ts', '-':'.', '#':'ɑː', '$':'ɔː', '&':'a', '^':'œ◌̃', '~':'ɔ◌̃ː', "'":'ˈ', '@':'ə', '{':'æ', '}':'ʉ', }.items() } _disc_vowels=_xsampa_vowels|set('|!()*KL#$WBX^46cq~CFHPR5789') _ipa2disc = { v:k for (k,v) in _disc2ipa.items() } _ipa2disc['#'] = '' _disc2ipa_dutch = _disc2ipa.copy() _disc2ipa_dutch['w']='ʋ' _ipa2disc['ʋ']='w' _disc2ipa_english = _disc2ipa.copy() _disc2ipa_english['r']='ɻ' _ipa2disc['ɻ']='r' ####################################################################### # Callhome phone codes are completely language-dependent. # I know of three: Egyptian Arabic, Mandarin, and Spanish _callhome2ipa = {} _callhome2ipa['arz'] = { 'C':'ʔ', 'b':'b', 't':'t', 'g':'g', 'H':'ħ', 'x':'x', 'd':'d', 'r':'ɾ', 'z':'z', 's':'s', '$':'ʃ', 'S':'sˤ', 'D':'dˤ', 'T':'tˤ', 'Z':'ðˤ', 'c':'ʕ', 'G':'ɣ', 'f':'f', 'q':'ʔ', 'Q':'q', 'k':'k', 'l':'l', 'm':'m', 'n':'n', 'h':'h', 'w':'w', 'y':'j', 'v':'v', 'j':'dʒ', '@':'æ', 'a':'a', 'B':'a', 'i':'i', 'u':'u', '%':'æː', 'A':'aː', 'I':'iː', 'O':'oː', 'U':'uː', 'E':'eː', 'ay':'aj', 'aw':'aw' } _callhome2ipa['arz'].update(_tone2ipa['arz']) _callhome_vowels = dict() _callhome_vowels['arz'] = set('@aBiu%AIOUE')|set(('ay','aw')) _callhome2ipa['cmn'] = { 'b':'p', 'p':'pʰ', 'm':'m', 'd':'t', 't':'tʰ', 'l':'l', 'n':'n', 'g':'k', 'k':'kʰ', 'h':'h', 'N':'ŋ', 'z':'ts', 'c':'tsʰ', 's':'s', 'j':'tɕ', 'q':'tɕʰ', 'x':'ɕ', 'r':'ɻ', 'Z':'ʈʂ', 'C':'ʈʂʰ', 'S':'ʂ', 'f':'f', 'y':'j', 'w':'w', 'W':'ɥ', 'i':'i', 'I':'ɨ', '%':'ɯ', 'e':'e', 'E':'ɛ', 'U':'y', '&':'ə', 'a':'ɑ', '@':'a', 'o':'o', '>':'ɔ', 'u':'u', 'R':'ɚ' } _callhome2ipa['cmn'].update(_tone2ipa['cmn']) _callhome_vowels['cmn']=set('iI%eEU&a@o>uR') _callhome2ipa['spa'] = { 'a':'a', 'i':'i', 'e':'e', 'o':'o', 'u':'u', 'h':'h', 'p':'p', 'b':'b', 'B':'β', 'f':'f', 'v':'v', 'l':'l', 'm':'m', 'w':'w', 't':'t', 'd':'d', 'D':'ð', 's':'s', 'S':'ʃ', 'C':'tʃ', 'J':'dʒ', 'n':'n', 'y':'j', 'r':'ɾ', 'R':'r', 'x':'x', 'N':'ɲ', 'k':'k', 'g':'g', 'G':'ɣ', '9':'ŋ', 'z':'z' } _callhome2ipa['spa'].update(_tone2ipa['spa']) _callhome_vowels['spa']=set('aieou') _ipa2callhome={l:{v:k for (k,v) in d.items()}for (l,d) in _callhome2ipa.items()} #special cases, e.g., define best choice for ambiguous mappings _ipa2callhome['arz']['a']='a' ######################################################################## # ARPABET was invented for English. # The standard dictionary written in ARPABET is the CMU dictionary. # TIMIT is written in a variant of ARPABET that includes a couple # of non-standard allophones, and most significantly, includes # separate symbols for the closure and release portions of each stop. _arpabet2ipa = { 'AA':'ɑ', 'AE':'æ', 'AH':'ʌ', 'AH0':'ə', 'AO':'ɔ', 'AW':'aʊ', 'AY':'aɪ', 'EH':'ɛ', 'ER':'ɝ', 'ER0':'ɚ', 'EY':'eɪ', 'IH':'ɪ', 'IH0':'ɨ', 'IY':'i', 'OW':'oʊ', 'OY':'ɔɪ', 'UH':'ʊ', 'UW':'u', 'B':'b', 'CH':'tʃ', 'D':'d', 'DH':'ð', 'EL':'l̩ ', 'EM':'m̩', 'EN':'n̩', 'F':'f', 'G':'ɡ', 'HH':'h', 'JH':'dʒ', 'K':'k', 'L':'l', 'M':'m', 'N':'n', 'NG':'ŋ', 'P':'p', 'Q':'ʔ', 'R':'ɹ', 'S':'s', 'SH':'ʃ', 'T':'t', 'TH':'θ', 'V':'v', 'W':'w', 'WH':'ʍ', 'Y':'j', 'Z':'z', 'ZH':'ʒ' } _arpabet2ipa.update(_tone2ipa['eng']) # Add the English stress labels _arpabet_vowels=set((k for k in _arpabet2ipa.keys() if k[0] in 'AEIOU')) _ipa2arpabet = { v: k for k, v in _arpabet2ipa.items() } _ipa2tone = {l:{v:k for k,v in d.items()} for l,d in _tone2ipa.items()} _timit2ipa = _arpabet2ipa.copy() _timit2ipa.update({ 'AX':'ə', 'AX-H':'ə̥', 'AXR':'ɚ', 'B':'', 'BCL':'b', 'D':'', 'DCL':'d', 'DX':'ɾ', 'ENG':'ŋ̍', 'EPI':'', 'G':'', 'GCL':'g', 'HV':'ɦ', 'H#':'', 'IX':'ɨ', 'KCL':'k', 'K':'', 'NX':'ɾ̃', 'P':'', 'PAU':'', 'PCL':'p', 'T':'', 'TCL':'t', 'UX':'ʉ', }) ####################################################################### # IPA _ipa_vowels = set('aeiouyɑɒɛɪɔʘʊʌʏəɘæʉɨøɜɞɐɤɵœɶ')|set(('ɪ̈','ʊ̈')) _ipa_consonants = set('bɓcdɖɗfɡɠhɦjʝklɭɺmnɳpɸqrɽɹɻsʂɕtʈvʋwxɧzʐʑβʙçðɱɣɢʛɥʜɲɟʄɬɮʎʟɯɰŋɴʋɒʁʀʃθʍχħʒɾɫʔʕʢʡꜛꜜǃ|ǀ‖ǁǂ') _ipa_diacritics = set(re.sub(r'◌','','◌̈◌̟◌̠◌̌◌̥◌̩◌◌◌̂◌̯◌̚◌◌̃◌̘◌̺◌̏◌◌̜◌̪◌̴◌̂◌◌́◌◌◌◌̰◌̀◌◌̄◌̻◌̼◌◌̹◌̞◌̙◌̌◌◌̝◌̋◌̤◌̬◌◌̆◌̽ːʰˀʷʱʼʲˤ')) _ipa_stressmarkers = set("ˈˌ") _ipa_tonecharacters = set('˥˦˧˨˩˥˧') # A bit of recursion to generate all tones of up to 4 components _ipa_tones = _ipa_tonecharacters.copy() _ipa_tones |= set(x+y for x in _ipa_tones for y in _ipa_tones) _ipa_tones |= set(x+y for x in _ipa_tones for y in _ipa_tones) _ipa_symbols=_ipa_vowels|_ipa_consonants|_ipa_diacritics