# EnglishToucan/Preprocessing/articulatory_features.py (initial commit 6faeba1, by Flux9665)
# -*- coding: utf-8 -*-
# partly derived from an open-source resource provided by Papercup Technologies Limited
# Resource-Author: Marlene Staib
# Modified by Florian Lux, 2021
# Further modified by Florian Lux, 2022
"""
All phonemes in the IPA standard are supported.
Zero-width characters are generally not supported, nor
are some other modifiers. Tone, stress and lengthening
are represented with placeholder dimensions; however,
they need to be set manually, because this conversion
from phonemes to features works on a character-by-character
basis. In a few cases, the place of articulation is
approximated because only one phoneme had such a
combination, which does not warrant a new dimension.
"""
def _vowel_features(frontness, openness, roundedness, nasal=False):
    """
    Build the feature description of a vowel. All vowels in the table are voiced.
    Nasal vowels reuse the 'nasal' value of the consonant manner dimension.
    """
    features = {
        'symbol_type'      : 'phoneme',
        'vowel_consonant'  : 'vowel',
        'VUV'              : 'voiced',
        'vowel_frontness'  : frontness,
        'vowel_openness'   : openness,
        'vowel_roundedness': roundedness,
    }
    if nasal:
        features['consonant_manner'] = 'nasal'
    return features


def _consonant_features(voicing, place, manner):
    """
    Build the feature description of a consonant from voicing, place and manner of articulation.
    """
    return {
        'symbol_type'     : 'phoneme',
        'vowel_consonant' : 'consonant',
        'VUV'             : voicing,
        'consonant_place' : place,
        'consonant_manner': manner,
    }


def generate_feature_lookup():
    """
    Return a dict that maps every supported symbol (punctuation, word boundary
    and IPA characters) to a dict of its categorical articulatory features.

    Non-phoneme symbols only carry a 'symbol_type'. Vowels carry frontness,
    openness and roundedness, consonants carry place and manner; both carry
    'VUV' (voiced / unvoiced).

    Two entries were corrected to agree with the IPA chart: 'ʊ' is a rounded
    vowel (it was marked unrounded) and 'ɻ' is a voiced approximant (it was
    marked unvoiced). NOTE(review): this changes the feature vectors of these
    two symbols, so models trained with the previous values see slightly
    different inputs for them.
    """
    vowel = _vowel_features
    consonant = _consonant_features
    # REMEMBER to also add the phonemes added here to the ID lookup below as the new highest ID
    return {
        '~': {'symbol_type': 'silence'},
        '#': {'symbol_type': 'end of sentence'},
        '?': {'symbol_type': 'questionmark'},
        '!': {'symbol_type': 'exclamationmark'},
        '.': {'symbol_type': 'fullstop'},
        ' ': {'symbol_type': 'word-boundary'},
        'ɜ': vowel('central', 'open-mid', 'unrounded'),
        'ə': vowel('central', 'mid', 'unrounded'),
        'a': vowel('front', 'open', 'unrounded'),
        'ð': consonant('voiced', 'dental', 'fricative'),
        'ɛ': vowel('front', 'open-mid', 'unrounded'),
        'ɪ': vowel('front_central', 'close_close-mid', 'unrounded'),
        'ŋ': consonant('voiced', 'velar', 'nasal'),
        'ɔ': vowel('back', 'open-mid', 'rounded'),
        'ɒ': vowel('back', 'open', 'rounded'),
        'ɾ': consonant('voiced', 'alveolar', 'flap'),
        'ʃ': consonant('unvoiced', 'postalveolar', 'fricative'),
        'θ': consonant('unvoiced', 'dental', 'fricative'),
        'ʊ': vowel('central_back', 'close_close-mid', 'rounded'),  # corrected: was 'unrounded'
        'ʌ': vowel('back', 'open-mid', 'unrounded'),
        'ʒ': consonant('voiced', 'postalveolar', 'fricative'),
        'æ': vowel('front', 'open-mid_open', 'unrounded'),
        'b': consonant('voiced', 'bilabial', 'plosive'),
        'ʔ': consonant('unvoiced', 'glottal', 'plosive'),
        'd': consonant('voiced', 'alveolar', 'plosive'),
        'e': vowel('front', 'close-mid', 'unrounded'),
        'f': consonant('unvoiced', 'labiodental', 'fricative'),
        'ɡ': consonant('voiced', 'velar', 'plosive'),
        'h': consonant('unvoiced', 'glottal', 'fricative'),
        'i': vowel('front', 'close', 'unrounded'),
        'j': consonant('voiced', 'palatal', 'approximant'),
        'k': consonant('unvoiced', 'velar', 'plosive'),
        'l': consonant('voiced', 'alveolar', 'lateral-approximant'),
        'm': consonant('voiced', 'bilabial', 'nasal'),
        'n': consonant('voiced', 'alveolar', 'nasal'),
        'ɳ': consonant('voiced', 'retroflex', 'nasal'),
        'o': vowel('back', 'close-mid', 'rounded'),
        'p': consonant('unvoiced', 'bilabial', 'plosive'),
        'ɹ': consonant('voiced', 'alveolar', 'approximant'),
        'r': consonant('voiced', 'alveolar', 'trill'),
        's': consonant('unvoiced', 'alveolar', 'fricative'),
        't': consonant('unvoiced', 'alveolar', 'plosive'),
        'u': vowel('back', 'close', 'rounded'),
        'v': consonant('voiced', 'labiodental', 'fricative'),
        'w': consonant('voiced', 'labial-velar', 'approximant'),
        'x': consonant('unvoiced', 'velar', 'fricative'),
        'z': consonant('voiced', 'alveolar', 'fricative'),
        'ʀ': consonant('voiced', 'uvular', 'trill'),
        'ø': vowel('front', 'close-mid', 'rounded'),
        'ç': consonant('unvoiced', 'palatal', 'fricative'),
        'ɐ': vowel('central', 'open', 'unrounded'),
        'œ': vowel('front', 'open-mid', 'rounded'),
        'y': vowel('front', 'close', 'rounded'),
        'ʏ': vowel('front_central', 'close_close-mid', 'rounded'),
        'ɑ': vowel('back', 'open', 'unrounded'),
        'c': consonant('unvoiced', 'palatal', 'plosive'),
        'ɲ': consonant('voiced', 'palatal', 'nasal'),
        'ɣ': consonant('voiced', 'velar', 'fricative'),
        'ʎ': consonant('voiced', 'palatal', 'lateral-approximant'),
        'β': consonant('voiced', 'bilabial', 'fricative'),
        'ʝ': consonant('voiced', 'palatal', 'fricative'),
        'ɟ': consonant('voiced', 'palatal', 'plosive'),
        'q': consonant('unvoiced', 'uvular', 'plosive'),
        'ɕ': consonant('unvoiced', 'alveolopalatal', 'fricative'),
        'ɭ': consonant('voiced', 'retroflex', 'lateral-approximant'),
        'ɵ': vowel('central', 'close-mid', 'rounded'),
        'ʑ': consonant('voiced', 'alveolopalatal', 'fricative'),
        'ʋ': consonant('voiced', 'labiodental', 'approximant'),
        'ʁ': consonant('voiced', 'uvular', 'fricative'),
        'ɨ': vowel('central', 'close', 'unrounded'),
        'ʂ': consonant('unvoiced', 'retroflex', 'fricative'),
        'ɓ': consonant('voiced', 'bilabial', 'implosive'),
        'ʙ': consonant('voiced', 'bilabial', 'vibrant'),
        'ɗ': consonant('voiced', 'dental', 'implosive'),
        'ɖ': consonant('voiced', 'retroflex', 'plosive'),
        'χ': consonant('unvoiced', 'uvular', 'fricative'),
        'ʛ': consonant('voiced', 'uvular', 'implosive'),
        'ʟ': consonant('voiced', 'velar', 'lateral-approximant'),
        'ɽ': consonant('voiced', 'retroflex', 'flap'),
        'ɢ': consonant('voiced', 'uvular', 'plosive'),
        'ɠ': consonant('voiced', 'velar', 'implosive'),
        'ǂ': consonant('unvoiced', 'alveolopalatal', 'click'),
        'ɦ': consonant('voiced', 'glottal', 'fricative'),
        'ǁ': consonant('unvoiced', 'alveolar', 'click'),
        'ĩ': vowel('front', 'close', 'unrounded', nasal=True),  # identical description with i except nasal
        'ʍ': consonant('unvoiced', 'labial-velar', 'fricative'),
        'ʕ': consonant('voiced', 'pharyngal', 'fricative'),
        'ɻ': consonant('voiced', 'retroflex', 'approximant'),  # corrected: was 'unvoiced'
        'ʄ': consonant('voiced', 'palatal', 'implosive'),
        'ũ': vowel('back', 'close', 'rounded', nasal=True),  # identical with u, but nasal
        'ɤ': vowel('back', 'close-mid', 'unrounded'),
        'ɶ': vowel('front', 'open', 'rounded'),
        'õ': vowel('back', 'close-mid', 'rounded', nasal=True),  # identical with o, but nasal
        'ʡ': consonant('unvoiced', 'epiglottal', 'plosive'),
        'ʈ': consonant('unvoiced', 'retroflex', 'plosive'),
        'ʜ': consonant('unvoiced', 'epiglottal', 'fricative'),
        'ɱ': consonant('voiced', 'labiodental', 'nasal'),
        'ɯ': vowel('back', 'close', 'unrounded'),
        'ǀ': consonant('unvoiced', 'dental', 'click'),
        'ɸ': consonant('unvoiced', 'bilabial', 'fricative'),
        'ʘ': consonant('unvoiced', 'bilabial', 'click'),
        'ʐ': consonant('voiced', 'retroflex', 'fricative'),
        'ɰ': consonant('voiced', 'velar', 'approximant'),
        'ɘ': vowel('central', 'close-mid', 'unrounded'),
        'ħ': consonant('unvoiced', 'pharyngal', 'fricative'),
        'ɞ': vowel('central', 'open-mid', 'rounded'),
        'ʉ': vowel('central', 'close', 'rounded'),
        'ɴ': consonant('voiced', 'uvular', 'nasal'),
        'ʢ': consonant('voiced', 'epiglottal', 'fricative'),
        'ѵ': consonant('voiced', 'labiodental', 'flap'),
        # looks deceivingly like an exclamation mark, but it's a different unicode entry
        'ǃ': consonant('unvoiced', 'postalveolar', 'click'),
    }
def get_phone_to_id():
    """
    Return the symbol-to-ID mapping used for the states of the CTC loss and
    Dijkstra/MAS in the aligner.

    The order of the symbols is spelled out explicitly instead of being
    derived from the feature lookup, because the IDs need to be consistent.
    """
    ordered_symbols = "~#?!ǃ.ɜəaðɛɪŋɔɒɾʃθʊʌʒæbʔdefghijklmnɳopɡɹrstuvwxzʀøçɐœyʏɑcɲɣʎβʝɟqɕɭɵʑʋʁɨʂɓʙɗɖχʛʟɽɢɠǂɦǁĩʍʕɻʄũɤɶõʡʈʜɱɯǀɸʘʐɰɘħɞʉɴʢѵ"
    symbol_ids = {symbol: position for position, symbol in enumerate(ordered_symbols)}
    # The following fixes an issue with the aligner: while the different punctuation marks
    # have different effects on their context, their realization in the signal is typically
    # just silence. Since this is common for all of them, the CTC objective malfunctions for
    # our purposes of alignment search. So it turned out that it's better to map all
    # punctuation marks to silence.
    silence_id = symbol_ids["~"]
    for punctuation_mark in ("#", "?", "!", "."):
        symbol_ids[punctuation_mark] = silence_id
    return symbol_ids
def get_feature_to_index_lookup():
    """
    Return the mapping from every feature value to its position in the
    feature vector. The position of a value is its rank in the tuple
    below, so the order of the entries must not be changed.
    """
    ordered_feature_values = (
        # MODIFIER
        # -- stress: modified by the previous symbol
        "stressed",  # 0
        # -- tone: modified by the following symbol
        "very-high-tone",  # 1
        "high-tone",
        "mid-tone",
        "low-tone",
        "very-low-tone",
        "rising-tone",
        "falling-tone",
        "peaking-tone",
        "dipping-tone",  # 9
        # -- lengthening: modified by the following symbol
        "lengthened",  # 10
        "half-length",
        "shortened",  # 12
        # CATEGORIES
        "consonant",  # 13
        "vowel",
        "phoneme",  # 15
        # NON-SPEECH-MARKERS
        "silence",  # 16
        "end of sentence",
        "questionmark",
        "exclamationmark",
        "fullstop",
        "word-boundary",  # 21
        # PLACE
        "dental",  # 22
        "postalveolar",
        "velar",
        "palatal",
        "glottal",
        "uvular",
        "labiodental",
        "labial-velar",
        "alveolar",
        "bilabial",
        "alveolopalatal",
        "retroflex",
        "pharyngal",
        "epiglottal",  # 35
        # TONGUE POSITION
        "central",  # 36
        "back",
        "front_central",
        "front",
        "central_back",  # 40
        # MOUTH OPENNESS
        "mid",  # 41
        "close-mid",
        "close",
        "open-mid",
        "close_close-mid",
        "open-mid_open",
        "open",  # 47
        # MOUTH SHAPE
        "rounded",  # 48
        "unrounded",  # 49
        # MANNER
        "plosive",  # 50
        "nasal",
        "approximant",
        "trill",
        "flap",
        "fricative",
        "lateral-approximant",
        "implosive",
        "vibrant",
        "click",
        "ejective",  # 60
        # TYPE
        "aspirated",  # 61
        "unvoiced",
        "voiced",  # 63
    )
    return {value: position for position, value in enumerate(ordered_feature_values)}
def generate_feature_table():
    """
    Build the binary articulatory feature vector of every symbol.

    Returns a dict that maps each single-character symbol of the feature
    lookup to a list of 0/1 integers. A position is set to 1 if the symbol
    has the feature value that the index lookup assigns to this position.
    Symbols that are not phonemes (punctuation, word boundary) additionally
    get the silence marker. Feature values that are missing from the index
    lookup are skipped in the vectors and reported on stdout.
    """
    ipa_to_phonemefeats = generate_feature_lookup()
    value_to_index = get_feature_to_index_lookup()

    # this conversion works character by character, so multi-character entries are ignored
    single_char_feats = {ipa: feats for ipa, feats in ipa_to_phonemefeats.items() if len(ipa) == 1}

    # collect for every feature type the set of values that occur in the lookup
    feat_to_val_set = dict()
    for feats in single_char_feats.values():
        for feat, value in feats.items():
            feat_to_val_set.setdefault(feat, set()).add(value)

    # 15 features come from modifiers, not from lexical sounds, so we have to add them
    # to the ones we encounter naturally in the lexical sounds. The length is the same
    # for every symbol, so it is computed once instead of once per symbol.
    number_of_modifier_features = 15
    vector_length = number_of_modifier_features + sum(len(values) for values in feat_to_val_set.values())

    phoneme_index = value_to_index["phoneme"]
    silence_index = value_to_index["silence"]

    phone_to_vector = dict()
    for ipa, feats in single_char_feats.items():
        vector = [0] * vector_length
        for value in feats.values():
            if value in value_to_index:
                vector[value_to_index[value]] = 1
        if vector[phoneme_index] != 1:
            # it's not a phoneme, so we give it the silence marker, regardless of what it is.
            vector[silence_index] = 1
        phone_to_vector[ipa] = vector

    # report values that have no position in the vector, they indicate that
    # the feature lookup and the index lookup are out of sync
    for values in feat_to_val_set.values():
        for value in values:
            if value not in value_to_index:
                print(f"Unknown feature value in featureset! {value}")

    return phone_to_vector
if __name__ == '__main__':
    # when run as a script, dump the complete symbol-to-vector table for manual inspection
    feature_table = generate_feature_table()
    print(feature_table)