# Spaces: Running Running  -- NOTE(review): looks like web-page residue captured with the file, not module code
# Derived from an open-source resource provided by Papercup Technologies Limited
# Resource-Author: Marlene Staib
# Modified by Florian Lux, 2021
def generate_feature_lookup():
    """
    Return the symbol -> articulatory-feature table.

    Every key is a single symbol (an IPA character or one of the
    pause/punctuation markers); every value is a dict of feature name ->
    feature value. Non-phoneme markers only carry 'symbol_type'. Vowels carry
    frontness / openness / roundedness, consonants carry voicing / place /
    manner. A fresh dict is built on every call.
    """

    def marker(kind):
        # Pause and punctuation symbols have no articulatory features.
        return {'symbol_type': kind}

    def vowel(frontness, openness, roundedness):
        # Every vowel in this table is voiced, so 'VUV' is fixed here.
        return {
            'symbol_type': 'phoneme',
            'vowel_consonant': 'vowel',
            'VUV': 'voiced',
            'vowel_frontness': frontness,
            'vowel_openness': openness,
            'vowel_roundedness': roundedness,
        }

    def consonant(voicing, place, manner):
        return {
            'symbol_type': 'phoneme',
            'vowel_consonant': 'consonant',
            'VUV': voicing,
            'consonant_place': place,
            'consonant_manner': manner,
        }

    # Entry order is kept as-is; vowels and consonants are interleaved.
    return {
        '~': marker('silence'),
        '#': marker('end of sentence'),
        '?': marker('questionmark'),
        '!': marker('exclamationmark'),
        '.': marker('fullstop'),
        'ɜ': vowel('central', 'open-mid', 'unrounded'),
        'ɫ': consonant('voiced', 'alveolar', 'lateral-approximant'),
        'ə': vowel('central', 'mid', 'unrounded'),
        'ɚ': vowel('central', 'mid', 'unrounded'),
        'a': vowel('front', 'open', 'unrounded'),
        'ð': consonant('voiced', 'dental', 'fricative'),
        'ɛ': vowel('front', 'open-mid', 'unrounded'),
        'ɪ': vowel('front_central', 'close_close-mid', 'unrounded'),
        'ᵻ': vowel('central', 'close', 'unrounded'),
        'ŋ': consonant('voiced', 'velar', 'nasal'),
        'ɔ': vowel('back', 'open-mid', 'rounded'),
        'ɒ': vowel('back', 'open', 'rounded'),
        'ɾ': consonant('voiced', 'alveolar', 'tap'),
        'ʃ': consonant('unvoiced', 'postalveolar', 'fricative'),
        'θ': consonant('unvoiced', 'dental', 'fricative'),
        # NOTE(review): the IPA chart lists ʊ as rounded -- confirm 'unrounded' is intended
        'ʊ': vowel('central_back', 'close_close-mid', 'unrounded'),
        'ʌ': vowel('back', 'open-mid', 'unrounded'),
        'ʒ': consonant('voiced', 'postalveolar', 'fricative'),
        'æ': vowel('front', 'open-mid_open', 'unrounded'),
        'b': consonant('voiced', 'bilabial', 'stop'),
        'ʔ': consonant('unvoiced', 'glottal', 'stop'),
        'd': consonant('voiced', 'alveolar', 'stop'),
        'e': vowel('front', 'close-mid', 'unrounded'),
        'f': consonant('unvoiced', 'labiodental', 'fricative'),
        'g': consonant('voiced', 'velar', 'stop'),  # ASCII g
        'h': consonant('unvoiced', 'glottal', 'fricative'),
        'i': vowel('front', 'close', 'unrounded'),
        'j': consonant('voiced', 'palatal', 'approximant'),
        'k': consonant('unvoiced', 'velar', 'stop'),
        'l': consonant('voiced', 'alveolar', 'lateral-approximant'),
        'm': consonant('voiced', 'bilabial', 'nasal'),
        'n': consonant('voiced', 'alveolar', 'nasal'),
        # NOTE(review): ɳ is the IPA retroflex nasal; it is encoded as palatal here, same as ɲ -- confirm
        'ɳ': consonant('voiced', 'palatal', 'nasal'),
        'o': vowel('back', 'close-mid', 'rounded'),
        'p': consonant('unvoiced', 'bilabial', 'stop'),
        'ɡ': consonant('voiced', 'velar', 'stop'),  # IPA script g (U+0261), same features as ASCII g
        'ɹ': consonant('voiced', 'alveolar', 'approximant'),
        'r': consonant('voiced', 'alveolar', 'trill'),
        's': consonant('unvoiced', 'alveolar', 'fricative'),
        't': consonant('unvoiced', 'alveolar', 'stop'),
        'u': vowel('back', 'close', 'rounded'),
        'v': consonant('voiced', 'labiodental', 'fricative'),
        'w': consonant('voiced', 'labial-velar', 'approximant'),
        'x': consonant('unvoiced', 'velar', 'fricative'),
        'z': consonant('voiced', 'alveolar', 'fricative'),
        'ʀ': consonant('voiced', 'uvular', 'trill'),
        'ø': vowel('front', 'close-mid', 'rounded'),
        'ç': consonant('unvoiced', 'palatal', 'fricative'),
        'ɐ': vowel('central', 'open', 'unrounded'),
        'œ': vowel('front', 'open-mid', 'rounded'),
        'y': vowel('front', 'close', 'rounded'),
        'ʏ': vowel('front_central', 'close_close-mid', 'rounded'),
        'ɑ': vowel('back', 'open', 'unrounded'),
        'c': consonant('unvoiced', 'palatal', 'stop'),
        'ɲ': consonant('voiced', 'palatal', 'nasal'),
        'ɣ': consonant('voiced', 'velar', 'fricative'),
        'ʎ': consonant('voiced', 'palatal', 'lateral-approximant'),
        'β': consonant('voiced', 'bilabial', 'fricative'),
        'ʝ': consonant('voiced', 'palatal', 'fricative'),
        'ɟ': consonant('voiced', 'palatal', 'stop'),
        'q': consonant('unvoiced', 'uvular', 'stop'),
        'ɕ': consonant('unvoiced', 'alveolopalatal', 'fricative'),
        'ʲ': consonant('voiced', 'palatal', 'approximant'),
        # should be retroflex, but palatal should be close enough
        'ɭ': consonant('voiced', 'palatal', 'lateral-approximant'),
        # NOTE(review): the IPA chart lists ɵ as close-mid -- confirm 'open-mid' is intended
        'ɵ': vowel('central', 'open-mid', 'rounded'),
        'ʑ': consonant('voiced', 'alveolopalatal', 'fricative'),
        'ʋ': consonant('voiced', 'labiodental', 'approximant'),
        'ʁ': consonant('voiced', 'uvular', 'fricative'),
        'ɨ': vowel('central', 'close', 'unrounded'),
        # should be retroflex, but palatal should be close enough
        'ʂ': consonant('unvoiced', 'palatal', 'fricative'),
        # should be noted it's also lateral, but should be close enough
        'ɬ': consonant('unvoiced', 'alveolar', 'fricative'),
    }  # REMEMBER to also add the phonemes added here to the ID lookup table in the TextFrontend as the new highest ID
def generate_feature_table():
    """
    Build a binary feature vector for every single-character symbol.

    The feature dicts from generate_feature_lookup() are turned into lists of
    0/1 ints: position value_to_index[v] is set to 1 for every feature value v
    the symbol has. Keys longer than one character are skipped.

    Returns:
        dict mapping symbol -> list of ints (0 or 1). The list length is the
        total number of distinct values per feature, summed over all features
        (42 for the current table).

    Side effects:
        Prints a warning for every feature value that occurs in the lookup
        but has no slot in value_to_index; such values are left out of the
        vectors.
    """
    ipa_to_phonemefeats = generate_feature_lookup()

    # Filter once instead of re-checking the key length in every pass.
    single_char_feats = {
        ipa: feats for ipa, feats in ipa_to_phonemefeats.items() if len(ipa) == 1
    }

    # feature name -> set of all values that feature takes in the table
    feat_to_val_set = dict()
    for feats in single_char_feats.values():
        for feat, value in feats.items():
            feat_to_val_set.setdefault(feat, set()).add(value)

    # Fixed position of every feature value inside the vector.
    # NOTE(review): this looks like it was generated once by enumerating the
    # value set and then frozen, presumably so the layout stays stable across
    # runs -- do not reorder; append new values with the next free index.
    value_to_index = {
        "dental"             : 0,
        "postalveolar"       : 1,
        "mid"                : 2,
        "close-mid"          : 3,
        "vowel"              : 4,
        "silence"            : 5,
        "consonant"          : 6,
        "close"              : 7,
        "velar"              : 8,
        "stop"               : 9,
        "palatal"            : 10,
        "nasal"              : 11,
        "glottal"            : 12,
        "central"            : 13,
        "back"               : 14,
        "approximant"        : 15,
        "uvular"             : 16,
        "open-mid"           : 17,
        "front_central"      : 18,
        "front"              : 19,
        "end of sentence"    : 20,
        "labiodental"        : 21,
        "close_close-mid"    : 22,
        "labial-velar"       : 23,
        "unvoiced"           : 24,
        "central_back"       : 25,
        "trill"              : 26,
        "rounded"            : 27,
        "open-mid_open"      : 28,
        "tap"                : 29,
        "alveolar"           : 30,
        "bilabial"           : 31,
        "phoneme"            : 32,
        "open"               : 33,
        "fricative"          : 34,
        "unrounded"          : 35,
        "lateral-approximant": 36,
        "voiced"             : 37,
        "questionmark"       : 38,
        "exclamationmark"    : 39,
        "fullstop"           : 40,
        "alveolopalatal"     : 41
    }

    # Same for every symbol, so compute it once (should be 42 right now).
    vector_size = sum(len(values) for values in feat_to_val_set.values())

    phone_to_vector = dict()
    for ipa, feats in single_char_feats.items():
        vector = [0] * vector_size
        for value in feats.values():
            # Values without a slot are skipped here and reported below.
            if value in value_to_index:
                vector[value_to_index[value]] = 1
        phone_to_vector[ipa] = vector

    # Report values that were added to the lookup but not to value_to_index.
    for values in feat_to_val_set.values():
        for value in values:
            if value not in value_to_index:
                print(f"Unknown feature value in featureset! {value}")

    return phone_to_vector
def generate_phone_to_id_lookup():
    """
    Map every symbol of the feature lookup to an integer ID.

    IDs are the 0-based positions of the symbols in sorted order (Python
    sorts str by code point), so the result is the same on every run for the
    same table.

    Careful: the IDs are position-dependent. Adding a symbol to the lookup
    that sorts before existing ones shifts the IDs of everything after it.

    Returns:
        dict mapping symbol -> int ID.
    """
    ipa_to_phonemefeats = generate_feature_lookup()
    return {
        symbol: index
        for index, symbol in enumerate(sorted(ipa_to_phonemefeats))
    }
if __name__ == '__main__':
    # When run as a script, print the symbol -> ID table.
    phone_ids = generate_phone_to_id_lookup()
    print(phone_ids)