thanhhungtakeshi's picture
Using Levenshtein distance for the similarity score
7423626
import Levenshtein
# Lookup table from ARPABET phoneme symbols (upper case, stress digit already
# removed) to their IPA spelling. Keys are kept in alphabetical order; the
# trailing comment on each entry is an example word containing the sound.
ARPABET_TO_IPA = {
    "AA": "ɑ",    # father
    "AE": "æ",    # cat
    "AH": "ʌ",    # strut
    "AO": "ɔ",    # thought
    "AW": "aʊ",   # now
    "AY": "aɪ",   # my
    "B": "b",     # buy
    "CH": "tʃ",   # chair
    "D": "d",     # day
    "DH": "ð",    # this
    "EH": "ɛ",    # bed
    "ER": "ɝ",    # bird (rhotic); could also map to ɜː in non-rhotic
    "EY": "eɪ",   # face
    "F": "f",     # fee
    "G": "ɡ",     # go
    "HH": "h",    # he
    "IH": "ɪ",    # sit
    "IY": "iː",   # seat (written with the length mark)
    "JH": "dʒ",   # joy
    "K": "k",     # key
    "L": "l",     # lay
    "M": "m",     # me
    "N": "n",     # no
    "NG": "ŋ",    # sing
    "OW": "oʊ",   # goat
    "OY": "ɔɪ",   # boy
    "P": "p",     # pay
    "R": "ɹ",     # red
    "S": "s",     # see
    "SH": "ʃ",    # she
    "T": "t",     # tea
    "TH": "θ",    # thin
    "UH": "ʊ",    # foot
    "UW": "uː",   # goose (written with the length mark)
    "V": "v",     # vow
    "W": "w",     # we
    "Y": "j",     # yes
    "Z": "z",     # zoo
    "ZH": "ʒ",    # vision
}
def arpabet_to_ipa_seq(arpabet_seq):
    """Translate a sequence of ARPABET symbols into IPA symbols.

    Args:
        arpabet_seq: Iterable of ARPABET strings in any letter case, each
            optionally ending in stress digits (e.g. ``"UW1"``).

    Returns:
        list: One entry per input symbol. Symbols missing from
        ``ARPABET_TO_IPA`` are passed through exactly as they were given.
    """
    ipa_symbols = []
    for symbol in arpabet_seq:
        # Drop trailing stress markers (UW1 -> UW) and normalise the case so
        # the key matches the upper-case entries of the lookup table.
        lookup_key = symbol.rstrip("012").upper()
        ipa_symbols.append(ARPABET_TO_IPA.get(lookup_key, symbol))
    return ipa_symbols
def levenshtein_similarity_score(seq1, seq2):
    """Return a 0-100 similarity score derived from the Levenshtein distance.

    Both inputs are concatenated with ``"".join`` and compared character by
    character, so a symbol spelled with two characters (for example a
    diphthong) contributes two characters to the comparison.

    Args:
        seq1: Iterable of strings, e.g. a list of IPA symbols.
        seq2: Iterable of strings to compare against ``seq1``.

    Returns:
        int: ``100 * (1 - distance / max_len)`` rounded down, where
        ``max_len`` is the length of the longer joined string. Two empty
        inputs are treated as identical and score 100.
    """
    str1 = "".join(seq1)
    str2 = "".join(seq2)
    max_len = max(len(str1), len(str2))
    if max_len == 0:
        # Nothing to compare; also avoids dividing by zero below.
        return 100
    distance = Levenshtein.distance(str1, str2)
    # Stay in integer arithmetic: the float form int((1 - d / n) * 100)
    # truncates low on rounding error, e.g. 4 edits over 5 characters gave
    # int(19.999999999999996) == 19 instead of 20.
    return (max_len - distance) * 100 // max_len
if __name__ == "__main__":
    # Small demo: the attempt has one extra leading vowel compared with the
    # reference, i.e. a single insertion over four characters, so the
    # printed score is expected to be 75.
    # Sample ARPABET input for trying arpabet_to_ipa_seq by hand:
    #   ['ah', 'l', 'ow', 'ay', 'd', 'ow', 'n', 't', 'r', 'ih']
    reference_ipa = ["ð", "ɛ", "ɹ"]
    attempt_ipa = ["ʌ", "ð", "ɛ", "ɹ"]
    score = levenshtein_similarity_score(reference_ipa, attempt_ipa)
    print(score)