Commit
·
7423626
1
Parent(s):
126a0a6
using levenshtein distance for similarity score
Browse files- app.py +1 -1
- requirements.txt +10 -9
- utils.py +18 -10
app.py
CHANGED
|
@@ -14,7 +14,7 @@ from g2p_en import G2p
|
|
| 14 |
from minineedle import needle, smith, core
|
| 15 |
|
| 16 |
from model import AlignmentRequest
|
| 17 |
-
from utils import arpabet_to_ipa_seq, similarity_score
|
| 18 |
from audio import decode_audio_bytes, preprocess_audio
|
| 19 |
|
| 20 |
# Configure environment
|
|
|
|
| 14 |
from minineedle import needle, smith, core
|
| 15 |
|
| 16 |
from model import AlignmentRequest
|
| 17 |
+
from utils import arpabet_to_ipa_seq, levenshtein_similarity_score as similarity_score
|
| 18 |
from audio import decode_audio_bytes, preprocess_audio
|
| 19 |
|
| 20 |
# Configure environment
|
requirements.txt
CHANGED
|
@@ -1,11 +1,12 @@
|
|
| 1 |
-
fastapi
|
| 2 |
-
uvicorn
|
| 3 |
-
transformers
|
| 4 |
-
huggingface-hub
|
| 5 |
-
soundfile
|
| 6 |
-
av
|
| 7 |
-
numpy
|
| 8 |
-
python-multipart
|
| 9 |
protobuf
|
| 10 |
minineedle==3.1.5
|
| 11 |
-
g2p-en==2.1.0
|
|
|
|
|
|
| 1 |
+
fastapi==0.117.1
|
| 2 |
+
uvicorn==0.36.0
|
| 3 |
+
transformers==4.56.2
|
| 4 |
+
huggingface-hub==0.35.0
|
| 5 |
+
soundfile==0.13.1
|
| 6 |
+
av==15.1.0
|
| 7 |
+
numpy==2.3.3
|
| 8 |
+
python-multipart==0.0.20
|
| 9 |
protobuf
|
| 10 |
minineedle==3.1.5
|
| 11 |
+
g2p-en==2.1.0
|
| 12 |
+
python-Levenshtein==0.27.1
|
utils.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
ARPABET_TO_IPA = {
|
| 2 |
"AA": "ɑ", # father
|
| 3 |
"AE": "æ", # cat
|
|
@@ -46,18 +48,24 @@ def arpabet_to_ipa_seq(arpabet_seq):
|
|
| 46 |
return [ARPABET_TO_IPA.get(sym.rstrip("012").upper(), sym) for sym in arpabet_seq]
|
| 47 |
|
| 48 |
|
| 49 |
-
def
|
| 50 |
"""
|
| 51 |
-
Calculate
|
| 52 |
-
Score = matches / total
|
| 53 |
"""
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
if __name__ == "__main__":
|
| 63 |
-
print(arpabet_to_ipa_seq(['ah', 'l', 'ow', 'ay', 'd', 'ow', 'n', 't', 'r', 'ih']))
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Levenshtein
|
| 2 |
+
|
| 3 |
ARPABET_TO_IPA = {
|
| 4 |
"AA": "ɑ", # father
|
| 5 |
"AE": "æ", # cat
|
|
|
|
| 48 |
return [ARPABET_TO_IPA.get(sym.rstrip("012").upper(), sym) for sym in arpabet_seq]
|
| 49 |
|
| 50 |
|
| 51 |
+
def levenshtein_similarity_score(seq1, seq2):
    """
    Return the Levenshtein similarity of two symbol sequences as a percentage.

    Each sequence is concatenated into a single string before comparison, so
    the edit distance is counted per character of the joined strings rather
    than per list element.

    Args:
        seq1: Iterable of strings (for example IPA symbols).
        seq2: Iterable of strings to compare against ``seq1``.

    Returns:
        int: ``floor(100 * (1 - distance / max_len))``, where ``max_len`` is
        the length of the longer joined string. Two empty inputs score 100.
    """
    str1 = "".join(seq1)
    str2 = "".join(seq2)
    max_len = max(len(str1), len(str2))
    if max_len == 0:  # Both inputs empty: identical, and avoids division by zero
        return 100
    distance = Levenshtein.distance(str1, str2)
    # Integer arithmetic keeps the floor exact. The float form
    # int((1 - 9 / 10) * 100) evaluates to 9 instead of 10 because
    # 1 - 0.9 is 0.09999999999999998 in binary floating point.
    return (max_len - distance) * 100 // max_len
|
| 65 |
|
| 66 |
|
| 67 |
if __name__ == "__main__":
    # Manual smoke check: the user's attempt carries one extra leading symbol
    # compared with the reference pronunciation.
    reference_symbols = ["ð", "ɛ", "ɹ"]
    attempt_symbols = ["ʌ", "ð", "ɛ", "ɹ"]
    demo_score = levenshtein_similarity_score(reference_symbols, attempt_symbols)
    print(demo_score)
|