thanhhungtakeshi committed on
Commit
7423626
·
1 Parent(s): 126a0a6

using levenshtein distance for similarity score

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. requirements.txt +10 -9
  3. utils.py +18 -10
app.py CHANGED
@@ -14,7 +14,7 @@ from g2p_en import G2p
14
  from minineedle import needle, smith, core
15
 
16
  from model import AlignmentRequest
17
- from utils import arpabet_to_ipa_seq, similarity_score
18
  from audio import decode_audio_bytes, preprocess_audio
19
 
20
  # Configure environment
 
14
  from minineedle import needle, smith, core
15
 
16
  from model import AlignmentRequest
17
+ from utils import arpabet_to_ipa_seq, levenshtein_similarity_score as similarity_score
18
  from audio import decode_audio_bytes, preprocess_audio
19
 
20
  # Configure environment
requirements.txt CHANGED
@@ -1,11 +1,12 @@
1
- fastapi
2
- uvicorn
3
- transformers
4
- huggingface-hub
5
- soundfile
6
- av
7
- numpy
8
- python-multipart
9
  protobuf
10
  minineedle==3.1.5
11
- g2p-en==2.1.0
 
 
1
+ fastapi==0.117.1
2
+ uvicorn==0.36.0
3
+ transformers==4.56.2
4
+ huggingface-hub==0.35.0
5
+ soundfile==0.13.1
6
+ av==15.1.0
7
+ numpy==2.3.3
8
+ python-multipart==0.0.20
9
  protobuf
10
  minineedle==3.1.5
11
+ g2p-en==2.1.0
12
+ python-Levenshtein==0.27.1
utils.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  ARPABET_TO_IPA = {
2
  "AA": "ɑ", # father
3
  "AE": "æ", # cat
@@ -46,18 +48,24 @@ def arpabet_to_ipa_seq(arpabet_seq):
46
  return [ARPABET_TO_IPA.get(sym.rstrip("012").upper(), sym) for sym in arpabet_seq]
47
 
48
 
49
- def similarity_score(list1, list2):
50
  """
51
- Calculate similarity score between two lists of characters (same length).
52
- Score = matches / total
53
  """
54
- if len(list1) != len(list2):
55
- return 0
56
-
57
- matches = sum(c1 == c2 for c1, c2 in zip(list1, list2))
58
- score = matches / len(list1)
59
- return int(round(score, 2) * 100)
 
 
 
 
60
 
61
 
62
  if __name__ == "__main__":
63
- print(arpabet_to_ipa_seq(['ah', 'l', 'ow', 'ay', 'd', 'ow', 'n', 't', 'r', 'ih']))
 
 
 
 
1
+ import Levenshtein
2
+
3
  ARPABET_TO_IPA = {
4
  "AA": "ɑ", # father
5
  "AE": "æ", # cat
 
48
  return [ARPABET_TO_IPA.get(sym.rstrip("012").upper(), sym) for sym in arpabet_seq]
49
 
50
 
51
def levenshtein_similarity_score(seq1, seq2):
    """
    Return a 0-100 similarity score for two symbol sequences.

    The score is ``100 * (1 - distance / max_len)`` truncated to an integer,
    where ``distance`` is the Levenshtein edit distance between the two
    sequences and ``max_len`` is the length of the longer one. Every element
    counts as exactly one symbol, so a multi-character token such as ``"ah"``
    costs a single edit instead of one edit per character.

    Args:
        seq1: Iterable of hashable symbols (e.g. a list of phoneme strings).
            A plain string is compared character by character.
        seq2: Iterable of hashable symbols to compare against ``seq1``.

    Returns:
        int: 100 for identical sequences (two empty sequences included),
        decreasing towards 0 as more edits are needed.
    """
    # Materialise the inputs so len() works for any iterable and so the
    # comparison happens per symbol rather than per character of a joined string.
    symbols1 = list(seq1)
    symbols2 = list(seq2)

    longest = max(len(symbols1), len(symbols2))
    if longest == 0:  # Both empty: identical, and avoids division by zero
        return 100

    distance = Levenshtein.distance(symbols1, symbols2)

    # Exact integer floor. The float form truncates rounding error, e.g.
    # int((1 - 9 / 10) * 100) evaluates to 9 instead of 10.
    return (longest - distance) * 100 // longest
65
 
66
 
67
  if __name__ == "__main__":
68
+ # print(arpabet_to_ipa_seq(['ah', 'l', 'ow', 'ay', 'd', 'ow', 'n', 't', 'r', 'ih']))
69
+ corrected_ipa = ["ð", "ɛ", "ɹ"]
70
+ user_ipa = ["ʌ", "ð", "ɛ", "ɹ"]
71
+ print(levenshtein_similarity_score(corrected_ipa, user_ipa))