seba3y committed on
Commit
25e2c30
1 Parent(s): c13f0a5

Upload 3 files

Browse files
Files changed (2) hide show
  1. logic.py +10 -9
  2. scoring.py +34 -60
logic.py CHANGED
@@ -1,7 +1,7 @@
1
  from phonemizer.separator import Separator
2
  from phonemizer import phonemize
3
  # from phonemizer.backend.espeak.wrapper import EspeakWrapper
4
- from Levenshtein import distance as levenshtein_distance
5
  from scoring import calculate_fluency_and_pronunciation
6
 
7
  import whisper
@@ -28,13 +28,14 @@ def rate_pronunciation(expected_phonemes, actual_phonemes):
28
  results = []
29
  for i, base_word in enumerate(actual_phonemes):
30
  best_dist = float('inf')
31
- error_threshold = len(base_word) * 0.45
32
- for pred_word_id in range(max(0, i-2), i + min(6, len(expected_phonemes) - i)):
33
- dist = levenshtein_distance(expected_phonemes[pred_word_id], base_word,)
34
- if dist < best_dist:
35
- best_dist = dist
36
- if best_dist == 0: # Early stopping on perfect match
37
- break
 
38
  if best_dist == 0:
39
  results.append(3)
40
  elif best_dist <= error_threshold:
@@ -52,7 +53,7 @@ def Speaker_speech_analysis(audio_path, text):
52
  transcribtion = text2phoneme(pre_transcribtion)
53
  text_phone = text2phoneme(text)
54
  scores = rate_pronunciation(transcribtion, text_phone)
55
- FP_scores = calculate_fluency_and_pronunciation(audio_path, pre_transcribtion, scores, len(text.split()))
56
  word_scores = [(word, s) for word, s in zip(text.split(), scores)]
57
 
58
  FP_scores['word_scores'] = word_scores
 
1
  from phonemizer.separator import Separator
2
  from phonemizer import phonemize
3
  # from phonemizer.backend.espeak.wrapper import EspeakWrapper
4
+ from Levenshtein import distance as levenshtein_distance
5
  from scoring import calculate_fluency_and_pronunciation
6
 
7
  import whisper
 
28
  results = []
29
  for i, base_word in enumerate(actual_phonemes):
30
  best_dist = float('inf')
31
+ if i <= len(expected_phonemes):
32
+ for j in range(max(0, i-1), i + min(3, len(expected_phonemes) - i)):
33
+ dist = levenshtein_distance(expected_phonemes[j], base_word,)
34
+ if dist < best_dist:
35
+ best_dist = dist
36
+ if best_dist == 0: # Early stopping on perfect match
37
+ break
38
+ error_threshold = len(base_word) * 0.40
39
  if best_dist == 0:
40
  results.append(3)
41
  elif best_dist <= error_threshold:
 
53
  transcribtion = text2phoneme(pre_transcribtion)
54
  text_phone = text2phoneme(text)
55
  scores = rate_pronunciation(transcribtion, text_phone)
56
+ FP_scores = calculate_fluency_and_pronunciation(audio_path, len(pre_transcribtion.split()), scores, len(text.split()))
57
  word_scores = [(word, s) for word, s in zip(text.split(), scores)]
58
 
59
  FP_scores['word_scores'] = word_scores
scoring.py CHANGED
@@ -2,26 +2,19 @@ import numpy as np
2
  import librosa
3
 
4
  def calculate_expected_value(scores):
5
- """
6
- Calculate the expected value for a list of outcomes (scores), assuming each unique score
7
- occurs with a frequency proportional to its count in the list.
8
-
9
- :param scores: List of outcomes (numeric values).
10
- :return: The expected value (a weighted average of all possible outcomes).
11
- """
12
  # First calculate the probability of each unique score
13
  unique_scores, counts = np.unique(scores, return_counts=True)
14
  probabilities = counts / len(scores)
15
 
16
- # Then calculate the expected value as the sum of scores times their probabilities
17
  expected_value = np.dot(unique_scores, probabilities)
18
  return expected_value
19
 
20
 
21
- def calculate_fluency_score(audio_path, transcription, word_pronunciation_scores, base_script_len):
22
- total_words = len(transcription.split())
23
  avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)
24
- if (total_words / base_script_len) < 0.15 or avg_pronunciation_score < 1.3:
25
  return 10
26
  audio, sr = librosa.load(audio_path)
27
  non_silent_intervals = librosa.effects.split(audio, top_db=22)
@@ -29,28 +22,25 @@ def calculate_fluency_score(audio_path, transcription, word_pronunciation_scores
29
 
30
  total_duration = len(audio) / sr
31
 
32
-
33
- non_silent_duration = non_silent_duration if total_words > 4 else 0
34
  ideal_min_rate, ideal_max_rate = 120 / 60, 140 / 60
35
  actual_speech_rate = (total_words / (non_silent_duration + 1e-7)) * (total_words / base_script_len)
36
  speaking_ratio = non_silent_duration / total_duration
37
  # Existing speech rate score calculation
38
 
39
  # Determine if speech rate is within the ideal range
40
- if ideal_min_rate <= actual_speech_rate <= ideal_max_rate:
41
- # Within the ideal range
42
- speech_rate_score = 1
 
 
 
43
  else:
44
- # Outside the ideal range, score is proportional to how close it is to the range
45
- if actual_speech_rate < ideal_min_rate:
46
- # Too slow
47
- speech_rate_score = actual_speech_rate / ideal_min_rate
48
- else:
49
- # Too fast
50
- speech_rate_score = 2 - (actual_speech_rate / ideal_max_rate)
51
- # Clamp the score between 0 and 1
52
- speech_rate_score = max(0, min(speech_rate_score, 1))
53
-
54
  # If speaking ratio is significantly less than the gold standard, reduce the fluency score
55
  gold_standard_ratio = 0.9 # Assuming 90% speaking time is gold standard for natural speech
56
  speaking_ratio_score = min(speaking_ratio / gold_standard_ratio, 1)
@@ -58,61 +48,45 @@ def calculate_fluency_score(audio_path, transcription, word_pronunciation_scores
58
 
59
  # Pronunciation score calculation
60
  avg_pronunciation_score = (avg_pronunciation_score - 1) / 2
61
- pronunciation_variance = np.var(word_pronunciation_scores, ddof=1,)
 
62
 
63
  # Weighted combination of scores
64
  # Adjust weights as needed
65
- weight_speech_rate = 0.20
66
  weight_speaking_ratio = 0.20
67
  weight_pronunciation = 0.50
68
- weight_pronunciation_variance = 0.10
69
 
70
- combined_score = (speech_rate_score * weight_speech_rate +
71
- speaking_ratio_score * weight_speaking_ratio +
72
- avg_pronunciation_score * weight_pronunciation +
73
- (1 / (1 + pronunciation_variance)) * weight_pronunciation_variance)
74
 
75
  # Scale the combined score to be between 10% and 100%
76
  scaled_fluency_score = 10 + combined_score * 80
77
 
78
  return scaled_fluency_score
79
 
80
- def calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len):
81
- if len(word_pronunciation_scores) / base_script_len < 0.25:
82
- return 10
83
  # Calculate average word pronunciation score
84
  avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)
85
- print(avg_pronunciation_score)
86
- # Adjust pronunciation score based on fluency
87
- # fluency_score = fluency_score / 100
88
- # This is a simplistic adjustment. It can be refined based on more detailed analysis
89
- fluency_adjustment = fluency_score / 100
90
- print(fluency_adjustment)
91
- adjusted_pronunciation_score = avg_pronunciation_score * fluency_adjustment
92
- print(adjusted_pronunciation_score)
93
- # Map to 0-5 scale based on score guide
94
- # These thresholds can be adjusted based on empirical data or further analysis
95
- if adjusted_pronunciation_score >= 2.4:
96
- score_guide_level = 5
97
- elif adjusted_pronunciation_score >= 1.7:
98
- score_guide_level = 4
99
- elif adjusted_pronunciation_score >= 1.0:
100
- score_guide_level = 3
101
- elif adjusted_pronunciation_score >= 0.5:
102
- score_guide_level = 2
103
- else:
104
- score_guide_level = 1
105
 
 
 
 
 
 
 
106
  # Scale to 10% - 90%
107
- final_score = 10 + (score_guide_level - 1) * 20 # Scale each level to a range of 20%
108
 
109
  return final_score
110
 
111
- def calculate_fluency_and_pronunciation(audio_path, transcription, word_pronunciation_scores, base_script_len):
112
 
113
- fluency_score = calculate_fluency_score(audio_path, transcription, word_pronunciation_scores, base_script_len)
114
 
115
- pronunciation_accuracy = calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len)
116
 
117
  return {'fluency_score': fluency_score, 'pronunciation_accuracy': pronunciation_accuracy}
118
 
 
2
  import librosa
3
 
4
def calculate_expected_value(scores):
    """
    Return the expected value of *scores*: each unique score weighted by
    its relative frequency in the list.

    Note: since the probability of each unique score is count / len(scores),
    the dot product below is algebraically the plain arithmetic mean of
    *scores* — kept in this form to make the probability weighting explicit.

    :param scores: non-empty list (or array) of numeric outcomes.
    :return: the probability-weighted average of the unique scores (float).
    """
    # First calculate the probability of each unique score
    unique_scores, counts = np.unique(scores, return_counts=True)
    probabilities = counts / len(scores)

    # Then calculate the expected value as the sum of scores times their probabilities
    expected_value = np.dot(unique_scores, probabilities)
    return expected_value
12
 
13
 
14
+ def calculate_fluency_score(audio_path, total_words, word_pronunciation_scores, base_script_len):
15
+
16
  avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)
17
+ if (total_words / base_script_len) < 0.15 or avg_pronunciation_score < 1.5:
18
  return 10
19
  audio, sr = librosa.load(audio_path)
20
  non_silent_intervals = librosa.effects.split(audio, top_db=22)
 
22
 
23
  total_duration = len(audio) / sr
24
 
25
+ non_silent_duration = non_silent_duration
 
26
  ideal_min_rate, ideal_max_rate = 120 / 60, 140 / 60
27
  actual_speech_rate = (total_words / (non_silent_duration + 1e-7)) * (total_words / base_script_len)
28
  speaking_ratio = non_silent_duration / total_duration
29
  # Existing speech rate score calculation
30
 
31
  # Determine if speech rate is within the ideal range
32
+ if actual_speech_rate <= ideal_max_rate:
33
+ # Within the ideal range or speaking slow
34
+ max_ratio = actual_speech_rate / ideal_max_rate
35
+ min_ratio = (actual_speech_rate / ideal_min_rate)
36
+ speech_rate_score = np.mean([max_ratio, min_ratio]) - 0.167
37
+ # for normal speaking speech_rate_score between (0.708, 1) and for slow speaking speech_rate_score (0.707, 0)
38
  else:
39
+ # Too fast
40
+ # for fast speaking speech_rate_score (0.707, 0)
41
+ max_ratio = actual_speech_rate / ideal_max_rate
42
+ speech_rate_score = 0.7 / max_ratio
43
+
 
 
 
 
 
44
  # If speaking ratio is significantly less than the gold standard, reduce the fluency score
45
  gold_standard_ratio = 0.9 # Assuming 90% speaking time is gold standard for natural speech
46
  speaking_ratio_score = min(speaking_ratio / gold_standard_ratio, 1)
 
48
 
49
  # Pronunciation score calculation
50
  avg_pronunciation_score = (avg_pronunciation_score - 1) / 2
51
+
52
+ # pronunciation_variance = np.var(word_pronunciation_scores, ddof=1,)
53
 
54
  # Weighted combination of scores
55
  # Adjust weights as needed
56
+ weight_speech_rate = 0.30
57
  weight_speaking_ratio = 0.20
58
  weight_pronunciation = 0.50
59
+ # weight_pronunciation_variance = 0.10
60
 
61
+ combined_score = speech_rate_score * weight_speech_rate + speaking_ratio_score * weight_speaking_ratio + avg_pronunciation_score * weight_pronunciation
 
 
 
62
 
63
  # Scale the combined score to be between 10% and 100%
64
  scaled_fluency_score = 10 + combined_score * 80
65
 
66
  return scaled_fluency_score
67
 
68
def calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len, total_words):
    """
    Combine per-word pronunciation scores with the fluency score into a
    single pronunciation-accuracy percentage.

    :param word_pronunciation_scores: per-word scores; the (x - 1) / 2
        normalization below assumes they lie on a 1-3 scale — TODO confirm
        against rate_pronunciation.
    :param fluency_score: fluency score on a 0-100 scale.
    :param base_script_len: word count of the reference script (currently
        unused; kept for the disabled coverage guard below).
    :param total_words: word count of the transcription (currently unused).
    :return: pronunciation accuracy in roughly the 10-100 range.
    """
    # Coverage guard deliberately disabled in this revision.
    # if total_words / base_script_len < 0.25:
    #     return 10
    # Calculate average word pronunciation score
    avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)

    # Normalize fluency from 0-100 down to 0-1.
    fluency_score = fluency_score / 100

    # Map the assumed 1-3 pronunciation scale onto 0-1.
    avg_pronunciation_score = (avg_pronunciation_score - 1) / 2
    avg_weight = 0.8
    flu_weight = 0.2
    combined_score = avg_weight * avg_pronunciation_score + flu_weight * fluency_score
    # Scale to 10% - 100% (combined_score is in [0, 1], so 10 + 90 * it
    # spans 10..100; the previous "10% - 90%" comment was stale).
    final_score = 10 + combined_score * 90

    return final_score
84
 
85
def calculate_fluency_and_pronunciation(audio_path, total_words, word_pronunciation_scores, base_script_len):
    """
    Score a recording for fluency and pronunciation accuracy.

    Computes the fluency score first, then feeds it into the pronunciation
    accuracy calculation (pronunciation is fluency-adjusted).

    :param audio_path: path to the audio file to analyze.
    :param total_words: word count of the recognized transcription.
    :param word_pronunciation_scores: per-word pronunciation scores.
    :param base_script_len: word count of the reference script.
    :return: dict with keys 'fluency_score' and 'pronunciation_accuracy'.
    """
    fluency_score = calculate_fluency_score(audio_path, total_words, word_pronunciation_scores, base_script_len)

    pronunciation_accuracy = calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len, total_words)

    return {'fluency_score': fluency_score, 'pronunciation_accuracy': pronunciation_accuracy}
92