Spaces:
Running
Running
Upload 3 files
Browse files- logic.py +10 -9
- scoring.py +34 -60
logic.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
from phonemizer.separator import Separator
|
2 |
from phonemizer import phonemize
|
3 |
# from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
4 |
-
from Levenshtein import distance as levenshtein_distance
|
5 |
from scoring import calculate_fluency_and_pronunciation
|
6 |
|
7 |
import whisper
|
@@ -28,13 +28,14 @@ def rate_pronunciation(expected_phonemes, actual_phonemes):
|
|
28 |
results = []
|
29 |
for i, base_word in enumerate(actual_phonemes):
|
30 |
best_dist = float('inf')
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
38 |
if best_dist == 0:
|
39 |
results.append(3)
|
40 |
elif best_dist <= error_threshold:
|
@@ -52,7 +53,7 @@ def Speaker_speech_analysis(audio_path, text):
|
|
52 |
transcribtion = text2phoneme(pre_transcribtion)
|
53 |
text_phone = text2phoneme(text)
|
54 |
scores = rate_pronunciation(transcribtion, text_phone)
|
55 |
-
FP_scores = calculate_fluency_and_pronunciation(audio_path, pre_transcribtion, scores, len(text.split()))
|
56 |
word_scores = [(word, s) for word, s in zip(text.split(), scores)]
|
57 |
|
58 |
FP_scores['word_scores'] = word_scores
|
|
|
1 |
from phonemizer.separator import Separator
|
2 |
from phonemizer import phonemize
|
3 |
# from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
4 |
+
from Levenshtein import distance as levenshtein_distance
|
5 |
from scoring import calculate_fluency_and_pronunciation
|
6 |
|
7 |
import whisper
|
|
|
28 |
results = []
|
29 |
for i, base_word in enumerate(actual_phonemes):
|
30 |
best_dist = float('inf')
|
31 |
+
if i <= len(expected_phonemes):
|
32 |
+
for j in range(max(0, i-1), i + min(3, len(expected_phonemes) - i)):
|
33 |
+
dist = levenshtein_distance(expected_phonemes[j], base_word,)
|
34 |
+
if dist < best_dist:
|
35 |
+
best_dist = dist
|
36 |
+
if best_dist == 0: # Early stopping on perfect match
|
37 |
+
break
|
38 |
+
error_threshold = len(base_word) * 0.40
|
39 |
if best_dist == 0:
|
40 |
results.append(3)
|
41 |
elif best_dist <= error_threshold:
|
|
|
53 |
transcribtion = text2phoneme(pre_transcribtion)
|
54 |
text_phone = text2phoneme(text)
|
55 |
scores = rate_pronunciation(transcribtion, text_phone)
|
56 |
+
FP_scores = calculate_fluency_and_pronunciation(audio_path, len(pre_transcribtion.split()), scores, len(text.split()))
|
57 |
word_scores = [(word, s) for word, s in zip(text.split(), scores)]
|
58 |
|
59 |
FP_scores['word_scores'] = word_scores
|
scoring.py
CHANGED
@@ -2,26 +2,19 @@ import numpy as np
|
|
2 |
import librosa
|
3 |
|
4 |
def calculate_expected_value(scores):
|
5 |
-
"""
|
6 |
-
Calculate the expected value for a list of outcomes (scores), assuming each unique score
|
7 |
-
occurs with a frequency proportional to its count in the list.
|
8 |
-
|
9 |
-
:param scores: List of outcomes (numeric values).
|
10 |
-
:return: The expected value (a weighted average of all possible outcomes).
|
11 |
-
"""
|
12 |
# First calculate the probability of each unique score
|
13 |
unique_scores, counts = np.unique(scores, return_counts=True)
|
14 |
probabilities = counts / len(scores)
|
15 |
|
16 |
-
# Then calculate the expected value as the sum of scores times their probabilities
|
17 |
expected_value = np.dot(unique_scores, probabilities)
|
18 |
return expected_value
|
19 |
|
20 |
|
21 |
-
def calculate_fluency_score(audio_path,
|
22 |
-
|
23 |
avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)
|
24 |
-
if (total_words / base_script_len) < 0.15 or avg_pronunciation_score < 1.
|
25 |
return 10
|
26 |
audio, sr = librosa.load(audio_path)
|
27 |
non_silent_intervals = librosa.effects.split(audio, top_db=22)
|
@@ -29,28 +22,25 @@ def calculate_fluency_score(audio_path, transcription, word_pronunciation_scores
|
|
29 |
|
30 |
total_duration = len(audio) / sr
|
31 |
|
32 |
-
|
33 |
-
non_silent_duration = non_silent_duration if total_words > 4 else 0
|
34 |
ideal_min_rate, ideal_max_rate = 120 / 60, 140 / 60
|
35 |
actual_speech_rate = (total_words / (non_silent_duration + 1e-7)) * (total_words / base_script_len)
|
36 |
speaking_ratio = non_silent_duration / total_duration
|
37 |
# Existing speech rate score calculation
|
38 |
|
39 |
# Determine if speech rate is within the ideal range
|
40 |
-
if
|
41 |
-
# Within the ideal range
|
42 |
-
|
|
|
|
|
|
|
43 |
else:
|
44 |
-
#
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
# Too fast
|
50 |
-
speech_rate_score = 2 - (actual_speech_rate / ideal_max_rate)
|
51 |
-
# Clamp the score between 0 and 1
|
52 |
-
speech_rate_score = max(0, min(speech_rate_score, 1))
|
53 |
-
|
54 |
# If speaking ratio is significantly less than the gold standard, reduce the fluency score
|
55 |
gold_standard_ratio = 0.9 # Assuming 90% speaking time is gold standard for natural speech
|
56 |
speaking_ratio_score = min(speaking_ratio / gold_standard_ratio, 1)
|
@@ -58,61 +48,45 @@ def calculate_fluency_score(audio_path, transcription, word_pronunciation_scores
|
|
58 |
|
59 |
# Pronunciation score calculation
|
60 |
avg_pronunciation_score = (avg_pronunciation_score - 1) / 2
|
61 |
-
|
|
|
62 |
|
63 |
# Weighted combination of scores
|
64 |
# Adjust weights as needed
|
65 |
-
weight_speech_rate = 0.
|
66 |
weight_speaking_ratio = 0.20
|
67 |
weight_pronunciation = 0.50
|
68 |
-
weight_pronunciation_variance = 0.10
|
69 |
|
70 |
-
combined_score =
|
71 |
-
speaking_ratio_score * weight_speaking_ratio +
|
72 |
-
avg_pronunciation_score * weight_pronunciation +
|
73 |
-
(1 / (1 + pronunciation_variance)) * weight_pronunciation_variance)
|
74 |
|
75 |
# Scale the combined score to be between 10% and 100%
|
76 |
scaled_fluency_score = 10 + combined_score * 80
|
77 |
|
78 |
return scaled_fluency_score
|
79 |
|
80 |
-
def calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len):
|
81 |
-
if
|
82 |
-
|
83 |
# Calculate average word pronunciation score
|
84 |
avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)
|
85 |
-
print(avg_pronunciation_score)
|
86 |
-
# Adjust pronunciation score based on fluency
|
87 |
-
# fluency_score = fluency_score / 100
|
88 |
-
# This is a simplistic adjustment. It can be refined based on more detailed analysis
|
89 |
-
fluency_adjustment = fluency_score / 100
|
90 |
-
print(fluency_adjustment)
|
91 |
-
adjusted_pronunciation_score = avg_pronunciation_score * fluency_adjustment
|
92 |
-
print(adjusted_pronunciation_score)
|
93 |
-
# Map to 0-5 scale based on score guide
|
94 |
-
# These thresholds can be adjusted based on empirical data or further analysis
|
95 |
-
if adjusted_pronunciation_score >= 2.4:
|
96 |
-
score_guide_level = 5
|
97 |
-
elif adjusted_pronunciation_score >= 1.7:
|
98 |
-
score_guide_level = 4
|
99 |
-
elif adjusted_pronunciation_score >= 1.0:
|
100 |
-
score_guide_level = 3
|
101 |
-
elif adjusted_pronunciation_score >= 0.5:
|
102 |
-
score_guide_level = 2
|
103 |
-
else:
|
104 |
-
score_guide_level = 1
|
105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
# Scale to 10% - 90%
|
107 |
-
final_score = 10 +
|
108 |
|
109 |
return final_score
|
110 |
|
111 |
-
def calculate_fluency_and_pronunciation(audio_path,
|
112 |
|
113 |
-
fluency_score = calculate_fluency_score(audio_path,
|
114 |
|
115 |
-
pronunciation_accuracy = calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len)
|
116 |
|
117 |
return {'fluency_score': fluency_score, 'pronunciation_accuracy': pronunciation_accuracy}
|
118 |
|
|
|
2 |
import librosa
|
3 |
|
4 |
def calculate_expected_value(scores):
    """Return the expected value (probability-weighted mean) of ``scores``.

    Each unique score is weighted by its relative frequency in the list,
    so the result equals the arithmetic mean of the outcomes.

    :param scores: List of numeric outcomes (e.g. per-word ratings).
    :return: The expected value as a NumPy float.
    """
    # First calculate the probability of each unique score.
    unique_scores, counts = np.unique(scores, return_counts=True)
    probabilities = counts / len(scores)

    # Then the expected value is the sum of scores times their probabilities.
    expected_value = np.dot(unique_scores, probabilities)
    return expected_value
|
12 |
|
13 |
|
14 |
+
def calculate_fluency_score(audio_path, total_words, word_pronunciation_scores, base_script_len):
|
15 |
+
|
16 |
avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)
|
17 |
+
if (total_words / base_script_len) < 0.15 or avg_pronunciation_score < 1.5:
|
18 |
return 10
|
19 |
audio, sr = librosa.load(audio_path)
|
20 |
non_silent_intervals = librosa.effects.split(audio, top_db=22)
|
|
|
22 |
|
23 |
total_duration = len(audio) / sr
|
24 |
|
25 |
+
non_silent_duration = non_silent_duration
|
|
|
26 |
ideal_min_rate, ideal_max_rate = 120 / 60, 140 / 60
|
27 |
actual_speech_rate = (total_words / (non_silent_duration + 1e-7)) * (total_words / base_script_len)
|
28 |
speaking_ratio = non_silent_duration / total_duration
|
29 |
# Existing speech rate score calculation
|
30 |
|
31 |
# Determine if speech rate is within the ideal range
|
32 |
+
if actual_speech_rate <= ideal_max_rate:
|
33 |
+
# Within the ideal range or speaking slow
|
34 |
+
max_ratio = actual_speech_rate / ideal_max_rate
|
35 |
+
min_ratio = (actual_speech_rate / ideal_min_rate)
|
36 |
+
speech_rate_score = np.mean([max_ratio, min_ratio]) - 0.167
|
37 |
+
# for normal speaking speech_rate_score between (0.708, 1) and for slow speaking speech_rate_score (0.707, 0)
|
38 |
else:
|
39 |
+
# Too fast
|
40 |
+
# for fast speaking speech_rate_score (0.707, 0)
|
41 |
+
max_ratio = actual_speech_rate / ideal_max_rate
|
42 |
+
speech_rate_score = 0.7 / max_ratio
|
43 |
+
|
|
|
|
|
|
|
|
|
|
|
44 |
# If speaking ratio is significantly less than the gold standard, reduce the fluency score
|
45 |
gold_standard_ratio = 0.9 # Assuming 90% speaking time is gold standard for natural speech
|
46 |
speaking_ratio_score = min(speaking_ratio / gold_standard_ratio, 1)
|
|
|
48 |
|
49 |
# Pronunciation score calculation
|
50 |
avg_pronunciation_score = (avg_pronunciation_score - 1) / 2
|
51 |
+
|
52 |
+
# pronunciation_variance = np.var(word_pronunciation_scores, ddof=1,)
|
53 |
|
54 |
# Weighted combination of scores
|
55 |
# Adjust weights as needed
|
56 |
+
weight_speech_rate = 0.30
|
57 |
weight_speaking_ratio = 0.20
|
58 |
weight_pronunciation = 0.50
|
59 |
+
# weight_pronunciation_variance = 0.10
|
60 |
|
61 |
+
combined_score = speech_rate_score * weight_speech_rate + speaking_ratio_score * weight_speaking_ratio + avg_pronunciation_score * weight_pronunciation
|
|
|
|
|
|
|
62 |
|
63 |
# Scale the combined score to be between 10% and 100%
|
64 |
scaled_fluency_score = 10 + combined_score * 80
|
65 |
|
66 |
return scaled_fluency_score
|
67 |
|
68 |
+
def calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len, total_words):
    """Combine per-word pronunciation ratings with fluency into a percentage.

    :param word_pronunciation_scores: Per-word ratings; presumably on the
        1-3 scale produced upstream — TODO confirm against rate_pronunciation.
    :param fluency_score: Fluency score on a 0-100 scale.
    :param base_script_len: Word count of the reference script (only used by
        the disabled short-answer guard below).
    :param total_words: Word count of the transcription (only used by the
        disabled short-answer guard below).
    :return: Pronunciation accuracy as a percentage.
    """
    # NOTE(review): short-answer guard was disabled here; kept for reference.
    # if total_words / base_script_len < 0.25:
    #     return 10

    # Calculate average word pronunciation score.
    avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)

    # Normalise fluency from 0-100 down to 0-1 so both terms share a scale.
    fluency_score = fluency_score / 100

    # Map the average rating from the 1-3 scale onto 0-1.
    avg_pronunciation_score = (avg_pronunciation_score - 1) / 2
    avg_weight = 0.8
    flu_weight = 0.2
    combined_score = avg_weight * avg_pronunciation_score + flu_weight * fluency_score

    # Offset + scale: with combined_score in [0, 1] this yields 10-100
    # (the previous "10% - 90%" comment understated the upper bound).
    final_score = 10 + combined_score * 90

    return final_score
|
84 |
|
85 |
+
def calculate_fluency_and_pronunciation(audio_path, total_words, word_pronunciation_scores, base_script_len):
    """Score a spoken response on fluency and pronunciation.

    :param audio_path: Path to the recorded audio file.
    :param total_words: Word count of the transcription.
    :param word_pronunciation_scores: Per-word pronunciation ratings.
    :param base_script_len: Word count of the reference script.
    :return: Dict with ``fluency_score`` and ``pronunciation_accuracy``.
    """
    fluency = calculate_fluency_score(
        audio_path, total_words, word_pronunciation_scores, base_script_len
    )
    # Pronunciation accuracy is adjusted by the fluency result above.
    pronunciation = calculate_pronunciation_accuracy(
        word_pronunciation_scores, fluency, base_script_len, total_words
    )
    return {
        'fluency_score': fluency,
        'pronunciation_accuracy': pronunciation,
    }
|
92 |
|