Spaces:
Running
Running
Upload 3 files
Browse files- logic.py +10 -9
- scoring.py +34 -60
logic.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
from phonemizer.separator import Separator
|
2 |
from phonemizer import phonemize
|
3 |
# from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
4 |
-
from Levenshtein import distance as levenshtein_distance
|
5 |
from scoring import calculate_fluency_and_pronunciation
|
6 |
|
7 |
import whisper
|
@@ -28,13 +28,14 @@ def rate_pronunciation(expected_phonemes, actual_phonemes):
|
|
28 |
results = []
|
29 |
for i, base_word in enumerate(actual_phonemes):
|
30 |
best_dist = float('inf')
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
38 |
if best_dist == 0:
|
39 |
results.append(3)
|
40 |
elif best_dist <= error_threshold:
|
@@ -52,7 +53,7 @@ def Speaker_speech_analysis(audio_path, text):
|
|
52 |
transcribtion = text2phoneme(pre_transcribtion)
|
53 |
text_phone = text2phoneme(text)
|
54 |
scores = rate_pronunciation(transcribtion, text_phone)
|
55 |
-
FP_scores = calculate_fluency_and_pronunciation(audio_path, pre_transcribtion, scores, len(text.split()))
|
56 |
word_scores = [(word, s) for word, s in zip(text.split(), scores)]
|
57 |
|
58 |
FP_scores['word_scores'] = word_scores
|
|
|
1 |
from phonemizer.separator import Separator
|
2 |
from phonemizer import phonemize
|
3 |
# from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
4 |
+
from Levenshtein import distance as levenshtein_distance
|
5 |
from scoring import calculate_fluency_and_pronunciation
|
6 |
|
7 |
import whisper
|
|
|
28 |
results = []
|
29 |
for i, base_word in enumerate(actual_phonemes):
|
30 |
best_dist = float('inf')
|
31 |
+
if i <= len(expected_phonemes):
|
32 |
+
for j in range(max(0, i-1), i + min(3, len(expected_phonemes) - i)):
|
33 |
+
dist = levenshtein_distance(expected_phonemes[j], base_word,)
|
34 |
+
if dist < best_dist:
|
35 |
+
best_dist = dist
|
36 |
+
if best_dist == 0: # Early stopping on perfect match
|
37 |
+
break
|
38 |
+
error_threshold = len(base_word) * 0.40
|
39 |
if best_dist == 0:
|
40 |
results.append(3)
|
41 |
elif best_dist <= error_threshold:
|
|
|
53 |
transcribtion = text2phoneme(pre_transcribtion)
|
54 |
text_phone = text2phoneme(text)
|
55 |
scores = rate_pronunciation(transcribtion, text_phone)
|
56 |
+
FP_scores = calculate_fluency_and_pronunciation(audio_path, len(pre_transcribtion.split()), scores, len(text.split()))
|
57 |
word_scores = [(word, s) for word, s in zip(text.split(), scores)]
|
58 |
|
59 |
FP_scores['word_scores'] = word_scores
|
scoring.py
CHANGED
@@ -2,26 +2,19 @@ import numpy as np
|
|
2 |
import librosa
|
3 |
|
4 |
def calculate_expected_value(scores):
|
5 |
-
"""
|
6 |
-
Calculate the expected value for a list of outcomes (scores), assuming each unique score
|
7 |
-
occurs with a frequency proportional to its count in the list.
|
8 |
-
|
9 |
-
:param scores: List of outcomes (numeric values).
|
10 |
-
:return: The expected value (a weighted average of all possible outcomes).
|
11 |
-
"""
|
12 |
# First calculate the probability of each unique score
|
13 |
unique_scores, counts = np.unique(scores, return_counts=True)
|
14 |
probabilities = counts / len(scores)
|
15 |
|
16 |
-
# Then calculate the expected value as the sum of scores times their probabilities
|
17 |
expected_value = np.dot(unique_scores, probabilities)
|
18 |
return expected_value
|
19 |
|
20 |
|
21 |
-
def calculate_fluency_score(audio_path,
|
22 |
-
|
23 |
avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)
|
24 |
-
if (total_words / base_script_len) < 0.15 or avg_pronunciation_score < 1.
|
25 |
return 10
|
26 |
audio, sr = librosa.load(audio_path)
|
27 |
non_silent_intervals = librosa.effects.split(audio, top_db=22)
|
@@ -29,28 +22,25 @@ def calculate_fluency_score(audio_path, transcription, word_pronunciation_scores
|
|
29 |
|
30 |
total_duration = len(audio) / sr
|
31 |
|
32 |
-
|
33 |
-
non_silent_duration = non_silent_duration if total_words > 4 else 0
|
34 |
ideal_min_rate, ideal_max_rate = 120 / 60, 140 / 60
|
35 |
actual_speech_rate = (total_words / (non_silent_duration + 1e-7)) * (total_words / base_script_len)
|
36 |
speaking_ratio = non_silent_duration / total_duration
|
37 |
# Existing speech rate score calculation
|
38 |
|
39 |
# Determine if speech rate is within the ideal range
|
40 |
-
if
|
41 |
-
# Within the ideal range
|
42 |
-
|
|
|
|
|
|
|
43 |
else:
|
44 |
-
#
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
# Too fast
|
50 |
-
speech_rate_score = 2 - (actual_speech_rate / ideal_max_rate)
|
51 |
-
# Clamp the score between 0 and 1
|
52 |
-
speech_rate_score = max(0, min(speech_rate_score, 1))
|
53 |
-
|
54 |
# If speaking ratio is significantly less than the gold standard, reduce the fluency score
|
55 |
gold_standard_ratio = 0.9 # Assuming 90% speaking time is gold standard for natural speech
|
56 |
speaking_ratio_score = min(speaking_ratio / gold_standard_ratio, 1)
|
@@ -58,61 +48,45 @@ def calculate_fluency_score(audio_path, transcription, word_pronunciation_scores
|
|
58 |
|
59 |
# Pronunciation score calculation
|
60 |
avg_pronunciation_score = (avg_pronunciation_score - 1) / 2
|
61 |
-
|
|
|
62 |
|
63 |
# Weighted combination of scores
|
64 |
# Adjust weights as needed
|
65 |
-
weight_speech_rate = 0.
|
66 |
weight_speaking_ratio = 0.20
|
67 |
weight_pronunciation = 0.50
|
68 |
-
weight_pronunciation_variance = 0.10
|
69 |
|
70 |
-
combined_score =
|
71 |
-
speaking_ratio_score * weight_speaking_ratio +
|
72 |
-
avg_pronunciation_score * weight_pronunciation +
|
73 |
-
(1 / (1 + pronunciation_variance)) * weight_pronunciation_variance)
|
74 |
|
75 |
# Scale the combined score to be between 10% and 100%
|
76 |
scaled_fluency_score = 10 + combined_score * 80
|
77 |
|
78 |
return scaled_fluency_score
|
79 |
|
80 |
-
def calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len):
|
81 |
-
if
|
82 |
-
|
83 |
# Calculate average word pronunciation score
|
84 |
avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)
|
85 |
-
print(avg_pronunciation_score)
|
86 |
-
# Adjust pronunciation score based on fluency
|
87 |
-
# fluency_score = fluency_score / 100
|
88 |
-
# This is a simplistic adjustment. It can be refined based on more detailed analysis
|
89 |
-
fluency_adjustment = fluency_score / 100
|
90 |
-
print(fluency_adjustment)
|
91 |
-
adjusted_pronunciation_score = avg_pronunciation_score * fluency_adjustment
|
92 |
-
print(adjusted_pronunciation_score)
|
93 |
-
# Map to 0-5 scale based on score guide
|
94 |
-
# These thresholds can be adjusted based on empirical data or further analysis
|
95 |
-
if adjusted_pronunciation_score >= 2.4:
|
96 |
-
score_guide_level = 5
|
97 |
-
elif adjusted_pronunciation_score >= 1.7:
|
98 |
-
score_guide_level = 4
|
99 |
-
elif adjusted_pronunciation_score >= 1.0:
|
100 |
-
score_guide_level = 3
|
101 |
-
elif adjusted_pronunciation_score >= 0.5:
|
102 |
-
score_guide_level = 2
|
103 |
-
else:
|
104 |
-
score_guide_level = 1
|
105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
# Scale to 10% - 90%
|
107 |
-
final_score = 10 +
|
108 |
|
109 |
return final_score
|
110 |
|
111 |
-
def calculate_fluency_and_pronunciation(audio_path,
|
112 |
|
113 |
-
fluency_score = calculate_fluency_score(audio_path,
|
114 |
|
115 |
-
pronunciation_accuracy = calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len)
|
116 |
|
117 |
return {'fluency_score': fluency_score, 'pronunciation_accuracy': pronunciation_accuracy}
|
118 |
|
|
|
2 |
import librosa
|
3 |
|
4 |
def calculate_expected_value(scores):
    """Return the expected value (probability-weighted mean) of ``scores``.

    Each unique score is weighted by its relative frequency in the list,
    so the result equals the arithmetic mean of the outcomes.

    :param scores: List of numeric outcomes (e.g. per-word ratings).
    :return: The expected value as a NumPy float.
    """
    # First calculate the probability of each unique score.
    unique_scores, counts = np.unique(scores, return_counts=True)
    probabilities = counts / len(scores)

    # Then the expected value is the sum of scores times their probabilities.
    expected_value = np.dot(unique_scores, probabilities)
    return expected_value
|
12 |
|
13 |
|
14 |
+
def calculate_fluency_score(audio_path, total_words, word_pronunciation_scores, base_script_len):
|
15 |
+
|
16 |
avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)
|
17 |
+
if (total_words / base_script_len) < 0.15 or avg_pronunciation_score < 1.5:
|
18 |
return 10
|
19 |
audio, sr = librosa.load(audio_path)
|
20 |
non_silent_intervals = librosa.effects.split(audio, top_db=22)
|
|
|
22 |
|
23 |
total_duration = len(audio) / sr
|
24 |
|
25 |
+
non_silent_duration = non_silent_duration
|
|
|
26 |
ideal_min_rate, ideal_max_rate = 120 / 60, 140 / 60
|
27 |
actual_speech_rate = (total_words / (non_silent_duration + 1e-7)) * (total_words / base_script_len)
|
28 |
speaking_ratio = non_silent_duration / total_duration
|
29 |
# Existing speech rate score calculation
|
30 |
|
31 |
# Determine if speech rate is within the ideal range
|
32 |
+
if actual_speech_rate <= ideal_max_rate:
|
33 |
+
# Within the ideal range or speaking slow
|
34 |
+
max_ratio = actual_speech_rate / ideal_max_rate
|
35 |
+
min_ratio = (actual_speech_rate / ideal_min_rate)
|
36 |
+
speech_rate_score = np.mean([max_ratio, min_ratio]) - 0.167
|
37 |
+
# for normal speaking speech_rate_score between (0.708, 1) and for slow speaking speech_rate_score (0.707, 0)
|
38 |
else:
|
39 |
+
# Too fast
|
40 |
+
# for fast speaking speech_rate_score (0.707, 0)
|
41 |
+
max_ratio = actual_speech_rate / ideal_max_rate
|
42 |
+
speech_rate_score = 0.7 / max_ratio
|
43 |
+
|
|
|
|
|
|
|
|
|
|
|
44 |
# If speaking ratio is significantly less than the gold standard, reduce the fluency score
|
45 |
gold_standard_ratio = 0.9 # Assuming 90% speaking time is gold standard for natural speech
|
46 |
speaking_ratio_score = min(speaking_ratio / gold_standard_ratio, 1)
|
|
|
48 |
|
49 |
# Pronunciation score calculation
|
50 |
avg_pronunciation_score = (avg_pronunciation_score - 1) / 2
|
51 |
+
|
52 |
+
# pronunciation_variance = np.var(word_pronunciation_scores, ddof=1,)
|
53 |
|
54 |
# Weighted combination of scores
|
55 |
# Adjust weights as needed
|
56 |
+
weight_speech_rate = 0.30
|
57 |
weight_speaking_ratio = 0.20
|
58 |
weight_pronunciation = 0.50
|
59 |
+
# weight_pronunciation_variance = 0.10
|
60 |
|
61 |
+
combined_score = speech_rate_score * weight_speech_rate + speaking_ratio_score * weight_speaking_ratio + avg_pronunciation_score * weight_pronunciation
|
|
|
|
|
|
|
62 |
|
63 |
# Scale the combined score to be between 10% and 100%
|
64 |
scaled_fluency_score = 10 + combined_score * 80
|
65 |
|
66 |
return scaled_fluency_score
|
67 |
|
68 |
+
def calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len, total_words):
    """Combine per-word pronunciation ratings with fluency into a percentage.

    :param word_pronunciation_scores: Per-word ratings; presumably on the
        1-3 scale produced upstream — TODO confirm against rate_pronunciation.
    :param fluency_score: Fluency score on a 0-100 scale.
    :param base_script_len: Word count of the reference script (only used by
        the disabled short-answer guard below).
    :param total_words: Word count of the transcription (only used by the
        disabled short-answer guard below).
    :return: Pronunciation accuracy as a percentage.
    """
    # NOTE(review): short-answer guard was disabled here; kept for reference.
    # if total_words / base_script_len < 0.25:
    #     return 10

    # Calculate average word pronunciation score.
    avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)

    # Normalise fluency from 0-100 down to 0-1 so both terms share a scale.
    fluency_score = fluency_score / 100

    # Map the average rating from the 1-3 scale onto 0-1.
    avg_pronunciation_score = (avg_pronunciation_score - 1) / 2
    avg_weight = 0.8
    flu_weight = 0.2
    combined_score = avg_weight * avg_pronunciation_score + flu_weight * fluency_score

    # Offset + scale: with combined_score in [0, 1] this yields 10-100
    # (the previous "10% - 90%" comment understated the upper bound).
    final_score = 10 + combined_score * 90

    return final_score
|
84 |
|
85 |
+
def calculate_fluency_and_pronunciation(audio_path, total_words, word_pronunciation_scores, base_script_len):
    """Score a spoken response on fluency and pronunciation.

    :param audio_path: Path to the recorded audio file.
    :param total_words: Word count of the transcription.
    :param word_pronunciation_scores: Per-word pronunciation ratings.
    :param base_script_len: Word count of the reference script.
    :return: Dict with ``fluency_score`` and ``pronunciation_accuracy``.
    """
    fluency = calculate_fluency_score(
        audio_path, total_words, word_pronunciation_scores, base_script_len
    )
    # Pronunciation accuracy is adjusted by the fluency result above.
    pronunciation = calculate_pronunciation_accuracy(
        word_pronunciation_scores, fluency, base_script_len, total_words
    )
    return {
        'fluency_score': fluency,
        'pronunciation_accuracy': pronunciation,
    }
|
92 |
|