Spaces:
Running
Running
Upload 5 files
Browse files- app.py +88 -21
- logic.py +14 -8
- requirements.txt +2 -1
- scoring.py +121 -0
app.py
CHANGED
@@ -1,22 +1,35 @@
|
|
1 |
import gradio as gr
|
2 |
-
from logic import
|
3 |
from scipy.io import wavfile
|
4 |
|
5 |
|
6 |
|
7 |
def create_html_from_scores(word_scores):
|
8 |
html_output = ''
|
9 |
-
|
10 |
-
# Ensure the number of words and scores match
|
11 |
-
|
12 |
for word, score in word_scores:
|
13 |
if score == 1:
|
14 |
-
html_output += f'<span style="color:
|
15 |
elif score == 2:
|
16 |
-
html_output += f'<span style="color:
|
17 |
else:
|
18 |
-
html_output += f'<span style="color:
|
19 |
return html_output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
|
22 |
|
@@ -27,8 +40,16 @@ def analyze_audio(text, audio):
|
|
27 |
wavfile.write(temp_filename, audio[0], audio[1])
|
28 |
|
29 |
|
30 |
-
result =
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
html_with_css = f"""
|
33 |
<style>
|
34 |
.legend {{
|
@@ -39,19 +60,62 @@ def analyze_audio(text, audio):
|
|
39 |
}}
|
40 |
|
41 |
.legend-dot {{
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
-
.good {{ color: #28a745; }}
|
49 |
-
.average {{ color: #ffc107; }}
|
50 |
-
.bad {{ color: #dc3545; }}
|
51 |
|
52 |
-
.text {{ font-size: 20px; }}
|
53 |
-
</style>
|
54 |
-
<h1> Word Pronunciation scores </h1>
|
55 |
<div class="legend">
|
56 |
<span class="legend-dot" style="background-color: #28a745;"></span><span>Good</span>
|
57 |
<span class="legend-dot" style="background-color: #ffc107;"></span><span>Understandable</span>
|
@@ -61,17 +125,20 @@ def analyze_audio(text, audio):
|
|
61 |
<p class="text">
|
62 |
{html_content}
|
63 |
</p>
|
|
|
|
|
|
|
64 |
"""
|
65 |
return html_with_css
|
66 |
|
67 |
# Define the Gradio interface
|
68 |
iface = gr.Interface(fn=analyze_audio,
|
69 |
inputs=[gr.Textbox(label='Training Text', placeholder='Write the text for pronunciation task', interactive=True, visible=True, show_copy_button=True,),
|
70 |
-
gr.Audio(label="Recoreded Audio")
|
71 |
],
|
72 |
outputs=[gr.HTML(label="Analysis of pronunciation"),
|
73 |
],
|
74 |
-
|
75 |
# title="Audio Analysis Tool",
|
76 |
description="Write any text and recored an audio to predict pronunciation erors"
|
77 |
)
|
|
|
1 |
import gradio as gr
|
2 |
+
from logic import Speaker_speech_analysis
|
3 |
from scipy.io import wavfile
|
4 |
|
5 |
|
6 |
|
7 |
def create_html_from_scores(word_scores):
    """Render (word, score) pairs as colour-coded HTML spans.

    Score 1 -> red (bad), 2 -> amber (understandable), anything else -> green (good).
    Each span is followed by a single space so the words read as a sentence.
    """
    # Colour lookup; any score other than 1 or 2 falls through to green.
    colour_for = {1: '#dc3545', 2: '#ffc107'}
    spans = []
    for word, score in word_scores:
        colour = colour_for.get(score, '#28a745')
        spans.append(f'<span style="color: {colour};">{word}</span> ')
    return ''.join(spans)
|
17 |
+
|
18 |
+
def generate_progress_bar(score, label):
    """Build the HTML for one labelled progress bar on a 0-90 scale.

    The displayed number is capped at "90"; the bar colour is red below 30,
    amber below 60, and green from 60 up. Width is the score as a fraction
    of the 90-point maximum.
    """
    score = round(score, 2)
    # Cap the printed value at the scale maximum.
    shown = "90" if score >= 90 else f"{score:.2f}"
    if score < 30:
        colour = "#dc3545"
    elif score < 60:
        colour = "#ffc107"
    else:
        colour = "#28a745"
    width = f"{(score / 90) * 100}%"
    return f"""
    <div class="progress-label">{label}:</div>
    <div class="progress-container">
        <div class="progress-bar" style="width: {width}; background-color: {colour};">
            <div class="progress-score">{shown}</div>
        </div>
    </div>
    <div class="progress-max">Max: 90</div>
    """
|
32 |
+
# CSS to be used in the Gradio Interface
|
33 |
|
34 |
|
35 |
|
|
|
40 |
wavfile.write(temp_filename, audio[0], audio[1])
|
41 |
|
42 |
|
43 |
+
result = Speaker_speech_analysis(temp_filename, text)
|
44 |
+
accuracy_score = result['pronunciation_accuracy']
|
45 |
+
fluency_score = result['fluency_score']
|
46 |
+
word_scores = result['word_scores']
|
47 |
+
|
48 |
+
html_content = create_html_from_scores(word_scores)
|
49 |
+
pronunciation_progress_bar = generate_progress_bar(accuracy_score, "Pronunciation Accuracy")
|
50 |
+
fluency_progress_bar = generate_progress_bar(fluency_score, "Fluency Score")
|
51 |
+
|
52 |
+
|
53 |
html_with_css = f"""
|
54 |
<style>
|
55 |
.legend {{
|
|
|
60 |
}}
|
61 |
|
62 |
.legend-dot {{
|
63 |
+
height: 15px;
|
64 |
+
width: 15px;
|
65 |
+
border-radius: 50%;
|
66 |
+
display: inline-block;
|
67 |
+
}}
|
68 |
+
|
69 |
+
.good {{ color: #28a745;
|
70 |
+
}}
|
71 |
+
.average {{ color: #ffc107;
|
72 |
+
}}
|
73 |
+
.bad {{ color: #dc3545;
|
74 |
}}
|
75 |
+
|
76 |
+
.text {{
|
77 |
+
font-size: 20px;
|
78 |
+
margin-bottom: 20px;
|
79 |
+
}}
|
80 |
+
|
81 |
+
.progress-container {{
|
82 |
+
width: 100%;
|
83 |
+
background-color: #ddd;
|
84 |
+
border-radius: 13px;
|
85 |
+
overflow: hidden;
|
86 |
+
}}
|
87 |
+
|
88 |
+
.progress-bar {{
|
89 |
+
height: 30px;
|
90 |
+
line-height: 30px;
|
91 |
+
text-align: center;
|
92 |
+
font-size: 16px;
|
93 |
+
border-radius: 15px;
|
94 |
+
transition: width 1s ease;
|
95 |
+
}}
|
96 |
+
|
97 |
+
.progress-label {{
|
98 |
+
font-weight: bold;
|
99 |
+
font-size: 22px;
|
100 |
+
margin-bottom: 20px;
|
101 |
+
margin-top: 5px;
|
102 |
+
text-align: center;
|
103 |
+
}}
|
104 |
+
|
105 |
+
.progress-score {{
|
106 |
+
display: inline-block;
|
107 |
+
color: black;
|
108 |
+
}}
|
109 |
+
|
110 |
+
.progress-max {{
|
111 |
+
text-align: right;
|
112 |
+
margin: 10px;
|
113 |
+
font-size: 16px;
|
114 |
+
}}
|
115 |
+
|
116 |
+
</style>
|
117 |
|
|
|
|
|
|
|
118 |
|
|
|
|
|
|
|
119 |
<div class="legend">
|
120 |
<span class="legend-dot" style="background-color: #28a745;"></span><span>Good</span>
|
121 |
<span class="legend-dot" style="background-color: #ffc107;"></span><span>Understandable</span>
|
|
|
125 |
<p class="text">
|
126 |
{html_content}
|
127 |
</p>
|
128 |
+
|
129 |
+
{pronunciation_progress_bar}
|
130 |
+
{fluency_progress_bar}
|
131 |
"""
|
132 |
return html_with_css
|
133 |
|
134 |
# Define the Gradio interface: a reference text plus a recording in,
# one HTML report (colour-coded words + progress bars) out.
# Fixes user-facing typos: "Recoreded" -> "Recorded", "recored" -> "record",
# "erors" -> "errors".
iface = gr.Interface(fn=analyze_audio,
                     inputs=[gr.Textbox(label='Training Text', placeholder='Write the text for pronunciation task', interactive=True, visible=True, show_copy_button=True,),
                             gr.Audio(label="Recorded Audio", sources=['microphone', 'upload'])
                             ],
                     outputs=[gr.HTML(label="Analysis of pronunciation"),
                              ],
                     # css=additional_css,
                     # title="Audio Analysis Tool",
                     description="Write any text and record an audio to predict pronunciation errors"
                     )
|
logic.py
CHANGED
@@ -2,6 +2,7 @@ from phonemizer.separator import Separator
|
|
2 |
from phonemizer import phonemize
|
3 |
# from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
4 |
from Levenshtein import distance as levenshtein_distance
|
|
|
5 |
|
6 |
import whisper
|
7 |
import torch
|
@@ -9,7 +10,7 @@ import torch
|
|
9 |
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
|
10 |
|
11 |
model = whisper.load_model("base.en", device=device)
|
12 |
-
separator = Separator(phone=None, word='
|
13 |
|
14 |
# EspeakWrapper.set_library(r"C:\Program Files\eSpeak NG\libespeak-ng.dll")
|
15 |
|
@@ -42,15 +43,20 @@ def rate_pronunciation(expected_phonemes, actual_phonemes):
|
|
42 |
results.append(1)
|
43 |
return results
|
44 |
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
49 |
text_phone = text2phoneme(text)
|
50 |
scores = rate_pronunciation(transcribtion, text_phone)
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
54 |
|
55 |
if __name__ == '__main__':
|
56 |
|
|
|
2 |
from phonemizer import phonemize
|
3 |
# from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
4 |
from Levenshtein import distance as levenshtein_distance
|
5 |
+
from scoring import calculate_fluency_and_pronunciation
|
6 |
|
7 |
import whisper
|
8 |
import torch
|
|
|
10 |
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
|
11 |
|
12 |
model = whisper.load_model("base.en", device=device)
|
13 |
+
separator = Separator(phone=None, word='',)
|
14 |
|
15 |
# EspeakWrapper.set_library(r"C:\Program Files\eSpeak NG\libespeak-ng.dll")
|
16 |
|
|
|
43 |
results.append(1)
|
44 |
return results
|
45 |
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
def Speaker_speech_analysis(audio_path, text):
    """Transcribe a recording and score it against the reference *text*.

    :param audio_path: path to the recorded audio file.
    :param text: the reference script the speaker was asked to read.
    :return: dict from calculate_fluency_and_pronunciation, extended with
             'word_scores' — a list of (word, score) pairs, one per word of
             the reference text.
    """
    # Fix: removed leftover debug print of the raw transcription.
    spoken_text = transcribe(audio_path)['text']
    spoken_phonemes = text2phoneme(spoken_text)
    reference_phonemes = text2phoneme(text)
    # NOTE(review): the transcription is passed as the first argument
    # ('expected_phonemes') and the reference text as the second — confirm
    # this order matches rate_pronunciation's intent.
    scores = rate_pronunciation(spoken_phonemes, reference_phonemes)
    results = calculate_fluency_and_pronunciation(
        audio_path, spoken_text, scores, len(text.split()))
    results['word_scores'] = list(zip(text.split(), scores))
    return results
|
60 |
|
61 |
if __name__ == '__main__':
|
62 |
|
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ gradio
|
|
6 |
scipy
|
7 |
numpy
|
8 |
resampy
|
9 |
-
Levenshtein
|
|
|
|
6 |
scipy
|
7 |
numpy
|
8 |
resampy
|
9 |
+
Levenshtein
|
10 |
+
librosa
|
scoring.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import librosa
|
3 |
+
|
4 |
+
def calculate_expected_value(scores):
    """Return the expected value of the outcomes in *scores*.

    Each distinct score is weighted by its relative frequency in the list,
    i.e. this is the probability-weighted average of the outcomes.

    :param scores: list of numeric outcomes (must be non-empty).
    :return: the expected value as a numpy scalar.
    """
    values, counts = np.unique(scores, return_counts=True)
    weights = counts / len(scores)
    return (values * weights).sum()
19 |
+
|
20 |
+
|
21 |
+
def calculate_fluency_score(audio_path, transcription, word_pronunciation_scores, base_script_len):
    """Heuristic fluency score scaled into the range [10, 90].

    Combines four weighted signals: speech rate vs an ideal 120-140 wpm band,
    speaking-to-silence ratio vs a 90% gold standard, the mean per-word
    pronunciation score, and the (inverse) variance of those scores.

    :param audio_path: path to the recording, loaded with librosa.
    :param transcription: text actually recognised from the audio.
    :param word_pronunciation_scores: per-word scores (expected range ~1-3).
    :param base_script_len: word count of the reference script.
    :return: 10 for near-empty/unintelligible attempts, otherwise 10 + 80 * combined score.
    """
    n_spoken = len(transcription.split())
    mean_pron = calculate_expected_value(word_pronunciation_scores)
    # Bail out with the floor score when too little of the script was spoken
    # or pronunciation is essentially unintelligible.
    if (n_spoken / base_script_len) < 0.15 or mean_pron < 1.3:
        return 10

    audio, sr = librosa.load(audio_path)
    voiced_intervals = librosa.effects.split(audio, top_db=22)
    voiced_seconds = sum(end - start for start, end in voiced_intervals) / sr
    total_seconds = len(audio) / sr

    # Utterances of four words or fewer are treated as having no voiced time.
    if n_spoken <= 4:
        voiced_seconds = 0

    rate_lo, rate_hi = 120 / 60, 140 / 60  # ideal band in words per second
    # Rate is damped by the fraction of the script actually attempted.
    rate = (n_spoken / (voiced_seconds + 1e-7)) * (n_spoken / base_script_len)
    speaking_ratio = voiced_seconds / total_seconds

    if rate_lo <= rate <= rate_hi:
        rate_score = 1
    elif rate < rate_lo:
        # Too slow: proportional to how close it is to the band, clamped to [0, 1].
        rate_score = max(0, min(rate / rate_lo, 1))
    else:
        # Too fast: mirrored penalty, clamped to [0, 1].
        rate_score = max(0, min(2 - (rate / rate_hi), 1))

    # 90% speaking time is taken as the gold standard for natural speech.
    ratio_score = min(speaking_ratio / 0.9, 1)

    # Map mean pronunciation from its ~[1, 3] range onto [0, 1].
    pron_score = (mean_pron - 1) / 2
    pron_variance = np.var(word_pronunciation_scores, ddof=1)

    combined = (0.20 * rate_score +
                0.20 * ratio_score +
                0.50 * pron_score +
                0.10 * (1 / (1 + pron_variance)))

    # Scale the combined [0, 1] score onto 10%..90%.
    return 10 + combined * 80
|
79 |
+
|
80 |
+
def calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len):
    """Map per-word pronunciation scores to a final accuracy in {10, 30, 50, 70, 90}.

    The average word score is discounted by fluency, bucketed onto a 1-5
    score-guide level, and each level is scaled onto 10%..90% in 20-point steps.
    Fix: removed leftover debug print() calls.

    :param word_pronunciation_scores: per-word scores (expected range ~1-3).
    :param fluency_score: fluency in [10, 100]-ish, used as a multiplier /100.
    :param base_script_len: word count of the reference script.
    :return: 10 if under a quarter of the script was scored, else 10..90.
    """
    # Too little of the script attempted to judge pronunciation at all.
    if len(word_pronunciation_scores) / base_script_len < 0.25:
        return 10

    avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)

    # Simplistic fluency discount; can be refined with more detailed analysis.
    fluency_adjustment = fluency_score / 100
    adjusted_score = avg_pronunciation_score * fluency_adjustment

    # Bucket onto the 1-5 score guide; thresholds are empirical.
    if adjusted_score >= 2.4:
        level = 5
    elif adjusted_score >= 1.7:
        level = 4
    elif adjusted_score >= 1.0:
        level = 3
    elif adjusted_score >= 0.5:
        level = 2
    else:
        level = 1

    # Scale levels 1..5 onto 10%..90%, 20 points per level.
    return 10 + (level - 1) * 20
|
110 |
+
|
111 |
+
def calculate_fluency_and_pronunciation(audio_path, transcription, word_pronunciation_scores, base_script_len):
    """Compute both headline metrics for one recording.

    Fluency is computed first because pronunciation accuracy is discounted by it.

    :return: dict with 'fluency_score' and 'pronunciation_accuracy'.
    """
    fluency = calculate_fluency_score(
        audio_path, transcription, word_pronunciation_scores, base_script_len)
    accuracy = calculate_pronunciation_accuracy(
        word_pronunciation_scores, fluency, base_script_len)
    return {'fluency_score': fluency, 'pronunciation_accuracy': accuracy}
|
118 |
+
|
119 |
+
|
120 |
+
if __name__ == '__main__':
|
121 |
+
pass
|