Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- app.py +63 -90
- logic.py +66 -0
- requirements.txt +9 -0
app.py
CHANGED
@@ -1,108 +1,81 @@
|
|
1 |
import gradio as gr
|
2 |
-
from
|
3 |
-
|
4 |
|
5 |
-
# Additional CSS for styling the confidence bars and the result layout
|
6 |
-
additional_css = """
|
7 |
-
/* CSS for the confidence bars */
|
8 |
-
.confidence-section {
|
9 |
-
display: flex;
|
10 |
-
align-items: center;
|
11 |
-
margin-top: 10px;
|
12 |
-
}
|
13 |
|
14 |
-
.confidence-label {
|
15 |
-
margin-right: 10px;
|
16 |
-
font-weight: bold;
|
17 |
-
}
|
18 |
-
.confidence-bar {
|
19 |
-
height: 20px;
|
20 |
-
width: 100%;
|
21 |
-
background-color: #eee;
|
22 |
-
border-radius: 10px;
|
23 |
-
margin: 10px 0;
|
24 |
-
}
|
25 |
|
26 |
-
|
27 |
-
|
28 |
-
border-radius: 10px;
|
29 |
-
background-color: #4caf50; /* Change color based on confidence level if desired */
|
30 |
-
text-align: center;
|
31 |
-
color: white;
|
32 |
-
line-height: 20px;
|
33 |
-
}
|
34 |
-
/* Additional CSS for styling the rest of your results */
|
35 |
-
"""
|
36 |
-
|
37 |
-
# Function to generate custom HTML for the confidence bar
|
38 |
-
def custom_confidence_bar(confidence):
|
39 |
-
color = "#4caf50" if confidence > 75 else "#FFC107" if confidence > 50 else "#F44336"
|
40 |
-
return f"""
|
41 |
-
<div class="confidence-section">
|
42 |
-
<span class="confidence-label">Model Confidence:</span>
|
43 |
-
<div class="confidence-bar">
|
44 |
-
<div class="confidence-fill" style="width: {confidence}%; background-color: {color};">
|
45 |
-
{confidence}%
|
46 |
-
</div>
|
47 |
-
</div>
|
48 |
-
</div>
|
49 |
-
"""
|
50 |
|
51 |
-
|
52 |
-
# Function to extract score level from message
|
53 |
-
def extract_score_level(message):
|
54 |
-
match = re.search(r'Score: (\d+)-(\d+)', message)
|
55 |
-
score_level = f"{match.group(1)} of 10" if match else "N/A"
|
56 |
-
return score_level
|
57 |
-
def message_markdown(label, message, task, score_level):
|
58 |
-
md = f'''# {label}
|
59 |
-
**Model Prediction:** {message}
|
60 |
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
68 |
|
69 |
-
# Unpack the results
|
70 |
-
accuracy_message, accuracy_confidence = accuracy
|
71 |
-
fluency_message, fluency_confidence = fluency
|
72 |
|
73 |
-
# Extract the score level from the message
|
74 |
-
accuracy_score = extract_score_level(accuracy_message)
|
75 |
-
fluency_score = extract_score_level(fluency_message)
|
76 |
|
77 |
-
|
78 |
-
|
79 |
-
|
|
|
80 |
|
81 |
-
|
82 |
-
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
-
|
|
|
|
|
86 |
|
87 |
-
|
|
|
88 |
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
# Define the Gradio interface
|
92 |
-
iface = gr.Interface(
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
title="Audio Analysis Tool",
|
103 |
-
description="Upload an audio file to analyze its accuracy and fluency."
|
104 |
-
)
|
105 |
|
106 |
# Run the Gradio app
|
107 |
if __name__ == "__main__":
|
108 |
-
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
from logic import compare_audio_with_text
|
3 |
+
from scipy.io import wavfile
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
+
def create_html_from_scores(word_scores):
    """Render (word, score) pairs as colour-coded HTML spans.

    Score meaning: 1 -> red (bad), 2 -> orange (understandable),
    any other value (expected: 3) -> green (good).

    Args:
        word_scores: iterable of (word, score) tuples, e.g. the return
            value of compare_audio_with_text.

    Returns:
        One HTML string; every word is wrapped in a coloured <span>
        followed by a single space (including the last one).
    """
    # Known non-green scores; anything else falls through to green,
    # matching the original if/elif/else chain.
    colors = {1: 'red', 2: 'orange'}
    # Collect pieces and join once instead of quadratic string +=.
    parts = [
        f'<span style="color: {colors.get(score, "green")};">{word}</span> '
        for word, score in word_scores
    ]
    return ''.join(parts)
|
20 |
+
|
21 |
|
|
|
|
|
|
|
22 |
|
|
|
|
|
|
|
23 |
|
24 |
+
def analyze_audio(text, audio):
    """Score the pronunciation of a recording against a reference text.

    Args:
        text: the sentence the user was asked to read aloud.
        audio: Gradio Audio value — a (sample_rate, data) tuple.

    Returns:
        An HTML string containing a colour legend plus the reference
        text with one colour-coded <span> per word.
    """
    import os
    import tempfile

    # The scoring backend expects a file path, so persist the recording.
    # Use a unique temp file (instead of the previous fixed
    # 'temp_audio.wav') so concurrent requests cannot clobber each
    # other's audio, and remove it afterwards to avoid litter.
    sample_rate, data = audio
    fd, temp_filename = tempfile.mkstemp(suffix='.wav')
    os.close(fd)
    try:
        wavfile.write(temp_filename, sample_rate, data)
        result = compare_audio_with_text(temp_filename, text)
    finally:
        os.remove(temp_filename)

    html_content = create_html_from_scores(result)
    html_with_css = f"""
    <style>
    .legend {{
        font-size: 22px;
        display: flex;
        align-items: center;
        gap: 12px;
    }}

    .legend-dot {{
        height: 15px;
        width: 15px;
        border-radius: 50%;
        display: inline-block;
    }}

    .good {{ color: #28a745; }}
    .average {{ color: #ffc107; }}
    .bad {{ color: #dc3545; }}

    .text {{ font-size: 20px; }}
    </style>

    <div class="legend">
        <span class="legend-dot" style="background-color: #28a745;"></span><span>Good</span>
        <span class="legend-dot" style="background-color: #ffc107;"></span><span>Understandable</span>
        <span class="legend-dot" style="background-color: #dc3545;"></span><span>Bad</span>
    </div>

    <p class="text">
        {html_content}
    </p>
    """
    return html_with_css
|
66 |
|
67 |
# Define the Gradio interface.
# Wires analyze_audio to two inputs — a textbox with the reference
# sentence and an audio upload — and renders the result as raw HTML
# (colour-coded per-word scores from create_html_from_scores).
iface = gr.Interface(fn=analyze_audio,
                     inputs=[gr.Textbox(label='Training Text', placeholder='Write the text for pronunciation task', interactive=True, visible=True, show_copy_button=True,),
                             gr.Audio(label="Upload Audio")
                             ],
                     outputs=[gr.HTML(label="Analysis of pronunciation"),
                              ],
                     # css=additional_css,
                     # title="Audio Analysis Tool",
                     description="Upload an audio file to analyze pronunciation accuracy and speech fluency."
                     )
|
|
|
|
|
|
|
78 |
|
79 |
# Run the Gradio app.
if __name__ == "__main__":
    # Launch the local web server (blocking call).
    iface.launch()
|
logic.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from phonemizer.separator import Separator
from phonemizer import phonemize
# from phonemizer.backend.espeak.wrapper import EspeakWrapper
from Levenshtein import distance as levenshtein_distance

import whisper
import torch

# Use the first CUDA device when available, otherwise fall back to CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Whisper English base model, loaded once at import time.
# NOTE(review): module-level side effect — the model file is downloaded
# on first use and stays resident for the life of the process.
model = whisper.load_model("base.en", device=device)
# Phonemizer output format: phonemes inside a word run together
# (phone=None); words are separated by ' | ' — rate_pronunciation
# splits on exactly this token.
separator = Separator(phone=None, word=' | ',)

# Windows-only workaround for locating the eSpeak NG DLL; kept for reference.
# EspeakWrapper.set_library(r"C:\Program Files\eSpeak NG\libespeak-ng.dll")
|
15 |
+
|
16 |
+
def transcribe(audio):
    """Run Whisper on an audio file and return its language and text.

    Args:
        audio: path to an audio file Whisper can read.

    Returns:
        dict with keys 'language' and 'text' taken from the Whisper result.
    """
    # temperature=0 makes decoding deterministic; the two thresholds
    # filter out non-speech and degenerate (highly repetitive) output.
    output = model.transcribe(
        audio,
        word_timestamps=False,
        no_speech_threshold=0.4,
        compression_ratio_threshold=2,
        temperature=0,
    )
    return {key: output[key] for key in ('language', 'text')}
|
19 |
+
|
20 |
+
def text2phoneme(text):
    """Convert text to an eSpeak phoneme string, words joined by ' | '."""
    lowered = text.lower()
    return phonemize(
        lowered,
        backend='espeak',
        separator=separator,
        strip=True,
        with_stress=False,
        tie=False,
        language='en-us',
    )
|
22 |
+
|
23 |
+
def rate_pronunciation(expected_phonemes, actual_phonemes):
    """Score each word of actual_phonemes against expected_phonemes.

    Both arguments are ' | '-separated phoneme strings (text2phoneme
    output). For every word in actual_phonemes the closest match is
    searched in a small window of expected_phonemes around the same
    position (2 words back, up to 6 ahead) to absorb whole-word
    insertions/deletions between the two sequences.

    Returns:
        A list with one score per actual word:
            3 - exact phoneme match found,
            2 - best Levenshtein distance within 45% of the word length,
            1 - no acceptable match in the window.
    """
    expected_words = expected_phonemes.split(" | ")
    spoken_words = actual_phonemes.split(" | ")
    scores = []
    for position, spoken in enumerate(spoken_words):
        tolerance = len(spoken) * 0.45
        window_start = max(0, position - 2)
        window_stop = position + min(6, len(expected_words) - position)
        # An empty window leaves best at infinity, which scores 1 below.
        best = float('inf')
        for candidate in range(window_start, window_stop):
            best = min(best, levenshtein_distance(expected_words[candidate], spoken))
            if not best:
                break  # perfect match — no better distance possible
        if best == 0:
            scores.append(3)
        elif best <= tolerance:
            scores.append(2)
        else:
            scores.append(1)
    return scores
|
44 |
+
|
45 |
+
def compare_audio_with_text(audio, text):
    """Transcribe audio and score each word of the reference text.

    Args:
        audio: path to the user's recording.
        text: the reference sentence the user was supposed to say.

    Returns:
        A list of (word, score) pairs — one per whitespace-separated
        word of text — where score is 1 (bad), 2 (understandable)
        or 3 (good).
    """
    spoken_text = transcribe(audio)['text']
    print(spoken_text)
    spoken_phonemes = text2phoneme(spoken_text)
    reference_phonemes = text2phoneme(text)
    # Argument order is (expected=transcription, actual=reference) so
    # the scores line up with the reference text's words.
    scores = rate_pronunciation(spoken_phonemes, reference_phonemes)

    return list(zip(text.split(), scores))
|
54 |
+
|
55 |
+
if __name__ == '__main__':
    # Smoke test: phonemize a reference sentence, transcribe a local
    # recording, and print the per-word pronunciation scores.
    text = 'i have ADHD '
    text = text2phoneme(text)
    file_path = r'user_recording.wav'
    trans = transcribe(file_path)['text']
    print(trans)
    trans = text2phoneme(trans)
    print('base:', text)
    print('predicted:', trans)
    # Argument order (transcription, reference) matches
    # compare_audio_with_text, so scores align with the reference words.
    result = rate_pronunciation(trans, text)
    print(result)
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
espeak-ng
|
2 |
+
phonemizer
|
3 |
+
wave
|
4 |
+
torch
|
5 |
+
openai-whisper
|
6 |
+
gradio
|
7 |
+
numpy
|
8 |
+
resampy
|
9 |
+
Levenshtein
|