Spaces:
Running
Running
Upload 5 files
Browse files- app.py +88 -21
- logic.py +14 -8
- requirements.txt +2 -1
- scoring.py +121 -0
app.py
CHANGED
@@ -1,22 +1,35 @@
|
|
1 |
import gradio as gr
|
2 |
-
from logic import
|
3 |
from scipy.io import wavfile
|
4 |
|
5 |
|
6 |
|
7 |
def create_html_from_scores(word_scores):
|
8 |
html_output = ''
|
9 |
-
|
10 |
-
# Ensure the number of words and scores match
|
11 |
-
|
12 |
for word, score in word_scores:
|
13 |
if score == 1:
|
14 |
-
html_output += f'<span style="color:
|
15 |
elif score == 2:
|
16 |
-
html_output += f'<span style="color:
|
17 |
else:
|
18 |
-
html_output += f'<span style="color:
|
19 |
return html_output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
|
22 |
|
@@ -27,8 +40,16 @@ def analyze_audio(text, audio):
|
|
27 |
wavfile.write(temp_filename, audio[0], audio[1])
|
28 |
|
29 |
|
30 |
-
result =
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
html_with_css = f"""
|
33 |
<style>
|
34 |
.legend {{
|
@@ -39,19 +60,62 @@ def analyze_audio(text, audio):
|
|
39 |
}}
|
40 |
|
41 |
.legend-dot {{
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
-
.good {{ color: #28a745; }}
|
49 |
-
.average {{ color: #ffc107; }}
|
50 |
-
.bad {{ color: #dc3545; }}
|
51 |
|
52 |
-
.text {{ font-size: 20px; }}
|
53 |
-
</style>
|
54 |
-
<h1> Word Pronunciation scores </h1>
|
55 |
<div class="legend">
|
56 |
<span class="legend-dot" style="background-color: #28a745;"></span><span>Good</span>
|
57 |
<span class="legend-dot" style="background-color: #ffc107;"></span><span>Understandable</span>
|
@@ -61,17 +125,20 @@ def analyze_audio(text, audio):
|
|
61 |
<p class="text">
|
62 |
{html_content}
|
63 |
</p>
|
|
|
|
|
|
|
64 |
"""
|
65 |
return html_with_css
|
66 |
|
67 |
# Define the Gradio interface
|
68 |
iface = gr.Interface(fn=analyze_audio,
|
69 |
inputs=[gr.Textbox(label='Training Text', placeholder='Write the text for pronunciation task', interactive=True, visible=True, show_copy_button=True,),
|
70 |
-
gr.Audio(label="Recoreded Audio")
|
71 |
],
|
72 |
outputs=[gr.HTML(label="Analysis of pronunciation"),
|
73 |
],
|
74 |
-
|
75 |
# title="Audio Analysis Tool",
|
76 |
description="Write any text and recored an audio to predict pronunciation erors"
|
77 |
)
|
|
|
1 |
import gradio as gr
|
2 |
+
from logic import Speaker_speech_analysis
|
3 |
from scipy.io import wavfile
|
4 |
|
5 |
|
6 |
|
7 |
def create_html_from_scores(word_scores):
    """Render (word, score) pairs as colour-coded HTML spans.

    Score 1 -> red (bad), 2 -> amber (understandable), anything else -> green (good).
    Each span is followed by a single space so the words read as a sentence.
    """
    # Colour lookup; any score other than 1 or 2 falls through to green.
    colour_for = {1: '#dc3545', 2: '#ffc107'}
    spans = []
    for word, score in word_scores:
        colour = colour_for.get(score, '#28a745')
        spans.append(f'<span style="color: {colour};">{word}</span> ')
    return ''.join(spans)
|
17 |
+
|
18 |
+
def generate_progress_bar(score, label):
    """Build the HTML for one labelled progress bar on a 0-90 scale.

    The displayed number is capped at "90"; the bar colour is red below 30,
    amber below 60, and green from 60 up. Width is the score as a fraction
    of the 90-point maximum.
    """
    score = round(score, 2)
    # Cap the printed value at the scale maximum.
    shown = "90" if score >= 90 else f"{score:.2f}"
    if score < 30:
        colour = "#dc3545"
    elif score < 60:
        colour = "#ffc107"
    else:
        colour = "#28a745"
    width = f"{(score / 90) * 100}%"
    return f"""
    <div class="progress-label">{label}:</div>
    <div class="progress-container">
        <div class="progress-bar" style="width: {width}; background-color: {colour};">
            <div class="progress-score">{shown}</div>
        </div>
    </div>
    <div class="progress-max">Max: 90</div>
    """
|
32 |
+
# CSS to be used in the Gradio Interface
|
33 |
|
34 |
|
35 |
|
|
|
40 |
wavfile.write(temp_filename, audio[0], audio[1])
|
41 |
|
42 |
|
43 |
+
result = Speaker_speech_analysis(temp_filename, text)
|
44 |
+
accuracy_score = result['pronunciation_accuracy']
|
45 |
+
fluency_score = result['fluency_score']
|
46 |
+
word_scores = result['word_scores']
|
47 |
+
|
48 |
+
html_content = create_html_from_scores(word_scores)
|
49 |
+
pronunciation_progress_bar = generate_progress_bar(accuracy_score, "Pronunciation Accuracy")
|
50 |
+
fluency_progress_bar = generate_progress_bar(fluency_score, "Fluency Score")
|
51 |
+
|
52 |
+
|
53 |
html_with_css = f"""
|
54 |
<style>
|
55 |
.legend {{
|
|
|
60 |
}}
|
61 |
|
62 |
.legend-dot {{
|
63 |
+
height: 15px;
|
64 |
+
width: 15px;
|
65 |
+
border-radius: 50%;
|
66 |
+
display: inline-block;
|
67 |
+
}}
|
68 |
+
|
69 |
+
.good {{ color: #28a745;
|
70 |
+
}}
|
71 |
+
.average {{ color: #ffc107;
|
72 |
+
}}
|
73 |
+
.bad {{ color: #dc3545;
|
74 |
}}
|
75 |
+
|
76 |
+
.text {{
|
77 |
+
font-size: 20px;
|
78 |
+
margin-bottom: 20px;
|
79 |
+
}}
|
80 |
+
|
81 |
+
.progress-container {{
|
82 |
+
width: 100%;
|
83 |
+
background-color: #ddd;
|
84 |
+
border-radius: 13px;
|
85 |
+
overflow: hidden;
|
86 |
+
}}
|
87 |
+
|
88 |
+
.progress-bar {{
|
89 |
+
height: 30px;
|
90 |
+
line-height: 30px;
|
91 |
+
text-align: center;
|
92 |
+
font-size: 16px;
|
93 |
+
border-radius: 15px;
|
94 |
+
transition: width 1s ease;
|
95 |
+
}}
|
96 |
+
|
97 |
+
.progress-label {{
|
98 |
+
font-weight: bold;
|
99 |
+
font-size: 22px;
|
100 |
+
margin-bottom: 20px;
|
101 |
+
margin-top: 5px;
|
102 |
+
text-align: center;
|
103 |
+
}}
|
104 |
+
|
105 |
+
.progress-score {{
|
106 |
+
display: inline-block;
|
107 |
+
color: black;
|
108 |
+
}}
|
109 |
+
|
110 |
+
.progress-max {{
|
111 |
+
text-align: right;
|
112 |
+
margin: 10px;
|
113 |
+
font-size: 16px;
|
114 |
+
}}
|
115 |
+
|
116 |
+
</style>
|
117 |
|
|
|
|
|
|
|
118 |
|
|
|
|
|
|
|
119 |
<div class="legend">
|
120 |
<span class="legend-dot" style="background-color: #28a745;"></span><span>Good</span>
|
121 |
<span class="legend-dot" style="background-color: #ffc107;"></span><span>Understandable</span>
|
|
|
125 |
<p class="text">
|
126 |
{html_content}
|
127 |
</p>
|
128 |
+
|
129 |
+
{pronunciation_progress_bar}
|
130 |
+
{fluency_progress_bar}
|
131 |
"""
|
132 |
return html_with_css
|
133 |
|
134 |
# Define the Gradio interface: a reference text plus a recording in,
# one HTML report (colour-coded words + progress bars) out.
# Fixes user-facing typos: "Recoreded" -> "Recorded", "recored" -> "record",
# "erors" -> "errors".
iface = gr.Interface(fn=analyze_audio,
                     inputs=[gr.Textbox(label='Training Text', placeholder='Write the text for pronunciation task', interactive=True, visible=True, show_copy_button=True,),
                             gr.Audio(label="Recorded Audio", sources=['microphone', 'upload'])
                             ],
                     outputs=[gr.HTML(label="Analysis of pronunciation"),
                              ],
                     # css=additional_css,
                     # title="Audio Analysis Tool",
                     description="Write any text and record an audio to predict pronunciation errors"
                     )
|
logic.py
CHANGED
@@ -2,6 +2,7 @@ from phonemizer.separator import Separator
|
|
2 |
from phonemizer import phonemize
|
3 |
# from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
4 |
from Levenshtein import distance as levenshtein_distance
|
|
|
5 |
|
6 |
import whisper
|
7 |
import torch
|
@@ -9,7 +10,7 @@ import torch
|
|
9 |
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
|
10 |
|
11 |
model = whisper.load_model("base.en", device=device)
|
12 |
-
separator = Separator(phone=None, word='
|
13 |
|
14 |
# EspeakWrapper.set_library(r"C:\Program Files\eSpeak NG\libespeak-ng.dll")
|
15 |
|
@@ -42,15 +43,20 @@ def rate_pronunciation(expected_phonemes, actual_phonemes):
|
|
42 |
results.append(1)
|
43 |
return results
|
44 |
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
49 |
text_phone = text2phoneme(text)
|
50 |
scores = rate_pronunciation(transcribtion, text_phone)
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
54 |
|
55 |
if __name__ == '__main__':
|
56 |
|
|
|
2 |
from phonemizer import phonemize
|
3 |
# from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
4 |
from Levenshtein import distance as levenshtein_distance
|
5 |
+
from scoring import calculate_fluency_and_pronunciation
|
6 |
|
7 |
import whisper
|
8 |
import torch
|
|
|
10 |
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
|
11 |
|
12 |
model = whisper.load_model("base.en", device=device)
|
13 |
+
separator = Separator(phone=None, word='',)
|
14 |
|
15 |
# EspeakWrapper.set_library(r"C:\Program Files\eSpeak NG\libespeak-ng.dll")
|
16 |
|
|
|
43 |
results.append(1)
|
44 |
return results
|
45 |
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
def Speaker_speech_analysis(audio_path, text):
    """Transcribe a recording and score it against the reference *text*.

    :param audio_path: path to the recorded audio file.
    :param text: the reference script the speaker was asked to read.
    :return: dict from calculate_fluency_and_pronunciation, extended with
             'word_scores' — a list of (word, score) pairs, one per word of
             the reference text.
    """
    # Fix: removed leftover debug print of the raw transcription.
    spoken_text = transcribe(audio_path)['text']
    spoken_phonemes = text2phoneme(spoken_text)
    reference_phonemes = text2phoneme(text)
    # NOTE(review): the transcription is passed as the first argument
    # ('expected_phonemes') and the reference text as the second — confirm
    # this order matches rate_pronunciation's intent.
    scores = rate_pronunciation(spoken_phonemes, reference_phonemes)
    results = calculate_fluency_and_pronunciation(
        audio_path, spoken_text, scores, len(text.split()))
    results['word_scores'] = list(zip(text.split(), scores))
    return results
|
60 |
|
61 |
if __name__ == '__main__':
|
62 |
|
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ gradio
|
|
6 |
scipy
|
7 |
numpy
|
8 |
resampy
|
9 |
-
Levenshtein
|
|
|
|
6 |
scipy
|
7 |
numpy
|
8 |
resampy
|
9 |
+
Levenshtein
|
10 |
+
librosa
|
scoring.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import librosa
|
3 |
+
|
4 |
+
def calculate_expected_value(scores):
    """Return the expected value of the outcomes in *scores*.

    Each distinct score is weighted by its relative frequency in the list,
    i.e. this is the probability-weighted average of the outcomes.

    :param scores: list of numeric outcomes (must be non-empty).
    :return: the expected value as a numpy scalar.
    """
    values, counts = np.unique(scores, return_counts=True)
    weights = counts / len(scores)
    return (values * weights).sum()
19 |
+
|
20 |
+
|
21 |
+
def calculate_fluency_score(audio_path, transcription, word_pronunciation_scores, base_script_len):
    """Heuristic fluency score scaled into the range [10, 90].

    Combines four weighted signals: speech rate vs an ideal 120-140 wpm band,
    speaking-to-silence ratio vs a 90% gold standard, the mean per-word
    pronunciation score, and the (inverse) variance of those scores.

    :param audio_path: path to the recording, loaded with librosa.
    :param transcription: text actually recognised from the audio.
    :param word_pronunciation_scores: per-word scores (expected range ~1-3).
    :param base_script_len: word count of the reference script.
    :return: 10 for near-empty/unintelligible attempts, otherwise 10 + 80 * combined score.
    """
    n_spoken = len(transcription.split())
    mean_pron = calculate_expected_value(word_pronunciation_scores)
    # Bail out with the floor score when too little of the script was spoken
    # or pronunciation is essentially unintelligible.
    if (n_spoken / base_script_len) < 0.15 or mean_pron < 1.3:
        return 10

    audio, sr = librosa.load(audio_path)
    voiced_intervals = librosa.effects.split(audio, top_db=22)
    voiced_seconds = sum(end - start for start, end in voiced_intervals) / sr
    total_seconds = len(audio) / sr

    # Utterances of four words or fewer are treated as having no voiced time.
    if n_spoken <= 4:
        voiced_seconds = 0

    rate_lo, rate_hi = 120 / 60, 140 / 60  # ideal band in words per second
    # Rate is damped by the fraction of the script actually attempted.
    rate = (n_spoken / (voiced_seconds + 1e-7)) * (n_spoken / base_script_len)
    speaking_ratio = voiced_seconds / total_seconds

    if rate_lo <= rate <= rate_hi:
        rate_score = 1
    elif rate < rate_lo:
        # Too slow: proportional to how close it is to the band, clamped to [0, 1].
        rate_score = max(0, min(rate / rate_lo, 1))
    else:
        # Too fast: mirrored penalty, clamped to [0, 1].
        rate_score = max(0, min(2 - (rate / rate_hi), 1))

    # 90% speaking time is taken as the gold standard for natural speech.
    ratio_score = min(speaking_ratio / 0.9, 1)

    # Map mean pronunciation from its ~[1, 3] range onto [0, 1].
    pron_score = (mean_pron - 1) / 2
    pron_variance = np.var(word_pronunciation_scores, ddof=1)

    combined = (0.20 * rate_score +
                0.20 * ratio_score +
                0.50 * pron_score +
                0.10 * (1 / (1 + pron_variance)))

    # Scale the combined [0, 1] score onto 10%..90%.
    return 10 + combined * 80
|
79 |
+
|
80 |
+
def calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len):
    """Map per-word pronunciation scores to a final accuracy in {10, 30, 50, 70, 90}.

    The average word score is discounted by fluency, bucketed onto a 1-5
    score-guide level, and each level is scaled onto 10%..90% in 20-point steps.
    Fix: removed leftover debug print() calls.

    :param word_pronunciation_scores: per-word scores (expected range ~1-3).
    :param fluency_score: fluency in [10, 100]-ish, used as a multiplier /100.
    :param base_script_len: word count of the reference script.
    :return: 10 if under a quarter of the script was scored, else 10..90.
    """
    # Too little of the script attempted to judge pronunciation at all.
    if len(word_pronunciation_scores) / base_script_len < 0.25:
        return 10

    avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)

    # Simplistic fluency discount; can be refined with more detailed analysis.
    fluency_adjustment = fluency_score / 100
    adjusted_score = avg_pronunciation_score * fluency_adjustment

    # Bucket onto the 1-5 score guide; thresholds are empirical.
    if adjusted_score >= 2.4:
        level = 5
    elif adjusted_score >= 1.7:
        level = 4
    elif adjusted_score >= 1.0:
        level = 3
    elif adjusted_score >= 0.5:
        level = 2
    else:
        level = 1

    # Scale levels 1..5 onto 10%..90%, 20 points per level.
    return 10 + (level - 1) * 20
|
110 |
+
|
111 |
+
def calculate_fluency_and_pronunciation(audio_path, transcription, word_pronunciation_scores, base_script_len):
    """Compute both headline metrics for one recording.

    Fluency is computed first because pronunciation accuracy is discounted by it.

    :return: dict with 'fluency_score' and 'pronunciation_accuracy'.
    """
    fluency = calculate_fluency_score(
        audio_path, transcription, word_pronunciation_scores, base_script_len)
    accuracy = calculate_pronunciation_accuracy(
        word_pronunciation_scores, fluency, base_script_len)
    return {'fluency_score': fluency, 'pronunciation_accuracy': accuracy}
|
118 |
+
|
119 |
+
|
120 |
+
if __name__ == '__main__':
|
121 |
+
pass
|