Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- app.py +63 -90
- logic.py +66 -0
- requirements.txt +9 -0
app.py
CHANGED
@@ -1,108 +1,81 @@
|
|
1 |
import gradio as gr
|
2 |
-
from
|
3 |
-
|
4 |
|
5 |
-
# Additional CSS for styling the confidence bars and the result layout
|
6 |
-
additional_css = """
|
7 |
-
/* CSS for the confidence bars */
|
8 |
-
.confidence-section {
|
9 |
-
display: flex;
|
10 |
-
align-items: center;
|
11 |
-
margin-top: 10px;
|
12 |
-
}
|
13 |
|
14 |
-
.confidence-label {
|
15 |
-
margin-right: 10px;
|
16 |
-
font-weight: bold;
|
17 |
-
}
|
18 |
-
.confidence-bar {
|
19 |
-
height: 20px;
|
20 |
-
width: 100%;
|
21 |
-
background-color: #eee;
|
22 |
-
border-radius: 10px;
|
23 |
-
margin: 10px 0;
|
24 |
-
}
|
25 |
|
26 |
-
|
27 |
-
|
28 |
-
border-radius: 10px;
|
29 |
-
background-color: #4caf50; /* Change color based on confidence level if desired */
|
30 |
-
text-align: center;
|
31 |
-
color: white;
|
32 |
-
line-height: 20px;
|
33 |
-
}
|
34 |
-
/* Additional CSS for styling the rest of your results */
|
35 |
-
"""
|
36 |
-
|
37 |
-
# Function to generate custom HTML for the confidence bar
|
38 |
-
def custom_confidence_bar(confidence):
|
39 |
-
color = "#4caf50" if confidence > 75 else "#FFC107" if confidence > 50 else "#F44336"
|
40 |
-
return f"""
|
41 |
-
<div class="confidence-section">
|
42 |
-
<span class="confidence-label">Model Confidence:</span>
|
43 |
-
<div class="confidence-bar">
|
44 |
-
<div class="confidence-fill" style="width: {confidence}%; background-color: {color};">
|
45 |
-
{confidence}%
|
46 |
-
</div>
|
47 |
-
</div>
|
48 |
-
</div>
|
49 |
-
"""
|
50 |
|
51 |
-
|
52 |
-
# Function to extract score level from message
|
53 |
-
def extract_score_level(message):
|
54 |
-
match = re.search(r'Score: (\d+)-(\d+)', message)
|
55 |
-
score_level = f"{match.group(1)} of 10" if match else "N/A"
|
56 |
-
return score_level
|
57 |
-
def message_markdown(label, message, task, score_level):
|
58 |
-
md = f'''# {label}
|
59 |
-
**Model Prediction:** {message}
|
60 |
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
68 |
|
69 |
-
# Unpack the results
|
70 |
-
accuracy_message, accuracy_confidence = accuracy
|
71 |
-
fluency_message, fluency_confidence = fluency
|
72 |
|
73 |
-
# Extract the score level from the message
|
74 |
-
accuracy_score = extract_score_level(accuracy_message)
|
75 |
-
fluency_score = extract_score_level(fluency_message)
|
76 |
|
77 |
-
|
78 |
-
|
79 |
-
|
|
|
80 |
|
81 |
-
|
82 |
-
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
-
|
|
|
|
|
86 |
|
87 |
-
|
|
|
88 |
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
# Define the Gradio interface
|
92 |
-
iface = gr.Interface(
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
title="Audio Analysis Tool",
|
103 |
-
description="Upload an audio file to analyze its accuracy and fluency."
|
104 |
-
)
|
105 |
|
106 |
# Run the Gradio app
|
107 |
if __name__ == "__main__":
|
108 |
-
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
from logic import compare_audio_with_text
|
3 |
+
from scipy.io import wavfile
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
+
def create_html_from_scores(word_scores):
    """Render (word, score) pairs as colour-coded HTML spans.

    Score meaning: 1 -> red (bad), 2 -> orange (understandable),
    any other value (expected: 3) -> green (good).

    Args:
        word_scores: iterable of (word, score) tuples, e.g. the return
            value of compare_audio_with_text.

    Returns:
        One HTML string; every word is wrapped in a coloured <span>
        followed by a single space (including the last one).
    """
    # Known non-green scores; anything else falls through to green,
    # matching the original if/elif/else chain.
    colors = {1: 'red', 2: 'orange'}
    # Collect pieces and join once instead of quadratic string +=.
    parts = [
        f'<span style="color: {colors.get(score, "green")};">{word}</span> '
        for word, score in word_scores
    ]
    return ''.join(parts)
|
20 |
+
|
21 |
|
|
|
|
|
|
|
22 |
|
|
|
|
|
|
|
23 |
|
24 |
+
def analyze_audio(text, audio):
    """Score the pronunciation of a recording against a reference text.

    Args:
        text: the sentence the user was asked to read aloud.
        audio: Gradio Audio value — a (sample_rate, data) tuple.

    Returns:
        An HTML string containing a colour legend plus the reference
        text with one colour-coded <span> per word.
    """
    import os
    import tempfile

    # The scoring backend expects a file path, so persist the recording.
    # Use a unique temp file (instead of the previous fixed
    # 'temp_audio.wav') so concurrent requests cannot clobber each
    # other's audio, and remove it afterwards to avoid litter.
    sample_rate, data = audio
    fd, temp_filename = tempfile.mkstemp(suffix='.wav')
    os.close(fd)
    try:
        wavfile.write(temp_filename, sample_rate, data)
        result = compare_audio_with_text(temp_filename, text)
    finally:
        os.remove(temp_filename)

    html_content = create_html_from_scores(result)
    html_with_css = f"""
    <style>
    .legend {{
        font-size: 22px;
        display: flex;
        align-items: center;
        gap: 12px;
    }}

    .legend-dot {{
        height: 15px;
        width: 15px;
        border-radius: 50%;
        display: inline-block;
    }}

    .good {{ color: #28a745; }}
    .average {{ color: #ffc107; }}
    .bad {{ color: #dc3545; }}

    .text {{ font-size: 20px; }}
    </style>

    <div class="legend">
        <span class="legend-dot" style="background-color: #28a745;"></span><span>Good</span>
        <span class="legend-dot" style="background-color: #ffc107;"></span><span>Understandable</span>
        <span class="legend-dot" style="background-color: #dc3545;"></span><span>Bad</span>
    </div>

    <p class="text">
        {html_content}
    </p>
    """
    return html_with_css
|
66 |
|
67 |
# Define the Gradio interface.
# Wires analyze_audio to two inputs — a textbox with the reference
# sentence and an audio upload — and renders the result as raw HTML
# (colour-coded per-word scores from create_html_from_scores).
iface = gr.Interface(fn=analyze_audio,
                     inputs=[gr.Textbox(label='Training Text', placeholder='Write the text for pronunciation task', interactive=True, visible=True, show_copy_button=True,),
                             gr.Audio(label="Upload Audio")
                             ],
                     outputs=[gr.HTML(label="Analysis of pronunciation"),
                              ],
                     # css=additional_css,
                     # title="Audio Analysis Tool",
                     description="Upload an audio file to analyze pronunciation accuracy and speech fluency."
                     )
|
|
|
|
|
|
|
78 |
|
79 |
# Run the Gradio app.
if __name__ == "__main__":
    # Launch the local web server (blocking call).
    iface.launch()
|
logic.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from phonemizer.separator import Separator
from phonemizer import phonemize
# from phonemizer.backend.espeak.wrapper import EspeakWrapper
from Levenshtein import distance as levenshtein_distance

import whisper
import torch

# Use the first CUDA device when available, otherwise fall back to CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Whisper English base model, loaded once at import time.
# NOTE(review): module-level side effect — the model file is downloaded
# on first use and stays resident for the life of the process.
model = whisper.load_model("base.en", device=device)
# Phonemizer output format: phonemes inside a word run together
# (phone=None); words are separated by ' | ' — rate_pronunciation
# splits on exactly this token.
separator = Separator(phone=None, word=' | ',)

# Windows-only workaround for locating the eSpeak NG DLL; kept for reference.
# EspeakWrapper.set_library(r"C:\Program Files\eSpeak NG\libespeak-ng.dll")
|
15 |
+
|
16 |
+
def transcribe(audio):
    """Run Whisper on an audio file and return its language and text.

    Args:
        audio: path to an audio file Whisper can read.

    Returns:
        dict with keys 'language' and 'text' taken from the Whisper result.
    """
    # temperature=0 makes decoding deterministic; the two thresholds
    # filter out non-speech and degenerate (highly repetitive) output.
    output = model.transcribe(
        audio,
        word_timestamps=False,
        no_speech_threshold=0.4,
        compression_ratio_threshold=2,
        temperature=0,
    )
    return {key: output[key] for key in ('language', 'text')}
|
19 |
+
|
20 |
+
def text2phoneme(text):
    """Convert text to an eSpeak phoneme string, words joined by ' | '."""
    lowered = text.lower()
    return phonemize(
        lowered,
        backend='espeak',
        separator=separator,
        strip=True,
        with_stress=False,
        tie=False,
        language='en-us',
    )
|
22 |
+
|
23 |
+
def rate_pronunciation(expected_phonemes, actual_phonemes):
    """Score each word of actual_phonemes against expected_phonemes.

    Both arguments are ' | '-separated phoneme strings (text2phoneme
    output). For every word in actual_phonemes the closest match is
    searched in a small window of expected_phonemes around the same
    position (2 words back, up to 6 ahead) to absorb whole-word
    insertions/deletions between the two sequences.

    Returns:
        A list with one score per actual word:
            3 - exact phoneme match found,
            2 - best Levenshtein distance within 45% of the word length,
            1 - no acceptable match in the window.
    """
    expected_words = expected_phonemes.split(" | ")
    spoken_words = actual_phonemes.split(" | ")
    scores = []
    for position, spoken in enumerate(spoken_words):
        tolerance = len(spoken) * 0.45
        window_start = max(0, position - 2)
        window_stop = position + min(6, len(expected_words) - position)
        # An empty window leaves best at infinity, which scores 1 below.
        best = float('inf')
        for candidate in range(window_start, window_stop):
            best = min(best, levenshtein_distance(expected_words[candidate], spoken))
            if not best:
                break  # perfect match — no better distance possible
        if best == 0:
            scores.append(3)
        elif best <= tolerance:
            scores.append(2)
        else:
            scores.append(1)
    return scores
|
44 |
+
|
45 |
+
def compare_audio_with_text(audio, text):
    """Transcribe audio and score each word of the reference text.

    Args:
        audio: path to the user's recording.
        text: the reference sentence the user was supposed to say.

    Returns:
        A list of (word, score) pairs — one per whitespace-separated
        word of text — where score is 1 (bad), 2 (understandable)
        or 3 (good).
    """
    spoken_text = transcribe(audio)['text']
    print(spoken_text)
    spoken_phonemes = text2phoneme(spoken_text)
    reference_phonemes = text2phoneme(text)
    # Argument order is (expected=transcription, actual=reference) so
    # the scores line up with the reference text's words.
    scores = rate_pronunciation(spoken_phonemes, reference_phonemes)

    return list(zip(text.split(), scores))
|
54 |
+
|
55 |
+
if __name__ == '__main__':
    # Smoke test: phonemize a reference sentence, transcribe a local
    # recording, and print the per-word pronunciation scores.
    text = 'i have ADHD '
    text = text2phoneme(text)
    file_path = r'user_recording.wav'
    trans = transcribe(file_path)['text']
    print(trans)
    trans = text2phoneme(trans)
    print('base:', text)
    print('predicted:', trans)
    # Argument order (transcription, reference) matches
    # compare_audio_with_text, so scores align with the reference words.
    result = rate_pronunciation(trans, text)
    print(result)
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
espeak-ng
|
2 |
+
phonemizer
|
3 |
+
wave
|
4 |
+
torch
|
5 |
+
openai-whisper
|
6 |
+
gradio
|
7 |
+
numpy
|
8 |
+
resampy
|
9 |
+
Levenshtein
|