seba3y committed on
Commit
86a5e7d
1 Parent(s): 3be22dd

Upload 3 files

Files changed (3)
  1. app.py +63 -90
  2. logic.py +66 -0
  3. requirements.txt +9 -0
app.py CHANGED
@@ -1,108 +1,81 @@
  import gradio as gr
- from audio import predict_all  # This is your custom module for predictions
- import re  # Regular expressions for text processing
-
- # Additional CSS for styling the confidence bars and the result layout
- additional_css = """
- /* CSS for the confidence bars */
- .confidence-section {
-     display: flex;
-     align-items: center;
-     margin-top: 10px;
- }
-
- .confidence-label {
-     margin-right: 10px;
-     font-weight: bold;
- }
-
- .confidence-bar {
-     height: 20px;
-     width: 100%;
-     background-color: #eee;
-     border-radius: 10px;
-     margin: 10px 0;
- }
-
- .confidence-fill {
-     height: 100%;
-     border-radius: 10px;
-     background-color: #4caf50; /* Change color based on confidence level if desired */
-     text-align: center;
-     color: white;
-     line-height: 20px;
- }
- /* Additional CSS for styling the rest of your results */
- """
-
- # Function to generate custom HTML for the confidence bar
- def custom_confidence_bar(confidence):
-     color = "#4caf50" if confidence > 75 else "#FFC107" if confidence > 50 else "#F44336"
-     return f"""
-     <div class="confidence-section">
-         <span class="confidence-label">Model Confidence:</span>
-         <div class="confidence-bar">
-             <div class="confidence-fill" style="width: {confidence}%; background-color: {color};">
-                 {confidence}%
-             </div>
-         </div>
-     </div>
-     """
-
- # Function to extract the score level from a prediction message
- def extract_score_level(message):
-     match = re.search(r'Score: (\d+)-(\d+)', message)
-     score_level = f"{match.group(1)} of 10" if match else "N/A"
-     return score_level
-
- def message_markdown(label, message, task, score_level):
-     md = f'''# {label}
- **Model Prediction:** {message}
-
- **{task} Score:** {score_level}
- '''
-     return md
-
- # Function to process the audio file and analyze it
- def analyze_audio(audio_data):
-     # predict_all returns a (message, confidence) tuple for each of accuracy and fluency
-     accuracy, fluency = predict_all(audio_data)
-
-     # Unpack the results
-     accuracy_message, accuracy_confidence = accuracy
-     fluency_message, fluency_confidence = fluency
-
-     # Extract the score level from each message
-     accuracy_score = extract_score_level(accuracy_message)
-     fluency_score = extract_score_level(fluency_message)
-
-     # Remove the score level from the original message
-     accuracy_message = accuracy_message.split(",")[1].strip() if "," in accuracy_message else accuracy_message
-     fluency_message = fluency_message.split(",")[1].strip() if "," in fluency_message else fluency_message
-
-     # Generate the confidence bar HTML
-     accuracy_confidence_html = custom_confidence_bar(accuracy_confidence * 100)
-     fluency_confidence_html = custom_confidence_bar(fluency_confidence * 100)
-
-     accuracy_markdown = message_markdown('Accuracy of Pronunciation', accuracy_message, 'Pronunciation', accuracy_score)
-     fluency_markdown = message_markdown('Speaker Fluency', fluency_message, 'Fluency', fluency_score)
-
-     return accuracy_markdown, accuracy_confidence_html, fluency_markdown, fluency_confidence_html
+ from logic import compare_audio_with_text
+ from scipy.io import wavfile
+
+
+ def create_html_from_scores(word_scores):
+     html_output = ''
+     # Color each word by its score: 1 = bad (red), 2 = understandable (orange), 3 = good (green)
+     for word, score in word_scores:
+         if score == 1:
+             html_output += f'<span style="color: red;">{word}</span> '
+         elif score == 2:
+             html_output += f'<span style="color: orange;">{word}</span> '
+         else:
+             html_output += f'<span style="color: green;">{word}</span> '
+     return html_output
+
+
+ def analyze_audio(text, audio):
+     # Write the recorded audio to a temporary WAV file; audio is a (sample_rate, data) tuple
+     temp_filename = 'temp_audio.wav'
+     wavfile.write(temp_filename, audio[0], audio[1])
+
+     result = compare_audio_with_text(temp_filename, text)
+     html_content = create_html_from_scores(result)
+     html_with_css = f"""
+     <style>
+     .legend {{
+         font-size: 22px;
+         display: flex;
+         align-items: center;
+         gap: 12px;
+     }}
+
+     .legend-dot {{
+         height: 15px;
+         width: 15px;
+         border-radius: 50%;
+         display: inline-block;
+     }}
+
+     .good {{ color: #28a745; }}
+     .average {{ color: #ffc107; }}
+     .bad {{ color: #dc3545; }}
+
+     .text {{ font-size: 20px; }}
+     </style>
+
+     <div class="legend">
+         <span class="legend-dot" style="background-color: #28a745;"></span><span>Good</span>
+         <span class="legend-dot" style="background-color: #ffc107;"></span><span>Understandable</span>
+         <span class="legend-dot" style="background-color: #dc3545;"></span><span>Bad</span>
+     </div>
+
+     <p class="text">
+         {html_content}
+     </p>
+     """
+     return html_with_css
  
  # Define the Gradio interface
- iface = gr.Interface(
-     fn=analyze_audio,
-     inputs=gr.Audio(label="Upload Audio"),
-     outputs=[
-         gr.Markdown(label="Accuracy Score Level"),
-         gr.HTML(label="Accuracy Confidence"),
-         gr.Markdown(label="Fluency Score Level"),
-         gr.HTML(label="Fluency Confidence"),
-     ],
-     css=additional_css,
-     title="Audio Analysis Tool",
-     description="Upload an audio file to analyze its accuracy and fluency."
- )
+ iface = gr.Interface(fn=analyze_audio,
+                      inputs=[gr.Textbox(label='Training Text', placeholder='Write the text for the pronunciation task', interactive=True, visible=True, show_copy_button=True),
+                              gr.Audio(label="Upload Audio")],
+                      outputs=[gr.HTML(label="Analysis of pronunciation")],
+                      # css=additional_css,
+                      # title="Audio Analysis Tool",
+                      description="Upload an audio file to analyze pronunciation accuracy and speech fluency.")
  
  # Run the Gradio app
  if __name__ == "__main__":
      iface.launch()
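
For reference, a minimal way to exercise the new callback outside the UI — a sketch assuming a local recording named `sample.wav` (hypothetical) and the `(sample_rate, data)` tuple that `gr.Audio` passes to the function by default:

```python
# Minimal smoke test for analyze_audio, run from the repo root.
# "sample.wav" is a hypothetical local recording; any WAV file works.
from scipy.io import wavfile

from app import analyze_audio  # importing app builds iface but does not launch it

if __name__ == "__main__":
    sample_rate, data = wavfile.read("sample.wav")
    html = analyze_audio("i have ADHD", (sample_rate, data))
    print(html)  # the legend plus the color-coded words, as raw HTML
```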
logic.py ADDED
@@ -0,0 +1,66 @@
+ from phonemizer.separator import Separator
+ from phonemizer import phonemize
+ # from phonemizer.backend.espeak.wrapper import EspeakWrapper
+ from Levenshtein import distance as levenshtein_distance
+
+ import whisper
+ import torch
+
+ device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
+
+ model = whisper.load_model("base.en", device=device)
+ separator = Separator(phone=None, word=' | ')
+
+ # EspeakWrapper.set_library(r"C:\Program Files\eSpeak NG\libespeak-ng.dll")
+
+ def transcribe(audio):
+     result = model.transcribe(audio, word_timestamps=False, no_speech_threshold=0.4, compression_ratio_threshold=2, temperature=0)
+     return {'language': result['language'], 'text': result['text']}
+
+ def text2phoneme(text):
+     return phonemize(text.lower(), backend='espeak', separator=separator, strip=True, with_stress=False, tie=False, language='en-us')
+
+ def rate_pronunciation(expected_phonemes, actual_phonemes):
+     expected_phonemes = expected_phonemes.split(" | ")
+     actual_phonemes = actual_phonemes.split(" | ")
+     results = []
+     # Score each word by the Levenshtein distance between its phoneme string
+     # and the closest candidate in a small window of the expected sequence.
+     for i, base_word in enumerate(actual_phonemes):
+         best_dist = float('inf')
+         error_threshold = len(base_word) * 0.45
+         # Search a window around position i to tolerate inserted/dropped words
+         for pred_word_id in range(max(0, i - 2), i + min(6, len(expected_phonemes) - i)):
+             dist = levenshtein_distance(expected_phonemes[pred_word_id], base_word)
+             if dist < best_dist:
+                 best_dist = dist
+             if best_dist == 0:  # Early stopping on perfect match
+                 break
+         if best_dist == 0:
+             results.append(3)
+         elif best_dist <= error_threshold:
+             results.append(2)
+         else:
+             results.append(1)
+     return results
+
+ def compare_audio_with_text(audio, text):
+     transcription = transcribe(audio)['text']
+     print(transcription)
+     transcription = text2phoneme(transcription)
+     text_phone = text2phoneme(text)
+     scores = rate_pronunciation(transcription, text_phone)
+
+     result = [(word, s) for word, s in zip(text.split(), scores)]
+     return result
+
+ if __name__ == '__main__':
+     text = 'i have ADHD'
+     text = text2phoneme(text)
+     file_path = r'user_recording.wav'
+     trans = transcribe(file_path)['text']
+     print(trans)
+     trans = text2phoneme(trans)
+     print('base:', text)
+     print('predicted:', trans)
+     result = rate_pronunciation(trans, text)
+     print(result)
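
A worked example of the three scoring bands in `rate_pronunciation`, using hand-written phoneme strings in the `' | '`-separated format that `text2phoneme` produces (the IPA here is illustrative, not actual espeak output):

```python
from logic import rate_pronunciation

# Hypothetical phonemized strings, words joined by ' | ' as text2phoneme does.
transcribed = "həloʊ | wɔːld"  # what the speaker said, with one vowel slip
reference = "həloʊ | wɜːld"    # the target text

# 'həloʊ' matches exactly -> 3; 'wɜːld' is 1 edit from 'wɔːld', within the
# 45%-of-word-length threshold (5 * 0.45 = 2.25) -> 2.
print(rate_pronunciation(transcribed, reference))  # [3, 2]
```

Note the argument order used by `compare_audio_with_text`: the transcription's phonemes go first, so the returned scores line up with the words of the reference text, which is what `create_html_from_scores` colors.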
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ espeak-ng
+ phonemizer
+ wave
+ torch
+ openai-whisper
+ gradio
+ numpy
+ resampy
+ Levenshtein
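
Two caveats about this list: phonemizer's espeak backend needs the espeak-ng library installed on the system, not just via pip (hence the commented-out `EspeakWrapper.set_library` line in logic.py for Windows), and app.py imports `scipy` (`scipy.io.wavfile`), which is not listed here and is worth adding. A quick pre-launch check, assuming phonemizer's `EspeakBackend.is_available()` helper:

```python
# Prints False when the system espeak-ng library cannot be found.
from phonemizer.backend import EspeakBackend

print(EspeakBackend.is_available())
```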