seba3y committed
Commit c13f0a5
1 Parent(s): 2a7e44d

Upload 5 files

Files changed (4):
  1. app.py +88 -21
  2. logic.py +14 -8
  3. requirements.txt +2 -1
  4. scoring.py +121 -0
app.py CHANGED
@@ -1,22 +1,35 @@
 import gradio as gr
-from logic import compare_audio_with_text
+from logic import Speaker_speech_analysis
 from scipy.io import wavfile
 
 
 
 def create_html_from_scores(word_scores):
     html_output = ''
-
-    # Ensure the number of words and scores match
-
     for word, score in word_scores:
         if score == 1:
-            html_output += f'<span style="color: red;">{word}</span> '
+            html_output += f'<span style="color: #dc3545;">{word}</span> '
         elif score == 2:
-            html_output += f'<span style="color: orange;">{word}</span> '
+            html_output += f'<span style="color: #ffc107;">{word}</span> '
         else:
-            html_output += f'<span style="color: green;">{word}</span> '
+            html_output += f'<span style="color: #28a745;">{word}</span> '
     return html_output
+
+def generate_progress_bar(score, label):
+    score = round(score, 2)
+    score_text = f"{score:.2f}" if score < 90 else "90"
+    bar_color = "#dc3545" if score < 30 else "#ffc107" if score < 60 else "#28a745"
+    bar_length = f"{(score / 90) * 100}%"
+    return f"""
+    <div class="progress-label">{label}:</div>
+    <div class="progress-container">
+        <div class="progress-bar" style="width: {bar_length}; background-color: {bar_color};">
+            <div class="progress-score">{score_text}</div>
+        </div>
+    </div>
+    <div class="progress-max">Max: 90</div>
+    """
+# CSS to be used in the Gradio Interface
 
 
 
@@ -27,8 +40,16 @@ def analyze_audio(text, audio):
     wavfile.write(temp_filename, audio[0], audio[1])
 
 
-    result = compare_audio_with_text(temp_filename, text)
-    html_content = create_html_from_scores(result)
+    result = Speaker_speech_analysis(temp_filename, text)
+    accuracy_score = result['pronunciation_accuracy']
+    fluency_score = result['fluency_score']
+    word_scores = result['word_scores']
+
+    html_content = create_html_from_scores(word_scores)
+    pronunciation_progress_bar = generate_progress_bar(accuracy_score, "Pronunciation Accuracy")
+    fluency_progress_bar = generate_progress_bar(fluency_score, "Fluency Score")
+
+
     html_with_css = f"""
     <style>
     .legend {{
@@ -39,19 +60,62 @@ def analyze_audio(text, audio):
     }}
 
     .legend-dot {{
-        height: 15px;
-        width: 15px;
-        border-radius: 50%;
-        display: inline-block;
+        height: 15px;
+        width: 15px;
+        border-radius: 50%;
+        display: inline-block;
+    }}
+
+    .good {{ color: #28a745;
+    }}
+    .average {{ color: #ffc107;
+    }}
+    .bad {{ color: #dc3545;
     }}
+
+    .text {{
+        font-size: 20px;
+        margin-bottom: 20px;
+    }}
+
+    .progress-container {{
+        width: 100%;
+        background-color: #ddd;
+        border-radius: 13px;
+        overflow: hidden;
+    }}
+
+    .progress-bar {{
+        height: 30px;
+        line-height: 30px;
+        text-align: center;
+        font-size: 16px;
+        border-radius: 15px;
+        transition: width 1s ease;
+    }}
+
+    .progress-label {{
+        font-weight: bold;
+        font-size: 22px;
+        margin-bottom: 20px;
+        margin-top: 5px;
+        text-align: center;
+    }}
+
+    .progress-score {{
+        display: inline-block;
+        color: black;
+    }}
+
+    .progress-max {{
+        text-align: right;
+        margin: 10px;
+        font-size: 16px;
+    }}
+
+    </style>
 
-    .good {{ color: #28a745; }}
-    .average {{ color: #ffc107; }}
-    .bad {{ color: #dc3545; }}
 
-    .text {{ font-size: 20px; }}
-    </style>
-    <h1> Word Pronunciation scores </h1>
     <div class="legend">
     <span class="legend-dot" style="background-color: #28a745;"></span><span>Good</span>
     <span class="legend-dot" style="background-color: #ffc107;"></span><span>Understandable</span>
@@ -61,17 +125,20 @@ def analyze_audio(text, audio):
     <p class="text">
     {html_content}
    </p>
+
+    {pronunciation_progress_bar}
+    {fluency_progress_bar}
     """
     return html_with_css
 
 # Define the Gradio interface
 iface = gr.Interface(fn=analyze_audio,
                      inputs=[gr.Textbox(label='Training Text', placeholder='Write the text for the pronunciation task', interactive=True, visible=True, show_copy_button=True,),
-                             gr.Audio(label="Recorded Audio")
+                             gr.Audio(label="Recorded Audio", sources=['microphone', 'upload'])
                      ],
                      outputs=[gr.HTML(label="Analysis of pronunciation"),
                      ],
-                     # css=additional_css,
+                     # css=additional_css,
                      # title="Audio Analysis Tool",
                      description="Write any text and record audio to predict pronunciation errors"
                      )
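
A quick sanity check of the bar geometry in `generate_progress_bar` (a minimal sketch, not part of the commit; the 30/60 color cutoffs and the 90-point cap come from the function above):

def bar_geometry(score):
    # Same thresholds as generate_progress_bar: red below 30, yellow below 60, green otherwise.
    color = "#dc3545" if score < 30 else "#ffc107" if score < 60 else "#28a745"
    # Scores are capped at 90, so the bar fills score/90 of its container.
    width = f"{(score / 90) * 100}%"
    return color, width

print(bar_geometry(75))  # ('#28a745', '83.33333333333334%')
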
logic.py CHANGED
@@ -2,6 +2,7 @@ from phonemizer.separator import Separator
 from phonemizer import phonemize
 # from phonemizer.backend.espeak.wrapper import EspeakWrapper
 from Levenshtein import distance as levenshtein_distance
+from scoring import calculate_fluency_and_pronunciation
 
 import whisper
 import torch
@@ -9,7 +10,7 @@ import torch
 device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
 
 model = whisper.load_model("base.en", device=device)
-separator = Separator(phone=None, word=' | ',)
+separator = Separator(phone=None, word='',)
 
 # EspeakWrapper.set_library(r"C:\Program Files\eSpeak NG\libespeak-ng.dll")
 
@@ -42,15 +43,20 @@ def rate_pronunciation(expected_phonemes, actual_phonemes):
         results.append(1)
     return results
 
-def compare_audio_with_text(audio, text):
-    transcribtion = transcribe(audio)['text']
-    print(transcribtion)
-    transcribtion = text2phoneme(transcribtion)
+
+
+
+def Speaker_speech_analysis(audio_path, text):
+    pre_transcribtion = transcribe(audio_path)['text']
+    print(pre_transcribtion)
+    transcribtion = text2phoneme(pre_transcribtion)
     text_phone = text2phoneme(text)
     scores = rate_pronunciation(transcribtion, text_phone)
-
-    result = [(word, s) for word, s in zip(text.split(), scores)]
-    return result
+    FP_scores = calculate_fluency_and_pronunciation(audio_path, pre_transcribtion, scores, len(text.split()))
+    word_scores = [(word, s) for word, s in zip(text.split(), scores)]
+
+    FP_scores['word_scores'] = word_scores
+    return FP_scores
 
  if __name__ == '__main__':
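
A hedged usage sketch of the new entry point (the path and text are hypothetical placeholders; the return keys follow `Speaker_speech_analysis` above and `calculate_fluency_and_pronunciation` in scoring.py):

from logic import Speaker_speech_analysis

# 'sample.wav' is a placeholder recording of the reference text.
result = Speaker_speech_analysis('sample.wav', 'hello world')
print(result['fluency_score'])           # float, roughly in the 10-90 range
print(result['pronunciation_accuracy'])  # one of 10, 30, 50, 70, 90
print(result['word_scores'])             # e.g. [('hello', 3), ('world', 2)]
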
requirements.txt CHANGED
@@ -6,4 +6,5 @@ gradio
 scipy
 numpy
 resampy
-Levenshtein
+Levenshtein
+librosa
scoring.py ADDED
@@ -0,0 +1,121 @@
+import numpy as np
+import librosa
+
+def calculate_expected_value(scores):
+    """
+    Calculate the expected value for a list of outcomes (scores), assuming each unique score
+    occurs with a frequency proportional to its count in the list.
+
+    :param scores: List of outcomes (numeric values).
+    :return: The expected value (a weighted average of all possible outcomes).
+    """
+    # First calculate the probability of each unique score
+    unique_scores, counts = np.unique(scores, return_counts=True)
+    probabilities = counts / len(scores)
+
+    # Then calculate the expected value as the sum of scores times their probabilities
+    expected_value = np.dot(unique_scores, probabilities)
+    return expected_value
+
+
+def calculate_fluency_score(audio_path, transcription, word_pronunciation_scores, base_script_len):
+    total_words = len(transcription.split())
+    avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)
+    if (total_words / base_script_len) < 0.15 or avg_pronunciation_score < 1.3:
+        return 10
+    audio, sr = librosa.load(audio_path)
+    non_silent_intervals = librosa.effects.split(audio, top_db=22)
+    non_silent_duration = sum([intv[1] - intv[0] for intv in non_silent_intervals]) / sr
+
+    total_duration = len(audio) / sr
+
+
+    non_silent_duration = non_silent_duration if total_words > 4 else 0
+    ideal_min_rate, ideal_max_rate = 120 / 60, 140 / 60
+    actual_speech_rate = (total_words / (non_silent_duration + 1e-7)) * (total_words / base_script_len)
+    speaking_ratio = non_silent_duration / total_duration
+    # Existing speech rate score calculation
+
+    # Determine if the speech rate is within the ideal range
+    if ideal_min_rate <= actual_speech_rate <= ideal_max_rate:
+        # Within the ideal range
+        speech_rate_score = 1
+    else:
+        # Outside the ideal range, the score is proportional to how close it is to the range
+        if actual_speech_rate < ideal_min_rate:
+            # Too slow
+            speech_rate_score = actual_speech_rate / ideal_min_rate
+        else:
+            # Too fast
+            speech_rate_score = 2 - (actual_speech_rate / ideal_max_rate)
+        # Clamp the score between 0 and 1
+        speech_rate_score = max(0, min(speech_rate_score, 1))
+
+    # If the speaking ratio is significantly less than the gold standard, reduce the fluency score
+    gold_standard_ratio = 0.9  # Assuming 90% speaking time is the gold standard for natural speech
+    speaking_ratio_score = min(speaking_ratio / gold_standard_ratio, 1)
+
+
+    # Pronunciation score calculation
+    avg_pronunciation_score = (avg_pronunciation_score - 1) / 2
+    pronunciation_variance = np.var(word_pronunciation_scores, ddof=1,)
+
+    # Weighted combination of scores
+    # Adjust weights as needed
+    weight_speech_rate = 0.20
+    weight_speaking_ratio = 0.20
+    weight_pronunciation = 0.50
+    weight_pronunciation_variance = 0.10
+
+    combined_score = (speech_rate_score * weight_speech_rate +
+                      speaking_ratio_score * weight_speaking_ratio +
+                      avg_pronunciation_score * weight_pronunciation +
+                      (1 / (1 + pronunciation_variance)) * weight_pronunciation_variance)
+
+    # Scale the combined score to be between 10 and 90
+    scaled_fluency_score = 10 + combined_score * 80
+
+    return scaled_fluency_score
+
+def calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len):
+    if len(word_pronunciation_scores) / base_script_len < 0.25:
+        return 10
+    # Calculate the average word pronunciation score
+    avg_pronunciation_score = calculate_expected_value(word_pronunciation_scores)
+    print(avg_pronunciation_score)
+    # Adjust the pronunciation score based on fluency
+    # fluency_score = fluency_score / 100
+    # This is a simplistic adjustment; it can be refined based on more detailed analysis
+    fluency_adjustment = fluency_score / 100
+    print(fluency_adjustment)
+    adjusted_pronunciation_score = avg_pronunciation_score * fluency_adjustment
+    print(adjusted_pronunciation_score)
+    # Map to a 0-5 scale based on the score guide
+    # These thresholds can be adjusted based on empirical data or further analysis
+    if adjusted_pronunciation_score >= 2.4:
+        score_guide_level = 5
+    elif adjusted_pronunciation_score >= 1.7:
+        score_guide_level = 4
+    elif adjusted_pronunciation_score >= 1.0:
+        score_guide_level = 3
+    elif adjusted_pronunciation_score >= 0.5:
+        score_guide_level = 2
+    else:
+        score_guide_level = 1
+
+    # Scale to 10% - 90%
+    final_score = 10 + (score_guide_level - 1) * 20  # Scale each level to a range of 20%
+
+    return final_score
+
+def calculate_fluency_and_pronunciation(audio_path, transcription, word_pronunciation_scores, base_script_len):
+
+    fluency_score = calculate_fluency_score(audio_path, transcription, word_pronunciation_scores, base_script_len)
+
+    pronunciation_accuracy = calculate_pronunciation_accuracy(word_pronunciation_scores, fluency_score, base_script_len)
+
+    return {'fluency_score': fluency_score, 'pronunciation_accuracy': pronunciation_accuracy}
+
+
+if __name__ == '__main__':
+    pass
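
A small worked example of `calculate_expected_value`, which with empirical frequencies is just a frequency-weighted mean (the input ratings are assumed, using the 1-3 scale produced by rate_pronunciation in logic.py):

import numpy as np

scores = [3, 3, 2, 1]  # assumed per-word ratings: 1 = bad, 2 = understandable, 3 = good
unique, counts = np.unique(scores, return_counts=True)  # unique = [1, 2, 3], counts = [1, 1, 2]
probs = counts / len(scores)                            # [0.25, 0.25, 0.5]
expected = np.dot(unique, probs)                        # 1*0.25 + 2*0.25 + 3*0.5 = 2.25
assert expected == np.mean(scores)  # the weighted sum reduces to the plain mean

The same arithmetic fixes the output range of `calculate_fluency_score`: each of the four components lies in [0, 1] and the weights sum to 1.0, so `combined_score` stays in [0, 1] and `10 + combined_score * 80` lands in the 10-90 range shown by the app's progress bars.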