speechfeedback / app.py
MK-316's picture
Create app.py
a0a375f verified
raw
history blame contribute delete
No virus
2.81 kB
#@markdown Language application: WER (word error rate), fluency (number of pauses), and WPM (words per minute)
import gradio as gr
import speech_recognition as sr
from Levenshtein import distance as lev_distance, ratio
import tempfile
import soundfile as sf
import librosa
def analyze_speech(file_info, *, top_db=32, min_pause=0.5):
    """Transcribe a recording and measure its pauses, duration, and word count.

    Args:
        file_info: ``(sample_rate, samples)`` pair, the shape Gradio's
            ``gr.Audio(type="numpy")`` component hands to its callback.
        top_db: Threshold (in dB below the peak) passed to
            ``librosa.effects.split`` to decide what counts as silence.
        min_pause: Minimum length, in seconds, of a silent gap between two
            stretches of speech for it to be counted as a pause.

    Returns:
        Tuple ``(text, num_pauses, duration, word_count)`` where ``text`` is
        the transcript returned by ``recognize_google``, ``num_pauses`` is the
        number of silent gaps longer than ``min_pause``, ``duration`` is the
        clip length in seconds, and ``word_count`` is the number of
        whitespace-separated tokens in ``text``.

    Raises:
        Errors from the recogniser (for example ``sr.UnknownValueError`` when
        nothing intelligible is heard, or ``sr.RequestError`` when the service
        cannot be reached) propagate unchanged to the caller.
    """
    sample_rate = int(file_info[0])
    samples = file_info[1]
    recognizer = sr.Recognizer()

    with tempfile.NamedTemporaryFile(delete=True, suffix=".wav") as tmpfile:
        # Write with the recording's real sample rate; a hard-coded rate would
        # stretch or squeeze the audio and skew duration, WPM and pause timing.
        sf.write(tmpfile.name, data=samples, samplerate=sample_rate, format='WAV')

        # Reload at the file's native rate for the timing analysis.
        y, sr_lib = librosa.load(tmpfile.name, sr=None)
        duration = librosa.get_duration(y=y, sr=sr_lib)

        # librosa.effects.split yields the NON-silent [start, end) sample
        # intervals, so a pause is the gap between one interval's end and the
        # next interval's start. Leading/trailing silence is not counted.
        speech_intervals = librosa.effects.split(y, top_db=top_db)
        long_gaps = [
            next_start - prev_end
            for (_, prev_end), (next_start, _) in zip(
                speech_intervals[:-1], speech_intervals[1:]
            )
            if (next_start - prev_end) / sr_lib > min_pause
        ]
        num_pauses = len(long_gaps)

        # The temp file must still exist here, so transcription stays inside
        # the NamedTemporaryFile context.
        with sr.AudioFile(tmpfile.name) as source:
            audio_data = recognizer.record(source)
        text = recognizer.recognize_google(audio_data)

    return text, num_pauses, duration, len(text.split())
def calculate_wer(reference, hypothesis):
    """Return the word error rate of *hypothesis* measured against *reference*.

    Both strings are split on whitespace and the edit distance between the
    two token lists is divided by the number of reference tokens. An empty
    reference yields ``float('inf')`` rather than dividing by zero.
    """
    reference_tokens, hypothesis_tokens = reference.split(), hypothesis.split()
    word_edits = lev_distance(reference_tokens, hypothesis_tokens)
    if not reference_tokens:
        # No reference words to normalise by.
        return float('inf')
    return word_edits / len(reference_tokens)
def _normalize_for_comparison(text):
    """Lower-case *text*, turn punctuation into spaces, and collapse whitespace."""
    # Apostrophes are kept so contractions such as "don't" stay one word.
    cleaned = "".join(
        ch if (ch.isalnum() or ch == "'") else " " for ch in text.lower()
    )
    return " ".join(cleaned.split())


def pronunciation_correction(expected_text, file_info):
    """Score a spoken recording against the text the speaker meant to read.

    Args:
        expected_text: Reference text the speaker was supposed to say.
        file_info: Audio value forwarded to ``analyze_speech``; ``None`` when
            no audio was supplied.

    Returns:
        Tuple ``(feedback, description)``: a short verdict chosen from the
        character-level similarity ratio, and a metrics line reporting WER,
        the number of pauses, and words per minute.
    """
    # Guard the two inputs the UI can leave empty instead of crashing or
    # reporting an infinite WER.
    if file_info is None:
        return "No audio provided.", "Please record or upload audio and try again."
    if not expected_text or not expected_text.strip():
        return "No reference text provided.", "Please enter the text you are reading."

    user_spoken_text, num_pauses, duration, total_words = analyze_speech(file_info)

    # Normalise both sides the same way so that punctuation typed in the
    # reference ("world." vs "world") is not scored as a pronunciation error.
    expected_norm = _normalize_for_comparison(expected_text)
    spoken_norm = _normalize_for_comparison(user_spoken_text)

    wer = calculate_wer(expected_norm, spoken_norm)
    wpm = total_words / (duration / 60) if duration > 0 else 0
    similarity = ratio(expected_norm, spoken_norm)

    if similarity >= 0.9:
        feedback = "Excellent pronunciation!"
    elif similarity >= 0.7:
        feedback = "Good pronunciation!"
    elif similarity >= 0.5:
        feedback = "Needs improvement."
    else:
        feedback = "Poor pronunciation, try to focus more on clarity."

    description = f"WER: {wer:.2f}, Fluency: {num_pauses} pauses, {wpm:.0f} WPM"
    return feedback, description
# Gradio UI: takes a reference text and an audio clip, and shows the feedback
# verdict plus the detailed metrics returned by pronunciation_correction.
# NOTE(review): the original indentation was lost, so which components sit
# inside gr.Row() is an assumption (only the two inputs here) — confirm.
with gr.Blocks() as app:
    with gr.Row():
        text_input = gr.Textbox(label="Enter or paste your text here")
        # type="numpy": the handler receives the audio as an indexable value
        # whose element [1] is written out as the sample data.
        audio_input = gr.Audio(label="Upload Audio File", type="numpy")
    check_pronunciation_button = gr.Button("Check Pronunciation")
    pronunciation_feedback = gr.Textbox(label="Pronunciation Feedback")
    pronunciation_details = gr.Textbox(label="Detailed Metrics")
    # Clicking the button calls pronunciation_correction(text, audio) and
    # routes its two return values to the two output textboxes, in order.
    check_pronunciation_button.click(
        pronunciation_correction,
        inputs=[text_input, audio_input],
        outputs=[pronunciation_feedback, pronunciation_details]
    )
# Runs at module top level (no __main__ guard), so the app is launched
# whenever this file is executed or imported.
app.launch(debug=True)