#@markdown Language Application: WER, Fluency (in N of pauses), WPM (Words per minute)
import os
import tempfile

import gradio as gr
import librosa
import soundfile as sf
import speech_recognition as sr
from Levenshtein import distance as lev_distance, ratio

# A silent gap between two stretches of speech must last longer than this
# many seconds to be counted as a pause.
MIN_PAUSE_SECONDS = 0.5

# Value passed as ``top_db`` to ``librosa.effects.split`` when separating
# speech from silence.
SILENCE_TOP_DB = 32

# (minimum similarity, message) pairs, checked from the highest threshold down.
_FEEDBACK_LEVELS = (
    (0.9, "Excellent pronunciation!"),
    (0.7, "Good pronunciation!"),
    (0.5, "Needs improvement."),
)
_LOWEST_FEEDBACK = "Poor pronunciation, try to focus more on clarity."


def _count_pauses(non_silent_intervals, sample_rate,
                  min_pause_seconds=MIN_PAUSE_SECONDS):
    """Count the silent gaps between consecutive non-silent intervals.

    Args:
        non_silent_intervals: Iterable of ``(start, end)`` sample indices of
            the non-silent stretches, in chronological order (the shape
            returned by ``librosa.effects.split``).
        sample_rate: Sampling rate, in Hz, that the indices refer to.
        min_pause_seconds: Gaps must be strictly longer than this to count.

    Returns:
        The number of gaps longer than ``min_pause_seconds``. Silence before
        the first interval and after the last one is not counted.
    """
    min_gap_samples = min_pause_seconds * sample_rate
    spans = [(int(start), int(end)) for start, end in non_silent_intervals]
    return sum(
        1
        for (_, previous_end), (next_start, _) in zip(spans, spans[1:])
        if next_start - previous_end > min_gap_samples
    )


def analyze_speech(file_info):
    """Transcribe a recording and measure its pauses and duration.

    Args:
        file_info: ``(sample_rate, samples)`` pair as produced by
            ``gr.Audio(type="numpy")``.

    Returns:
        A tuple ``(text, num_pauses, duration, word_count)``: the recognized
        text, the number of pauses longer than ``MIN_PAUSE_SECONDS``, the
        duration in seconds and the number of whitespace-separated words in
        the recognized text.

    Raises:
        sr.UnknownValueError: If the recognizer cannot understand the audio.
        sr.RequestError: If the recognition service cannot be reached.
    """
    sample_rate, samples = file_info
    recognizer = sr.Recognizer()

    # A file inside a temporary directory can be reopened by name on every
    # platform; an open NamedTemporaryFile cannot be reopened on Windows.
    with tempfile.TemporaryDirectory() as workdir:
        wav_path = os.path.join(workdir, "recording.wav")
        # Write with the rate reported for this recording, so that duration,
        # pause lengths and WPM are computed on the true time scale.
        sf.write(wav_path, data=samples, samplerate=sample_rate, format='WAV')

        # sr=None keeps the file's own sampling rate instead of resampling.
        y, sr_lib = librosa.load(wav_path, sr=None)
        duration = librosa.get_duration(y=y, sr=sr_lib)

        # split() yields the NON-silent intervals; the pauses are the gaps
        # between them.
        speech_intervals = librosa.effects.split(y, top_db=SILENCE_TOP_DB)
        num_pauses = _count_pauses(speech_intervals, sr_lib)

        with sr.AudioFile(wav_path) as source:
            audio_data = recognizer.record(source)

    text = recognizer.recognize_google(audio_data)
    return text, num_pauses, duration, len(text.split())


def calculate_wer(reference, hypothesis):
    """Return the word error rate of ``hypothesis`` against ``reference``.

    Both strings are split on whitespace and the edit distance between the
    two word lists is divided by the number of reference words.

    NOTE(review): this passes lists of words to ``Levenshtein.distance``,
    which requires a release of the package that accepts sequences rather
    than only strings - confirm the installed version.

    Returns:
        The word error rate, or ``float('inf')`` when ``reference`` contains
        no words.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    if not ref_words:
        # Avoid division by zero.
        return float('inf')
    return lev_distance(ref_words, hyp_words) / len(ref_words)


def _feedback_for(similarity):
    """Map a similarity ratio to the feedback message shown to the user."""
    for threshold, message in _FEEDBACK_LEVELS:
        if similarity >= threshold:
            return message
    return _LOWEST_FEEDBACK


def pronunciation_correction(expected_text, file_info):
    """Compare a recording against the expected text.

    Args:
        expected_text: The text the speaker was supposed to read.
        file_info: ``(sample_rate, samples)`` pair from the audio component,
            or ``None`` when no audio was provided.

    Returns:
        A ``(feedback, description)`` pair of strings. ``description`` holds
        the WER, pause count and words per minute; it is empty when the
        inputs are missing or the speech could not be recognized.
    """
    if not expected_text or not expected_text.strip():
        return "Please enter the text you are going to read.", ""
    if file_info is None:
        return "Please upload an audio file first.", ""

    try:
        user_spoken_text, num_pauses, duration, total_words = analyze_speech(file_info)
    except sr.UnknownValueError:
        return "Could not understand the audio, please try again.", ""
    except sr.RequestError as err:
        return f"Speech recognition service is unavailable: {err}", ""

    expected_lower = expected_text.lower()
    spoken_lower = user_spoken_text.lower()

    wer = calculate_wer(expected_lower, spoken_lower)
    wpm = total_words / (duration / 60) if duration > 0 else 0
    similarity = ratio(expected_lower, spoken_lower)

    feedback = _feedback_for(similarity)
    description = f"WER: {wer:.2f}, Fluency: {num_pauses} pauses, {wpm:.0f} WPM"
    return feedback, description


# NOTE(review): the original layout nesting was lost when the file was
# flattened; placing only the two inputs inside the row is an assumption.
with gr.Blocks() as app:
    with gr.Row():
        text_input = gr.Textbox(label="Enter or paste your text here")
        audio_input = gr.Audio(label="Upload Audio File", type="numpy")
    check_pronunciation_button = gr.Button("Check Pronunciation")
    pronunciation_feedback = gr.Textbox(label="Pronunciation Feedback")
    pronunciation_details = gr.Textbox(label="Detailed Metrics")

    check_pronunciation_button.click(
        pronunciation_correction,
        inputs=[text_input, audio_input],
        outputs=[pronunciation_feedback, pronunciation_details]
    )

app.launch(debug=True)