"""Gradio app for practising the pronunciation of a word.

The user types a word, can listen to a synthesized reference pronunciation,
records their own attempt, and receives the recognized text together with a
similarity score between the typed word and what was recognized.
"""

import os
import tempfile
from difflib import SequenceMatcher

import gradio as gr
import speech_recognition as sr
from gtts import gTTS


def tts(word):
    """Synthesize *word* as English speech and return the path of the MP3.

    Args:
        word: Text to synthesize.

    Returns:
        Path of a newly created temporary ``.mp3`` file. The file is not
        deleted by this function; the caller owns it.

    Raises:
        Whatever ``gTTS`` raises when the text cannot be synthesized.
    """
    speech = gTTS(text=word, lang="en")

    # mkstemp creates the file atomically; the deprecated mktemp only returns
    # a name, which another process could claim before we write to it.
    fd, temp_file_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)  # gTTS reopens the file by name
    try:
        speech.save(temp_file_path)
    except Exception:
        # Do not leave an empty or partial file behind on failure.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)
        raise
    return temp_file_path


def _transcribe(audio_path):
    """Transcribe an audio file, keeping failures separate from real text.

    Args:
        audio_path: Path of the audio file to transcribe, or a falsy value
            when nothing was recorded.

    Returns:
        A ``(text, error)`` pair in which exactly one element is ``None``:
        ``text`` is the recognized speech on success, ``error`` is a
        human-readable message on failure.
    """
    if not audio_path:
        return None, "Could not find a recording to evaluate"

    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_path) as source:
            audio_data = recognizer.record(source)
        text = recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return None, "Could not understand the audio"
    except sr.RequestError as e:
        return None, f"Could not request results from Google Speech Recognition service; {e}"
    except Exception as e:
        # UI boundary: report any other failure (e.g. an unreadable file) to
        # the user as text instead of crashing the event handler.
        return None, str(e)
    return text, None


def recognize_speech_from_microphone(audio_path):
    """Return the text recognized in the audio file at *audio_path*.

    Args:
        audio_path: Path of the recorded audio file.

    Returns:
        The recognized text, or an error message when recognition failed.
        Callers that must tell the two apart should use ``_transcribe``.
    """
    text, error = _transcribe(audio_path)
    return text if error is None else error


def calculate_similarity(word, recognized_text):
    """Return the similarity of two strings as a percentage (0-100).

    The comparison is case-insensitive and ignores leading/trailing
    whitespace, so a stray space typed after the word does not lower the
    score.

    Args:
        word: The target word.
        recognized_text: The text recognized from the user's recording.

    Returns:
        ``difflib.SequenceMatcher`` ratio of the two strings times 100.
    """
    expected = word.strip().lower()
    actual = recognized_text.strip().lower()
    return SequenceMatcher(None, expected, actual).ratio() * 100


def process_audio(word, audio_path):
    """Recognize the recording and score it against *word*.

    Args:
        word: The target word.
        audio_path: Path of the recorded audio file.

    Returns:
        ``(recognized_text, similarity)``. When recognition fails, the first
        element is the error message and the similarity is ``0.0``.
    """
    text, error = _transcribe(audio_path)
    # Failure is signalled out-of-band: a transcript that merely contains
    # words such as "Error" or "Could not" must still be scored normally.
    if error is not None:
        return error, 0.0
    return text, calculate_similarity(word, text)


def evaluate_pronunciation(word):
    """Click handler: produce the reference pronunciation audio for *word*.

    Args:
        word: Content of the word textbox.

    Returns:
        Path of the synthesized MP3, or ``None`` when the textbox is blank
        (there is nothing to synthesize).
    """
    if not word or not word.strip():
        return None
    return tts(word)


def process_all(word, audio_path):
    """Click handler: return the recognized text and its similarity score.

    Args:
        word: Content of the word textbox.
        audio_path: Path of the user's recording.

    Returns:
        ``(recognized_text, similarity)`` as produced by ``process_audio``.
    """
    return process_audio(word, audio_path)


# NOTE(review): the row nesting below is reconstructed from flattened source;
# confirm it matches the intended layout.
with gr.Blocks() as demo:
    # Row 1: target word and its reference pronunciation.
    with gr.Row():
        word_input = gr.Textbox(label="Enter the word for pronunciation")
        tts_button = gr.Button("Listen to the word")
        tts_audio = gr.Audio(label="Original Pronunciation", type="filepath")

    # Row 2: the user's attempt and the evaluation results.
    with gr.Row():
        mic_input = gr.Audio(label="Your Pronunciation", type="filepath")
        result_button = gr.Button("Evaluate Pronunciation")
        recognized_text = gr.Textbox(label="Recognized Text")
        similarity_score = gr.Number(label="Similarity (%)")

    tts_button.click(evaluate_pronunciation, inputs=word_input, outputs=tts_audio)
    result_button.click(
        process_all,
        inputs=[word_input, mic_input],
        outputs=[recognized_text, similarity_score],
    )

# Start the server only when run as a script, so importing this module
# (e.g. to reuse the helpers) does not launch it.
if __name__ == "__main__":
    demo.launch()