Spaces:

teatwots
/

stt

Runtime error

File size: 2,822 Bytes

23c65ce

#@markdown Output: Accuracy Score

import gradio as gr
import speech_recognition as sr
from Levenshtein import ratio
import tempfile
import numpy as np
import soundfile as sf
import pandas as pd

# Practice sentences offered in the UI (per the original note, ordered from
# easy to hard), stored under a single "Sentences" column.
data = dict(
    Sentences=[
        "A stitch in time saves nine.",
        "To be or not to be, that is the question.",
        "Five cats were living in safe caves.",
        "Hives give shelter to bees in large caves.",
        "His decision to plant a rose was amazing.",
        "She sells sea shells by the sea shore.",
        "The colorful parrot likes rolling berries.",
        "Time flies like an arrow; fruit flies like a banana.",
        "Good things come to those who wait.",
        "All human beings are born free and equal in dignity and rights.",
    ],
)
# One row per sentence; the dict key becomes the column name.
df = pd.DataFrame.from_dict(data)

def transcribe_audio(file_info):
    """Transcribe an audio clip to text via ``Recognizer.recognize_google``.

    Args:
        file_info: ``(sample_rate, samples)`` pair as produced by a
            ``gr.Audio(type="numpy")`` input, or ``None`` when no clip was
            supplied.

    Returns:
        The recognized text on success. Otherwise a human-readable message:
        ``"No audio provided"`` for a missing clip, ``"Could not understand
        audio"`` when the speech is unintelligible, or ``"Could not request
        results; ..."`` when the recognition request fails.
    """
    # Guard before touching the recognizer: indexing None would raise.
    if file_info is None:
        return "No audio provided"

    sample_rate, samples = file_info
    recognizer = sr.Recognizer()
    with tempfile.NamedTemporaryFile(delete=True, suffix=".wav") as tmpfile:
        # Write the WAV at the clip's own sample rate. A hard-coded rate
        # would speed up or slow down any clip recorded at a different rate
        # and degrade recognition.
        sf.write(
            tmpfile.name,
            data=samples,
            samplerate=int(sample_rate),
            format='WAV',
        )
        # Read the clip back while the temporary file still exists.
        with sr.AudioFile(tmpfile.name) as source:
            audio_data = recognizer.record(source)

    try:
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError as e:
        return f"Could not request results; {e}"

def pronunciation_correction(expected_text, file_info):
    """Score how closely the spoken audio matches the expected sentence.

    The audio is transcribed with ``transcribe_audio`` and compared with
    ``expected_text`` (case-insensitively) using the Levenshtein ``ratio``.

    Args:
        expected_text: The sentence the speaker was asked to read; may be
            ``None`` or empty when nothing has been selected.
        file_info: Audio value forwarded unchanged to ``transcribe_audio``;
            may be ``None`` when no clip was supplied.

    Returns:
        A ``(feedback, score)`` tuple. ``score`` is the similarity formatted
        to two decimals as a string; it is ``"0.00"`` whenever no comparison
        could be made, in which case ``feedback`` explains why.
    """
    # Guard clauses: without a sentence or a clip there is nothing to score,
    # and calling .lower() / indexing on None would raise.
    if not expected_text:
        return "Please select a sentence first.", "0.00"
    if file_info is None:
        return "Please provide an audio recording first.", "0.00"

    user_spoken_text = transcribe_audio(file_info)

    # transcribe_audio reports failures as message strings. Scoring those
    # against the sentence would misreport a recognition or network problem
    # as poor pronunciation, so surface the message instead.
    if (
        user_spoken_text == "Could not understand audio"
        or user_spoken_text.startswith("Could not request results;")
    ):
        return user_spoken_text, "0.00"

    similarity = ratio(expected_text.lower(), user_spoken_text.lower())
    description = f"{similarity:.2f}"

    if similarity >= 0.9:
        feedback = "Excellent pronunciation!"
    elif similarity >= 0.7:
        feedback = "Good pronunciation!"
    elif similarity >= 0.5:
        feedback = "Needs improvement."
    else:
        feedback = "Poor pronunciation, try to focus more on clarity."

    return feedback, description

# Build the Gradio UI: choose a sentence, supply audio, then receive textual
# feedback and a numeric similarity score.
with gr.Blocks() as app:
    with gr.Row():
        # Sentence picker plus a read-only textbox, both created in one row.
        sentence_dropdown = gr.Dropdown(choices=df['Sentences'].tolist(), label="Select a Sentence")
        selected_sentence_output = gr.Textbox(label="Selected Text", interactive=False)
    # type="numpy": the value is handed to pronunciation_correction, whose
    # transcription step reads element [1] of it as the sample data.
    audio_input = gr.Audio(label="Upload Audio File", type="numpy")
    check_pronunciation_button = gr.Button("Check Pronunciation")
    pronunciation_feedback = gr.Textbox(label="Pronunciation Feedback")
    pronunciation_score = gr.Number(label="Pronunciation Accuracy Score: 0 (No Match) ~ 1 (Perfect)")

    # Copy the dropdown's value into the read-only textbox whenever it changes.
    sentence_dropdown.change(lambda x: x, inputs=sentence_dropdown, outputs=selected_sentence_output)
    # On click, score the audio against the selected sentence; the two return
    # values of pronunciation_correction fill the feedback and score fields.
    check_pronunciation_button.click(
        pronunciation_correction,
        inputs=[sentence_dropdown, audio_input],
        outputs=[pronunciation_feedback, pronunciation_score]
    )

# Start the app as a module-level side effect, with debug mode enabled.
app.launch(debug=True)