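"""Gradio UI for the aip_trainer pronunciation trainer.

Builds the web interface and wires the text-to-speech, speech-to-score and
random-phrase lambdas to it; the per-language global scores are persisted in
the browser local storage.
"""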
import json
from pathlib import Path
import gradio as gr

from aip_trainer import PROJECT_ROOT_FOLDER, app_logger, sample_rate_start
from aip_trainer.lambdas import js, lambdaGetSample, lambdaSpeechToScore, lambdaTTS


css = """
.speech-output-label p {color: grey; margin-bottom: white;}
.background-white {background-color: white !important; }
.speech-output-group {padding: 12px;}
.speech-output-container {min-height: 60px;}
.speech-output-html {text-align: left; }
"""
word_idx_text = "Selected word index"


def get_textbox_hidden(text: str | None = None):
    """Return a hidden Textbox, optionally pre-filled with the given text."""
    if text:
        return gr.Textbox(value=text, visible=False)
    return gr.Textbox(visible=False)


def get_number_hidden(x: int | None = None):
    """Return a hidden Number, optionally pre-filled with the given value."""
    if x:
        return gr.Number(value=x, visible=False)
    return gr.Number(visible=False)


def clear():
    """Reset a single output component."""
    return None


def clear2():
    """Reset two output components."""
    return None, None


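# Build the UI: the left column holds the phrase, TTS and recording inputs,
# the right column holds the transcriptions, the accuracy output and the
# per-word playback controls.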
with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
    local_storage = gr.BrowserState([0.0, 0.0])
    app_logger.info("start gradio app building...")

    project_root_folder = Path(PROJECT_ROOT_FOLDER)
    with open(project_root_folder / "aip_trainer" / "lambdas" / "app_description.md", "r", encoding="utf-8") as app_description_src:
        md_app_description = app_description_src.read()
        gr.Markdown(md_app_description.format(sample_rate_start=sample_rate_start))
    with gr.Row():
        with gr.Column(scale=4, min_width=300):
            with gr.Row():
                with gr.Column(scale=2, min_width=80):
                    radio_language = gr.Radio(["de", "en"], label="Language", value="en", elem_id="radio-language-id-element")
                with gr.Column(scale=5, min_width=160):
                    radio_difficulty = gr.Radio(
                        label="Difficulty",
                        value=0,
                        choices=[
                            ("random", 0),
                            ("easy", 1),
                            ("medium", 2),
                            ("hard", 3),
                        ],
                        elem_id="radio-difficulty-id-element",
                    )
                with gr.Column(scale=1, min_width=100):
                    btn_random_phrase = gr.Button(value="Choose a random phrase", elem_id="btn-random-phrase-id-element")
            with gr.Row():
                with gr.Column(scale=7, min_width=300):
                    text_student_transcription = gr.Textbox(
                        lines=3,
                        label="Phrase to read for speech recognition",
                        value="Hi there, how are you?",
                        elem_id="text-student-transcription-id-element",
                    )
            with gr.Row():
                audio_tts = gr.Audio(label="Audio TTS", elem_id="audio-tts-id-element")
            with gr.Row():
                btn_run_tts = gr.Button(value="TTS in browser", elem_id="btn-run-tts-id-element")
                btn_run_tts_backend = gr.Button(value="TTS backend", elem_id="btn-run-tts-backend-id-element")
                btn_clear_tts = gr.Button(value="Clear TTS backend", elem_id="btn-clear-tts-backend-id-element")
                btn_clear_tts.click(clear, inputs=[], outputs=[audio_tts])
            with gr.Row():
                audio_student_recording_stt = gr.Audio(
                    label="Record a speech to evaluate",
                    sources=["microphone", "upload"],
                    type="filepath",
                    show_download_button=True,
                    elem_id="audio-student-recording-stt-id-element",
                )
            with gr.Row():
                num_audio_duration_hidden = gr.Number(label="num_first_audio_duration", value=0, interactive=False, visible=False)
                with gr.Accordion("Click here to expand the table examples", open=False, elem_id="accordion-examples-id-element"):
                    examples_text = gr.Examples(
                        examples=[
                            ["Hallo, wie geht es dir?", "de", 1],
                            ["Hi there, how are you?", "en", 1],
                            ["Die König-Ludwig-Eiche ist ein Naturdenkmal im Staatsbad Brückenau.", "de", 2,],
                            ["Rome is home to some of the most beautiful monuments in the world.", "en", 2],
                            ["Die König-Ludwig-Eiche ist ein Naturdenkmal im Staatsbad Brückenau, einem Ortsteil des drei Kilometer nordöstlich gelegenen Bad Brückenau im Landkreis Bad Kissingen in Bayern.", "de", 3],
                            ["Some machine learning models are designed to understand and generate human-like text based on the input they receive.", "en", 3],
                        ],
                        inputs=[text_student_transcription, radio_language, radio_difficulty],
                        elem_id="examples-text-id-element",
                    )
        with gr.Column(scale=4, min_width=320):
            text_transcribed_hidden = gr.Textbox(
                placeholder=None, label="Transcribed text", visible=False
            )
            text_letter_correctness = gr.Textbox(
                placeholder=None,
                label="Letters correctness",
                visible=False,
            )
            text_recording_ipa = gr.Textbox(
                placeholder=None, label="Student phonetic transcription", elem_id="text-student-recording-ipa-id-element"
            )
            text_ideal_ipa = gr.Textbox(
                placeholder=None, label="Ideal phonetic transcription", elem_id="text-ideal-ipa-id-element"
            )
            text_raw_json_output_hidden = gr.Textbox(placeholder=None, label="text_raw_json_output_hidden", visible=False)
            with gr.Group(elem_classes="speech-output-group background-white"):
                gr.Markdown("Speech accuracy output", elem_classes="speech-output-label background-white")
                with gr.Group(elem_classes="speech-output-container background-white"):
                    html_output = gr.HTML(
                        label="Speech accuracy output",
                        elem_id="speech-output",
                        show_label=False,
                        visible=True,
                        render=True,
                        value=" - ",
                        elem_classes="speech-output-html background-white",
                    )
            with gr.Row():
                with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col1"):
                    num_pronunciation_accuracy = gr.Number(label="Current score %", elem_id="number-pronunciation-accuracy-id-element")
                with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col2"):
                    num_score_de = gr.Number(label="Global score DE %", value=0, interactive=False, elem_id="number-score-de-id-element")
                with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col3"):
                    num_score_en = gr.Number(label="Global score EN %", value=0, interactive=False, elem_id="number-score-en-id-element")
            btn_recognize_speech_accuracy = gr.Button(value="Get speech accuracy score (%)", elem_id="btn-recognize-speech-accuracy-id-element")
            with gr.Row():
                num_tot_recognized_words = gr.Number(label="Total recognized words", visible=False, minimum=0, interactive=False)
                with gr.Column(scale=1, min_width=50):
                    num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, minimum=0, value=0, interactive=False)
                with gr.Column(scale=4, min_width=100):
                    audio_splitted_student_recording_stt = gr.Audio(
                        label="Splitted student speech output",
                        type="filepath",
                        show_download_button=True,
                        elem_id="audio-splitted-student-recording-stt-id-element",
                    )
            text_selected_recognized_word_hidden = gr.Textbox(label="text_selected_recognized_word", value="placeholder", interactive=False, visible=False)

    def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
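        """Run the speech-to-score lambda on the recorded audio, add the obtained
        pronunciation accuracy to the cumulative score of the selected language
        ("de" or "en") and return the updated component values."""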
        (
            _transcribed_text,
            _letter_correctness,
            _pronunciation_accuracy,
            _recording_ipa,
            _ideal_ipa,
            _num_tot_recognized_word,
            first_audio_file,
            _res,
        ) = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang, remove_random_file=False)
        new_num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, value=0)
        words_list = _transcribed_text.split()
        first_word = words_list[0]
        json_res_loaded = json.loads(_res)
        audio_durations = json_res_loaded["audio_durations"]
        first_audio_duration = audio_durations[0]
        output = {
            text_transcribed_hidden: _transcribed_text,
            text_letter_correctness: _letter_correctness,
            num_pronunciation_accuracy: _pronunciation_accuracy,
            text_recording_ipa: _recording_ipa,
            text_ideal_ipa: _ideal_ipa,
            text_raw_json_output_hidden: _res,
            num_tot_recognized_words: _num_tot_recognized_word,
            num_selected_recognized_word: new_num_selected_recognized_word,
            audio_splitted_student_recording_stt: first_audio_file,
            text_selected_recognized_word_hidden: first_word,
            num_audio_duration_hidden: first_audio_duration
        }
        match lang:
            case "de":
                return {
                    num_score_de: float(score_de) + float(_pronunciation_accuracy),
                    num_score_en: float(score_en),
                    **output
                }
            case "en":
                return {
                    num_score_en: float(score_en) + float(_pronunciation_accuracy),
                    num_score_de: float(score_de),
                    **output
                }
            case _:
                raise NotImplementedError(f"Language {lang} not supported")

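    # Main scoring action: run the speech-to-score lambda and fan its results out to the UI.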
    btn_recognize_speech_accuracy.click(
        get_updated_score_by_language,
        inputs=[text_student_transcription, audio_student_recording_stt, radio_language, num_score_de, num_score_en],
        outputs=[
            text_transcribed_hidden,
            text_letter_correctness,
            num_pronunciation_accuracy,
            text_recording_ipa,
            text_ideal_ipa,
            text_raw_json_output_hidden,
            num_score_de,
            num_score_en,
            num_tot_recognized_words,
            num_selected_recognized_word,
            audio_splitted_student_recording_stt,
            text_selected_recognized_word_hidden,
            num_audio_duration_hidden
        ],
    )

    def change_max_selected_words(n):
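        """Update the selected-word spinner (label, range, interactivity) to
        match the number of recognized words."""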
        app_logger.info(f"change_max_selected_words: {n} ...")
        num_max_selected_words = n - 1
        app_logger.info(f"num_selected_recognized_word.maximum, pre: {num_selected_recognized_word.maximum} ...")
        label = word_idx_text if n == 0 else f"{word_idx_text} (from 0 to {num_max_selected_words})"
        interactive = n > 0
        app_logger.info(f"change_max_selected_words: {n}, is interactive? {interactive} ...")
        new_num_selected_recognized_words = gr.Number(label=label, visible=True, value=0, minimum=0, maximum=num_max_selected_words, interactive=interactive)
        app_logger.info(f"num_selected_recognized_word.maximum, post: {num_selected_recognized_word.maximum} ...")
        return new_num_selected_recognized_words

    num_tot_recognized_words.change(
        fn=change_max_selected_words,
        inputs=[num_tot_recognized_words],
        outputs=[num_selected_recognized_word],
    )

    def clear3():
        """Reset recordings, transcriptions and the current pronunciation score
        when the phrase to read changes."""
        return None, None, None, None, None, None, 0, 0

    text_student_transcription.change(
        clear3,
        inputs=[],
        outputs=[
            audio_student_recording_stt, audio_tts, audio_splitted_student_recording_stt, text_recording_ipa, text_ideal_ipa, text_transcribed_hidden,
            num_pronunciation_accuracy, num_selected_recognized_word
        ],
    )

    def reset_max_total_recognized_words(content_text_recording_ipa, content_num_tot_recognized_words):
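        """Reset the total recognized words counter when the student phonetic
        transcription is cleared; otherwise keep the current value."""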
        if content_text_recording_ipa is None or content_text_recording_ipa == "":
            app_logger.info("reset_max_total_recognized_words...")
            new_num_tot_recognized_words = gr.Number(label="Total recognized words", visible=False, value=0, minimum=0, interactive=False)
            return new_num_tot_recognized_words
        return content_num_tot_recognized_words

    text_recording_ipa.change(
        reset_max_total_recognized_words,
        inputs=[text_recording_ipa, num_tot_recognized_words],
        outputs=[
            num_tot_recognized_words
        ],
    )
    text_recording_ipa.change(
        None,
        inputs=[get_textbox_hidden(), get_textbox_hidden(), get_number_hidden()],
        outputs=[html_output],
        js=js.js_update_ipa_output,
    )

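    # TTS playback (in-browser via JS or through the backend lambda) and random phrase selection.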
    btn_run_tts.click(fn=None, inputs=[text_student_transcription, radio_language], outputs=audio_tts, js=js.js_play_audio)
    btn_run_tts_backend.click(
        fn=lambdaTTS.get_tts,
        inputs=[text_student_transcription, radio_language],
        outputs=audio_tts,
    )
    btn_random_phrase.click(
        fn=lambdaGetSample.get_random_selection,
        inputs=[radio_language, radio_difficulty],
        outputs=[text_student_transcription],
    )
    btn_random_phrase.click(
        clear2,
        inputs=[],
        outputs=[audio_student_recording_stt, audio_tts]
    )
    html_output.change(
        None,
        inputs=[text_transcribed_hidden, text_letter_correctness, num_selected_recognized_word],
        outputs=[html_output],
        js=js.js_update_ipa_output,
    )
    num_selected_recognized_word.input(
        fn=lambdaSpeechToScore.get_selected_word,
        inputs=[num_selected_recognized_word, text_raw_json_output_hidden],
        outputs=[audio_splitted_student_recording_stt, text_selected_recognized_word_hidden, num_audio_duration_hidden],
    )
    audio_splitted_student_recording_stt.play(
        fn=None,
        # text, language, sleepTime = null, prefix = null
        inputs=[text_selected_recognized_word_hidden, radio_language, num_audio_duration_hidden],
        outputs=audio_splitted_student_recording_stt,
        js=js.js_play_audio
    )
    
    @gradio_app.load(inputs=[local_storage], outputs=[num_score_de, num_score_en])
    def load_from_local_storage(saved_values):
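        """Restore the global DE/EN scores saved in the browser local storage."""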
        app_logger.info(f"loading from local storage: {saved_values} ...")
        return saved_values[0], saved_values[1]

    @gr.on([num_score_de.change, num_score_en.change], inputs=[num_score_de, num_score_en], outputs=[local_storage])
    def save_to_local_storage(score_de, score_en):
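        """Persist the global DE/EN scores to the browser local storage."""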
        return [score_de, score_en]


if __name__ == "__main__":
    try:
        gradio_app.launch()
    except Exception as e:
        app_logger.error(f"Error launching the gradio app: {e}")
        raise