Spaces:
Running
Running
File size: 9,860 Bytes
0931910 d804881 6777887 0931910 9ab32d7 d51ffe7 3a8b45a 6957865 3a8b45a 1470bc9 d51ffe7 3a8b45a 5d533ea 9ab32d7 0931910 a9078f6 9ab32d7 d51ffe7 a9078f6 d51ffe7 a9078f6 d51ffe7 9ab32d7 a9078f6 9ab32d7 d51ffe7 6957865 9ab32d7 1470bc9 9ab32d7 1470bc9 9ab32d7 d51ffe7 8b62994 3a8b45a 9ab32d7 a9078f6 9ab32d7 a9078f6 3a8b45a 9ab32d7 a9078f6 3a8b45a 9ab32d7 3a8b45a 57141bb 3a8b45a d51ffe7 6777887 a9078f6 6777887 ca7e6be 8b62994 ca7e6be 8b62994 a9078f6 ca7e6be 9ab32d7 ca7e6be a9078f6 9ab32d7 8b62994 a9078f6 ca7e6be 9ab32d7 6957865 d51ffe7 a9078f6 1470bc9 a9078f6 1470bc9 d51ffe7 9ab32d7 8b62994 9ab32d7 0931910 9ab32d7 5d533ea d804881 6e31dbd |
|
from pathlib import Path
import gradio as gr
from aip_trainer import PROJECT_ROOT_FOLDER, app_logger, sample_rate_start
from aip_trainer.lambdas import js, lambdaGetSample, lambdaSpeechToScore, lambdaTTS
# Custom stylesheet handed to ``gr.Blocks(css=css)``. It styles the grey
# "Speech accuracy output" caption (``speech-output-label``) and the bordered
# row that wraps the HTML result (``speech-output-container``).
css = """
.speech-output-label p {color: grey;}
.speech-output-container {align-items: center; min-height: 60px; padding-left: 8px; padding-right: 8px; margin-top: -12px; border-width: 1px; border-style: solid; border-color: lightgrey;}
"""
# JavaScript snippet executed in the browser by the "TTS in browser" button.
# It receives the phrase text and the two-letter language code, polls
# ``speechSynthesis.getVoices()`` every 10 ms until the voice list is populated,
# picks the first voice whose ``lang`` starts with the requested code (the
# utterance voice stays ``null`` when none matches) and speaks the text at 0.7x rate.
# The loop index is declared with ``let`` so it does not leak as an implicit
# global (which would also throw a ReferenceError in strict-mode code).
js_play_audio = """
function playAudio(text, language) {
    let voice_synth = null;
    let synth = window.speechSynthesis;

    function setSpeech() {
        return new Promise(
            function (resolve, reject) {
                let id;
                id = setInterval(() => {
                    if (synth.getVoices().length !== 0) {
                        resolve(synth.getVoices());
                        clearInterval(id);
                    }
                }, 10);
            }
        )
    }

    let s = setSpeech();
    s.then((voices) => {
        for (let idx = 0; idx < voices.length; idx++) {
            if (voices[idx].lang.slice(0, 2) == language) {
                voice_synth = voices[idx];
                break;
            }
        }

        var utterThis = new SpeechSynthesisUtterance(text);
        utterThis.voice = voice_synth;
        utterThis.rate = 0.7;

        synth.speak(utterThis);
        return utterThis;
    });
}
"""
def clear() -> None:
    """Return a single ``None``, used as a callback value to reset one output component."""
    return None
def clear2() -> tuple[None, None]:
    """Return a pair of ``None`` values, used as a callback value to reset two output components."""
    cleared = (None, None)
    return cleared
with gr.Blocks(css=css) as gradio_app:
local_storage = gr.BrowserState([0.0, 0.0])
app_logger.info("start gradio app building...")
project_root_folder = Path(PROJECT_ROOT_FOLDER)
with open(project_root_folder / "aip_trainer" / "lambdas" / "app_description.md", "r", encoding="utf-8") as app_description_src:
md_app_description = app_description_src.read()
gr.Markdown(md_app_description.format(sample_rate_start=sample_rate_start))
with gr.Row():
with gr.Column(scale=4, min_width=300):
with gr.Row():
with gr.Column(scale=2, min_width=80):
radio_language = gr.Radio(["de", "en"], label="Language", value="en")
with gr.Column(scale=5, min_width=160):
radio_difficulty = gr.Radio(
label="Difficulty",
value=0,
choices=[
("random", 0),
("easy", 1),
("medium", 2),
("hard", 3),
],
)
with gr.Column(scale=1, min_width=100):
btn_random_phrase = gr.Button(value="Choose a random phrase")
with gr.Row():
with gr.Column(scale=7, min_width=300):
text_learner_transcription = gr.Textbox(
lines=3,
label="Learner Transcription",
value="Hi there, how are you?",
)
with gr.Row():
audio_tts = gr.Audio(label="Audio TTS")
with gr.Row():
btn_run_tts = gr.Button(value="TTS in browser")
btn_run_tts_backend = gr.Button(value="TTS backend")
btn_clear_tts = gr.Button(value="Clear TTS backend")
btn_clear_tts.click(clear, inputs=[], outputs=[audio_tts])
with gr.Row():
audio_learner_recording_stt = gr.Audio(
label="Learner Recording",
sources=["microphone", "upload"],
type="filepath",
show_download_button=True,
)
with gr.Column(scale=4, min_width=320):
text_transcribed_hidden = gr.Textbox(
placeholder=None, label="Transcribed text", visible=False
)
text_letter_correctness = gr.Textbox(
placeholder=None,
label="Letters correctness",
visible=False,
)
text_recording_ipa = gr.Textbox(
placeholder=None, label="Learner phonetic transcription"
)
text_ideal_ipa = gr.Textbox(
placeholder=None, label="Ideal phonetic transcription"
)
text_raw_json_output_hidden = gr.Textbox(placeholder=None, label="text_raw_json_output_hidden", visible=False)
gr.Markdown("Speech accuracy output", elem_classes="speech-output-label")
with gr.Row(elem_classes="speech-output-container"):
html_output = gr.HTML(
label="Speech accuracy output",
elem_id="speech-output",
show_label=False,
visible=True,
render=True,
value=" - ",
elem_classes="speech-output",
)
with gr.Row():
gr.Markdown("### Speech accuracy score (%)", elem_classes="speech-accuracy-score-container row1")
with gr.Row():
with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col1"):
number_pronunciation_accuracy = gr.Number(label="Current score")
with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col2"):
number_score_de = gr.Number(label="Global score DE", value=0, interactive=False)
with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col3"):
number_score_en = gr.Number(label="Global score EN", value=0, interactive=False)
with gr.Row():
btn = gr.Button(value="Recognize speech accuracy")
with gr.Accordion("Click here to expand the table examples", open=False):
examples_text = gr.Examples(
examples=[
["Hallo, wie geht es dir?", "de", 1],
["Hi there, how are you?", "en", 1],
["Die König-Ludwig-Eiche ist ein Naturdenkmal im Staatsbad Brückenau.", "de", 2,],
["Rome is home to some of the most beautiful monuments in the world.", "en", 2],
["Die König-Ludwig-Eiche ist ein Naturdenkmal im Staatsbad Brückenau, einem Ortsteil des drei Kilometer nordöstlich gelegenen Bad Brückenau im Landkreis Bad Kissingen in Bayern.", "de", 3],
["Some machine learning models are designed to understand and generate human-like text based on the input they receive.", "en", 3],
],
inputs=[text_learner_transcription, radio_language, radio_difficulty],
)
def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
_transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _res = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang)
output = {
text_transcribed_hidden: _transcribed_text,
text_letter_correctness: _letter_correctness,
number_pronunciation_accuracy: _pronunciation_accuracy,
text_recording_ipa: _recording_ipa,
text_ideal_ipa: _ideal_ipa,
text_raw_json_output_hidden: _res,
}
match lang:
case "de":
return {
number_score_de: float(score_de) + float(_pronunciation_accuracy),
number_score_en: float(score_en),
**output
}
case "en":
return {
number_score_en: float(score_en) + float(_pronunciation_accuracy),
number_score_de: float(score_de),
**output
}
case _:
raise NotImplementedError(f"Language {lang} not supported")
btn.click(
get_updated_score_by_language,
inputs=[text_learner_transcription, audio_learner_recording_stt, radio_language, number_score_de, number_score_en],
outputs=[
text_transcribed_hidden,
text_letter_correctness,
number_pronunciation_accuracy,
text_recording_ipa,
text_ideal_ipa,
text_raw_json_output_hidden,
number_score_de, number_score_en
],
)
btn_run_tts.click(fn=None, inputs=[text_learner_transcription, radio_language], outputs=audio_tts, js=js_play_audio)
btn_run_tts_backend.click(
fn=lambdaTTS.get_tts,
inputs=[text_learner_transcription, radio_language],
outputs=audio_tts,
)
btn_random_phrase.click(
lambdaGetSample.get_random_selection,
inputs=[radio_language, radio_difficulty],
outputs=[text_learner_transcription],
)
btn_random_phrase.click(
clear2,
inputs=[],
outputs=[audio_learner_recording_stt, audio_tts]
)
html_output.change(
None,
inputs=[text_transcribed_hidden, text_letter_correctness],
outputs=[html_output],
js=js.js_update_ipa_output,
)
@gradio_app.load(inputs=[local_storage], outputs=[number_score_de, number_score_en])
def load_from_local_storage(saved_values):
print("loading from local storage", saved_values)
return saved_values[0], saved_values[1]
@gr.on([number_score_de.change, number_score_en.change], inputs=[number_score_de, number_score_en], outputs=[local_storage])
def save_to_local_storage(score_de, score_en):
return [score_de, score_en]
if __name__ == "__main__":
    # Start the Gradio server only when this module is executed as a script.
    gradio_app.launch()
|