Spaces:

aletrn
/

ai-pronunciation-trainer

Running

App Files Files Community

alessandro trinca tornidor committed on Nov 20

Commit

9ab32d7

•

1 Parent(s): d804881

ci: huggingface space, move from docker to gradio sdk v5.6.0, add missing packages.txt with ffmpeg, pre-requirements.txt with pip, update gradio app to properly format information for the frontend, update tests

Browse files

Files changed (10) hide show

README.md +5 -2
aip_trainer/lambdas/lambdaSpeechToScore.py +17 -6
aip_trainer/lambdas/routes.py +0 -16
aip_trainer/models/models.py +60 -0
app.py +103 -118
packages.txt +1 -0
pre-requirements.txt +1 -0
requirements-flask.txt +21 -0
requirements.txt +1 -3
tests/test_GetAccuracyFromRecordedAudio.py +4 -4

README.md CHANGED Viewed

@@ -3,7 +3,9 @@ title: AI Pronunciation Trainer
 emoji: 🎤
 colorFrom: red
 colorTo: blue
-sdk: docker
 pinned: false
 license: mit
 ---
@@ -59,7 +61,8 @@ pnpm playwright test
 - add an updated online version on HuggingFace, Cloudflare or AWS
 - move from pytorch to onnxruntime (if possible)
-- refactor frontend with something more modern (e.g. vuejs)
 - refactor css style with tailwindcss
 - add more e2e tests with playwright

 emoji: 🎤
 colorFrom: red
 colorTo: blue
+sdk: gradio
+sdk_version: 5.6.0
+app_file: app.py
 pinned: false
 license: mit
 ---
 - add an updated online version on HuggingFace, Cloudflare or AWS
 - move from pytorch to onnxruntime (if possible)
+- refactor frontend with something more modern (e.g. vuejs, gradio)
+- improve documentation, backend tests
 - refactor css style with tailwindcss
 - add more e2e tests with playwright

aip_trainer/lambdas/lambdaSpeechToScore.py CHANGED Viewed

@@ -43,12 +43,13 @@ def lambda_handler(event, context):
             },
             'body': ''
         }
-    output = get_speech_to_score(real_text=real_text, file_bytes_or_audiotmpfile=file_bytes_or_audiotmpfile, language=language)
     app_logger.debug(f"output: {output} ...")
     return output
-def get_speech_to_score(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
     app_logger.info(f"real_text:{real_text} ...")
     app_logger.debug(f"file_bytes:{file_bytes_or_audiotmpfile} ...")
     app_logger.info(f"language:{language} ...")
@@ -118,10 +119,12 @@ def get_speech_to_score(real_text: str, file_bytes_or_audiotmpfile: str | dict,
     duration = time.time() - start
     duration_tot = time.time() - start0
     app_logger.info(f'Time to post-process results: {duration}, tot_duration:{duration_tot}.')
-    res = {'real_transcript': result['recording_transcript'],
-           'ipa_transcript': result['recording_ipa'],
-           'pronunciation_accuracy': str(int(result['pronunciation_accuracy'])),
            'real_transcripts': real_transcripts, 'matched_transcripts': matched_transcripts,
            'real_transcripts_ipa': real_transcripts_ipa, 'matched_transcripts_ipa': matched_transcripts_ipa,
            'pair_accuracy_category': pair_accuracy_category,
@@ -129,7 +132,15 @@ def get_speech_to_score(real_text: str, file_bytes_or_audiotmpfile: str | dict,
            'end_time': result['end_time'],
            'is_letter_correct_all_words': is_letter_correct_all_words}
-    return json.dumps(res)
 # From Librosa

             },
             'body': ''
         }
+    output = get_speech_to_score_dict(real_text=real_text, file_bytes_or_audiotmpfile=file_bytes_or_audiotmpfile, language=language, remove_random_file=False)
+    output = json.dumps(output)
     app_logger.debug(f"output: {output} ...")
     return output
+def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
     app_logger.info(f"real_text:{real_text} ...")
     app_logger.debug(f"file_bytes:{file_bytes_or_audiotmpfile} ...")
     app_logger.info(f"language:{language} ...")
     duration = time.time() - start
     duration_tot = time.time() - start0
     app_logger.info(f'Time to post-process results: {duration}, tot_duration:{duration_tot}.')
+    pronunciation_accuracy = str(int(result['pronunciation_accuracy']))
+    ipa_transcript = result['recording_ipa']
+    return {'real_transcript': result['recording_transcript'],
+           'ipa_transcript': ipa_transcript,
+           'pronunciation_accuracy': pronunciation_accuracy,
            'real_transcripts': real_transcripts, 'matched_transcripts': matched_transcripts,
            'real_transcripts_ipa': real_transcripts_ipa, 'matched_transcripts_ipa': matched_transcripts_ipa,
            'pair_accuracy_category': pair_accuracy_category,
            'end_time': result['end_time'],
            'is_letter_correct_all_words': is_letter_correct_all_words}
+def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
+    """Score a recording and return the result as a flat tuple.
+
+    Delegates to ``get_speech_to_score_dict`` with the same arguments and
+    picks individual entries out of the dict it returns.
+
+    Returns:
+        A 6-tuple, in this order: ``real_transcripts``,
+        ``is_letter_correct_all_words``, ``pronunciation_accuracy``,
+        ``ipa_transcript``, ``real_transcripts_ipa`` and the whole result
+        dict serialized with ``json.dumps``.
+    """
+    scores = get_speech_to_score_dict(
+        real_text=real_text,
+        file_bytes_or_audiotmpfile=file_bytes_or_audiotmpfile,
+        language=language,
+        remove_random_file=remove_random_file,
+    )
+    # Tuple order is part of the contract: callers consume it positionally.
+    return (
+        scores['real_transcripts'],
+        scores['is_letter_correct_all_words'],
+        scores['pronunciation_accuracy'],
+        scores['ipa_transcript'],
+        scores['real_transcripts_ipa'],
+        json.dumps(scores),
+    )
 # From Librosa

aip_trainer/lambdas/routes.py DELETED Viewed

@@ -1,16 +0,0 @@
-import random
-import structlog
-from fastapi import APIRouter
-custom_structlog_logger = structlog.stdlib.get_logger(__name__)
-router = APIRouter()
-@router.get("/health")
-def health():
-    import torch
-    import torchaudio
-    custom_structlog_logger.info(f"Still alive, torch version:{torch.__version__}, torchaudio:{torchaudio.__version__} ...")
-    return "Still alive!"

aip_trainer/models/models.py CHANGED Viewed

@@ -8,6 +8,66 @@ from silero.utils import Decoder
 from aip_trainer import app_logger
 def silero_stt(
     language="en",
     version="latest",

 from aip_trainer import app_logger
+def silero_tts(language='en',
+               speaker='kseniya_16khz',
+               **kwargs):
+    """Load a Silero text-to-speech model for a language/speaker pair.
+
+    The model catalogue is read from ``models.yml`` two levels above this
+    module's directory. When that file is missing, ``latest_silero_models.yml``
+    (relative to the current working directory) is used instead and is
+    downloaded from the silero-models repository if it does not exist yet.
+
+    Args:
+        language (str): key that must exist under ``tts_models`` in the catalogue.
+        speaker (str): speaker key; it must belong to ``language``.
+        **kwargs: accepted but not used by this function.
+
+    Returns:
+        * ``(model, speakers)`` when ``speaker == 'multi_v2'``;
+        * ``(model, example_text)`` for other speakers whose name contains
+          ``_v2``, ``_v3``, ``v3_`` or ``v4_`` (loaded through ``torch.package``);
+        * ``(model, symbols, sample_rate, example_text, apply_tts)`` otherwise
+          (loaded through ``init_jit_model``).
+
+    Raises:
+        AssertionError: if the catalogue file is missing, or the language or
+            speaker is unknown, or the speaker does not belong to the language.
+
+    Please see https://github.com/snakers4/silero-models for usage examples.
+    """
+    from omegaconf import OmegaConf
+    from silero.tts_utils import apply_tts
+    from silero.tts_utils import init_jit_model as init_jit_model_tts
+    # Prefer the catalogue shipped with the project, fall back to a downloaded copy.
+    models_list_file = os.path.join(os.path.dirname(__file__), "..", "..", "models.yml")
+    if not os.path.exists(models_list_file):
+        models_list_file = 'latest_silero_models.yml'
+        if not os.path.exists(models_list_file):
+            torch.hub.download_url_to_file('https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml',
+                                        'latest_silero_models.yml',
+                                        progress=False)
+    # NOTE(review): validation relies on ``assert``, which is skipped under ``python -O``.
+    assert os.path.exists(models_list_file)
+    catalogue = OmegaConf.load(models_list_file)
+    available_languages = list(catalogue.tts_models.keys())
+    assert language in available_languages, f'Language not in the supported list {available_languages}'
+    # Collect every speaker and remember which language each one belongs to.
+    available_speakers = []
+    speaker_language = {}
+    for lang in available_languages:
+        lang_speakers = list(catalogue.tts_models.get(lang).keys())
+        available_speakers += lang_speakers
+        speaker_language.update({name: lang for name in lang_speakers})
+    assert speaker in available_speakers, f'Speaker not in the supported list {available_speakers}'
+    assert language == speaker_language[speaker], f"Incorrect language '{language}' for this speaker, please specify '{speaker_language[speaker]}'"
+    model_conf = catalogue.tts_models[language][speaker].latest
+    is_packaged = any(marker in speaker for marker in ('_v2', '_v3', 'v3_', 'v4_'))
+    if not is_packaged:
+        # Older speakers: TorchScript model plus the helpers needed to run it.
+        model = init_jit_model_tts(model_conf.jit)
+        symbols = model_conf.tokenset
+        example_text = model_conf.example
+        sample_rate = model_conf.sample_rate
+        return model, symbols, sample_rate, example_text, apply_tts
+    # Packaged speakers: cache the package file next to this module and unpickle it.
+    from torch import package
+    model_url = model_conf.package
+    model_dir = os.path.join(os.path.dirname(__file__), "model")
+    os.makedirs(model_dir, exist_ok=True)
+    model_path = os.path.join(model_dir, os.path.basename(model_url))
+    if not os.path.isfile(model_path):
+        torch.hub.download_url_to_file(model_url,
+                                       model_path,
+                                       progress=True)
+    importer = package.PackageImporter(model_path)
+    model = importer.load_pickle("tts_models", "model")
+    if speaker == 'multi_v2':
+        avail_speakers = model_conf.speakers
+        return model, avail_speakers
+    example_text = model_conf.example
+    return model, example_text
 def silero_stt(
     language="en",
     version="latest",

app.py CHANGED Viewed

@@ -1,127 +1,112 @@
-import logging
-import os
-import time
 import gradio as gr
-import structlog
-import uvicorn
-from aip_trainer.lambdas import lambdaSpeechToScore
-from asgi_correlation_id import CorrelationIdMiddleware
-from asgi_correlation_id.context import correlation_id
-from dotenv import load_dotenv
-from fastapi import FastAPI, Request, Response
-from uvicorn.protocols.utils import get_path_with_query_string
-from aip_trainer.utils.session_logger import setup_logging
-from aip_trainer.lambdas.routes import router
-load_dotenv()
-LOG_JSON_FORMAT = bool(os.getenv("LOG_JSON_FORMAT", False))
-LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
-setup_logging(json_logs=LOG_JSON_FORMAT, log_level=LOG_LEVEL)
-logger = structlog.stdlib.get_logger(__name__)
-app = FastAPI(title="Example API", version="1.0.0")
-@app.middleware("http")
-async def logging_middleware(request: Request, call_next) -> Response:
-    structlog.contextvars.clear_contextvars()
-    # These context vars will be added to all log entries emitted during the request
-    request_id = correlation_id.get()
-    # print(f"request_id:{request_id}.")
-    structlog.contextvars.bind_contextvars(request_id=request_id)
-    start_time = time.perf_counter_ns()
-    # If the call_next raises an error, we still want to return our own 500 response,
-    # so we can add headers to it (process time, request ID...)
-    response = Response(status_code=500)
-    try:
-        response = await call_next(request)
-    except Exception:
-        # TODO: Validate that we don't swallow exceptions (unit test?)
-        structlog.stdlib.get_logger("api.error").exception("Uncaught exception")
-        raise
-    finally:
-        process_time = time.perf_counter_ns() - start_time
-        status_code = response.status_code
-        url = get_path_with_query_string(request.scope)
-        client_host = request.client.host
-        client_port = request.client.port
-        http_method = request.method
-        http_version = request.scope["http_version"]
-        # Recreate the Uvicorn access log format, but add all parameters as structured information
-        logger.info(
-            f"""{client_host}:{client_port} - "{http_method} {url} HTTP/{http_version}" {status_code}""",
-            http={
-                "url": str(request.url),
-                "status_code": status_code,
-                "method": http_method,
-                "request_id": request_id,
-                "version": http_version,
-            },
-            network={"client": {"ip": client_host, "port": client_port}},
-            duration=process_time,
-        )
-        response.headers["X-Process-Time"] = str(process_time / 10 ** 9)
-        return response
-app.include_router(router)
-logger.info("routes included, creating gradio app")
-CUSTOM_GRADIO_PATH = "/"
-def get_gradio_app():
-    with gr.Blocks() as gradio_app:
-        logger.info("start gradio app building...")
-        gr.Markdown(
-            """
-            # Hello World!
-            Start typing below to _see_ the *output*.
-            Here a [link](https://huggingface.co/spaces/aletrn/gradio_with_fastapi).
-            """
-        )
-        learner_transcription = gr.Textbox(
-            label="Learner Transcription",
-            placeholder="It is nice to wreck a nice beach",
-        )
-        language = gr.Textbox(
-            label="language",
-            placeholder="en",
-        )
-        learner_recording = gr.Audio(
-            label="Learner Recording",
-            sources=["microphone", "upload"],
-            type="filepath"
-        )
-        text_output = gr.Textbox(lines=1, placeholder=None, label="Text Output")
-        btn = gr.Button(value="get speech score")
-        """
-        event = {'body': json.dumps(request.get_json(force=True))}
-        lambda_correct_output = lambdaSpeechToScore.lambda_handler(event, [])
         """
-        btn.click(
-            lambdaSpeechToScore.get_speech_to_score,
-            inputs=[learner_transcription, learner_recording, language],
-            outputs=[text_output]
-        )
-    return gradio_app
-logger.info("mounting gradio app within FastAPI...")
-gradio_app_md = get_gradio_app()
-app.add_middleware(CorrelationIdMiddleware)
-app = gr.mount_gradio_app(app, gradio_app_md, path=CUSTOM_GRADIO_PATH)
-logger.info("gradio app mounted")
 if __name__ == "__main__":
-    try:
-        uvicorn.run("app:app", host="127.0.0.1", port=7860, log_config=None, reload=True)
-    except Exception as ex:
-        logging.error(f"ex:{ex}.")
-        raise ex

 import gradio as gr
+from aip_trainer import app_logger
+from aip_trainer.lambdas import lambdaSpeechToScore
+# JavaScript source handed to Gradio through the ``js=`` argument of
+# ``html_output.change`` further down in this file.
+# ``updateCssText(text, letters)`` splits both arguments on single spaces,
+# empties the ``#speech-output`` element, then appends one <span> per letter
+# of every word: green when the matching character in ``letters`` is "1",
+# red otherwise, followed by a <span> holding a space after each word.
+# NOTE(review): a word with no matching entry in ``letters`` makes
+# ``letterIsCorrect`` undefined, so the index access would throw - confirm
+# both strings always carry the same number of words.
+js = """
+function updateCssText(text, letters) {
+    let wordsArr = text.split(" ")
+    let lettersWordsArr = letters.split(" ")
+    let speechOutputContainer = document.querySelector('#speech-output');
+    speechOutputContainer.textContent = ""
+    for (let idx in wordsArr) {
+        let word = wordsArr[idx]
+        let letterIsCorrect = lettersWordsArr[idx]
+        for (let idx1 in word) {
+        let letterCorrect = letterIsCorrect[idx1] == "1"
+        let containerLetter = document.createElement("span")
+        containerLetter.style.color = letterCorrect ? 'green' : "red"
+        containerLetter.innerText = word[idx1];
+        speechOutputContainer.appendChild(containerLetter)
+        }
+        let containerSpace = document.createElement("span")
+        containerSpace.textContent = " "
+        speechOutputContainer.appendChild(containerSpace)
+    }
+}
+"""
+# Gradio UI: inputs (language, reference text, recording) on the left,
+# scoring results on the right.
+with gr.Blocks() as gradio_app:
+    app_logger.info("start gradio app building...")
+    # Page header rendered as Markdown.
+    gr.Markdown(
         """
+        # AI Pronunciation Trainer
+        See [my fork](https://github.com/trincadev/ai-pronunciation-trainer) of [AI Pronunciation Trainer](https://github.com/Thiagohgl/ai-pronunciation-trainer) repository
+        for more details.
+        """
+    )
+    with gr.Row():
+        # Left column: the three inputs passed to the scoring function.
+        with gr.Column(scale=4, min_width=300):
+            with gr.Row():
+                with gr.Column(scale=1, min_width=50):
+                    language = gr.Radio(["de", "en"], label="Language", value="en")
+                with gr.Column(scale=7, min_width=300):
+                    learner_transcription = gr.Textbox(
+                        lines=3,
+                        label="Learner Transcription",
+                        value="Hi there, how are you?",
+                    )
+            with gr.Row():
+                # type="filepath": the handler receives a path, not raw audio data.
+                learner_recording = gr.Audio(
+                    label="Learner Recording",
+                    sources=["microphone", "upload"],
+                    type="filepath",
+                )
+        # Right column: outputs filled by the button click handler.
+        with gr.Column(scale=3, min_width=300):
+            # Hidden fields: they only carry data to the JavaScript formatter.
+            transcripted_text = gr.Textbox(
+                lines=2, placeholder=None, label="Transcripted text", visible=False
+            )
+            letter_correctness = gr.Textbox(
+                lines=1,
+                placeholder=None,
+                label="Letters correctness",
+                visible=False,
+            )
+            pronunciation_accuracy = gr.Textbox(
+                lines=1, placeholder=None, label="Pronunciation accuracy %"
+            )
+            recording_ipa = gr.Textbox(
+                lines=1, placeholder=None, label="Learner phonetic transcription"
+            )
+            ideal_ipa = gr.Textbox(
+                lines=1, placeholder=None, label="Ideal phonetic transcription"
+            )
+            # Hidden field receiving the last tuple item (the JSON-serialized result).
+            res = gr.Textbox(lines=1, placeholder=None, label="RES", visible=False)
+            # elem_id must stay "speech-output": the JavaScript formatter selects it by id.
+            html_output = gr.HTML(
+                label="Speech accuracy output",
+                elem_id="speech-output",
+                show_label=True,
+                visible=True,
+                render=True,
+                value=" - ",
+                elem_classes="speech-output",
+            )
+            btn = gr.Button(value="Recognize speech accuracy")
+    # The outputs list below is positional and must follow the tuple order of
+    # get_speech_to_score_tuple: real_transcripts, is_letter_correct_all_words,
+    # pronunciation_accuracy, ipa_transcript, real_transcripts_ipa, JSON result.
+    btn.click(
+        lambdaSpeechToScore.get_speech_to_score_tuple,
+        inputs=[learner_transcription, learner_recording, language],
+        outputs=[
+            transcripted_text,
+            letter_correctness,
+            pronunciation_accuracy,
+            recording_ipa,
+            ideal_ipa,
+            res,
+        ],
+    )
+    # Client-side only (fn=None): run the JavaScript formatter with the two hidden fields.
+    # NOTE(review): html_output is not among the btn.click outputs - confirm that
+    # this change event actually fires after a recognition run.
+    html_output.change(
+        None,
+        inputs=[transcripted_text, letter_correctness],
+        outputs=[html_output],
+        js=js,
+    )
 if __name__ == "__main__":
+    gradio_app.launch()

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

pre-requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ pip

requirements-flask.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+audioread
+dtwalign
+eng_to_ipa
+epitran==1.25.1
+flask
+flask_cors
+gunicorn
+omegaconf
+ortools==9.11.4210
+pandas
+pickle-mixin
+python-dotenv
+requests
+sentencepiece
+silero==0.4.1
+soundfile==0.12.1
+sqlalchemy
+structlog
+torch
+torchaudio
+transformers

requirements.txt CHANGED Viewed

@@ -1,9 +1,8 @@
 audioread
 dtwalign
 eng_to_ipa
 epitran==1.25.1
-flask
-flask_cors
 gunicorn
 omegaconf
 ortools==9.11.4210
@@ -14,7 +13,6 @@ requests
 sentencepiece
 silero==0.4.1
 soundfile==0.12.1
-sqlalchemy
 structlog
 torch
 torchaudio

+asgi-correlation-id
 audioread
 dtwalign
 eng_to_ipa
 epitran==1.25.1
 gunicorn
 omegaconf
 ortools==9.11.4210
 sentencepiece
 silero==0.4.1
 soundfile==0.12.1
 structlog
 torch
 torchaudio

tests/test_GetAccuracyFromRecordedAudio.py CHANGED Viewed

@@ -86,7 +86,7 @@ class TestGetAccuracyFromRecordedAudio(unittest.TestCase):
         language = "en"
         path = EVENTS_FOLDER / f"test_{language}.wav"
-        output = lambdaSpeechToScore.get_speech_to_score(
             real_text=text_dict[language],
             file_bytes_or_audiotmpfile=path,
             language=language,
@@ -105,14 +105,14 @@ class TestGetAccuracyFromRecordedAudio(unittest.TestCase):
             "end_time": "0.559875 1.658125 1.14825 1.344375 1.658125",
             "is_letter_correct_all_words": "11 000001 111 111 1111 ",
         }
-        check_output(self, json.loads(output), expected_output)
     def test_get_speech_to_score_de_ok(self):
         from aip_trainer.lambdas import lambdaSpeechToScore
         language = "de"
         path = EVENTS_FOLDER / f"test_{language}.wav"
-        output = lambdaSpeechToScore.get_speech_to_score(
             real_text=text_dict[language],
             file_bytes_or_audiotmpfile=path,
             language=language,
@@ -131,7 +131,7 @@ class TestGetAccuracyFromRecordedAudio(unittest.TestCase):
             "end_time": "0.328 0.6458125 1.44025 2.4730625 2.15525 2.4730625",
             "is_letter_correct_all_words": "111 111 11111 000 1011 111 ",
         }
-        check_output(self, json.loads(output), expected_output)
 if __name__ == "__main__":

         language = "en"
         path = EVENTS_FOLDER / f"test_{language}.wav"
+        output = lambdaSpeechToScore.get_speech_to_score_dict(
             real_text=text_dict[language],
             file_bytes_or_audiotmpfile=path,
             language=language,
             "end_time": "0.559875 1.658125 1.14825 1.344375 1.658125",
             "is_letter_correct_all_words": "11 000001 111 111 1111 ",
         }
+        check_output(self, output, expected_output)
     def test_get_speech_to_score_de_ok(self):
         from aip_trainer.lambdas import lambdaSpeechToScore
         language = "de"
         path = EVENTS_FOLDER / f"test_{language}.wav"
+        output = lambdaSpeechToScore.get_speech_to_score_dict(
             real_text=text_dict[language],
             file_bytes_or_audiotmpfile=path,
             language=language,
             "end_time": "0.328 0.6458125 1.44025 2.4730625 2.15525 2.4730625",
             "is_letter_correct_all_words": "111 111 11111 000 1011 111 ",
         }
+        check_output(self, output, expected_output)
 if __name__ == "__main__":