Spaces:
Running
Running
alessandro trinca tornidor
committed on
Commit
·
d804881
1
Parent(s):
1e30c4b
feat: first working gradio frontend with refactored lambda_handler and tests
Browse files- aip_trainer/lambdas/lambdaSpeechToScore.py +27 -8
- aip_trainer/lambdas/routes.py +16 -0
- aip_trainer/utils/middlewares.py +0 -0
- app.py +127 -0
- tests/events/test_de.wav +0 -0
- tests/events/test_en.wav +0 -0
- tests/test_GetAccuracyFromRecordedAudio.py +108 -21
- tests/test_data_de_en_2.pickle +0 -0
aip_trainer/lambdas/lambdaSpeechToScore.py
CHANGED
@@ -1,6 +1,8 @@
|
|
|
|
1 |
import base64
|
2 |
import json
|
3 |
import os
|
|
|
4 |
import tempfile
|
5 |
import time
|
6 |
|
@@ -25,8 +27,9 @@ def lambda_handler(event, context):
|
|
25 |
data = json.loads(event['body'])
|
26 |
|
27 |
real_text = data['title']
|
28 |
-
|
29 |
-
|
|
|
30 |
language = data['language']
|
31 |
|
32 |
if len(real_text) == 0:
|
@@ -40,13 +43,26 @@ def lambda_handler(event, context):
|
|
40 |
},
|
41 |
'body': ''
|
42 |
}
|
|
|
|
|
|
|
|
|
43 |
|
|
|
|
|
|
|
|
|
44 |
start0 = time.time()
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
start = time.time()
|
52 |
app_logger.info(f'Loading .ogg file file {random_file_name} ...')
|
@@ -66,7 +82,8 @@ def lambda_handler(event, context):
|
|
66 |
app_logger.info(f'language_trainer_sst_lambda: result: {result}...')
|
67 |
|
68 |
start = time.time()
|
69 |
-
|
|
|
70 |
duration = time.time() - start
|
71 |
app_logger.info(f'Deleted file {random_file_name} in {duration}s.')
|
72 |
|
@@ -127,6 +144,8 @@ def audioread_load(path, offset=0.0, duration=None, dtype=np.float32):
|
|
127 |
This loads one block at a time, and then concatenates the results.
|
128 |
"""
|
129 |
|
|
|
|
|
130 |
y = []
|
131 |
app_logger.debug(f"reading audio file at path:{path} ...")
|
132 |
with audioread.audio_open(path) as input_file:
|
|
|
1 |
+
|
2 |
import base64
|
3 |
import json
|
4 |
import os
|
5 |
+
from pathlib import Path
|
6 |
import tempfile
|
7 |
import time
|
8 |
|
|
|
27 |
data = json.loads(event['body'])
|
28 |
|
29 |
real_text = data['title']
|
30 |
+
base64Audio = data["base64Audio"]
|
31 |
+
app_logger.debug(f"base64Audio:{base64Audio} ...")
|
32 |
+
file_bytes_or_audiotmpfile = base64.b64decode(base64Audio[22:].encode('utf-8'))
|
33 |
language = data['language']
|
34 |
|
35 |
if len(real_text) == 0:
|
|
|
43 |
},
|
44 |
'body': ''
|
45 |
}
|
46 |
+
output = get_speech_to_score(real_text=real_text, file_bytes_or_audiotmpfile=file_bytes_or_audiotmpfile, language=language)
|
47 |
+
app_logger.debug(f"output: {output} ...")
|
48 |
+
return output
|
49 |
+
|
50 |
|
51 |
+
def get_speech_to_score(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
|
52 |
+
app_logger.info(f"real_text:{real_text} ...")
|
53 |
+
app_logger.debug(f"file_bytes:{file_bytes_or_audiotmpfile} ...")
|
54 |
+
app_logger.info(f"language:{language} ...")
|
55 |
start0 = time.time()
|
56 |
+
|
57 |
+
random_file_name = file_bytes_or_audiotmpfile
|
58 |
+
app_logger.debug(f"random_file_name:{random_file_name} ...")
|
59 |
+
if isinstance(file_bytes_or_audiotmpfile, (bytes, bytearray)):
|
60 |
+
app_logger.debug("writing streaming data to file on disk...")
|
61 |
+
with tempfile.NamedTemporaryFile(prefix="temp_sound_speech_score_", suffix=".ogg", delete=False) as f1:
|
62 |
+
f1.write(file_bytes_or_audiotmpfile)
|
63 |
+
duration = time.time() - start0
|
64 |
+
app_logger.info(f'Saved binary data in file in {duration}s.')
|
65 |
+
random_file_name = f1.name
|
66 |
|
67 |
start = time.time()
|
68 |
app_logger.info(f'Loading .ogg file file {random_file_name} ...')
|
|
|
82 |
app_logger.info(f'language_trainer_sst_lambda: result: {result}...')
|
83 |
|
84 |
start = time.time()
|
85 |
+
if remove_random_file:
|
86 |
+
os.remove(random_file_name)
|
87 |
duration = time.time() - start
|
88 |
app_logger.info(f'Deleted file {random_file_name} in {duration}s.')
|
89 |
|
|
|
144 |
This loads one block at a time, and then concatenates the results.
|
145 |
"""
|
146 |
|
147 |
+
import shutil
|
148 |
+
shutil.copyfile(path, Path("/tmp") / f"test_en_{Path(path).name}")
|
149 |
y = []
|
150 |
app_logger.debug(f"reading audio file at path:{path} ...")
|
151 |
with audioread.audio_open(path) as input_file:
|
aip_trainer/lambdas/routes.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
|
3 |
+
import structlog
|
4 |
+
from fastapi import APIRouter
|
5 |
+
|
6 |
+
|
7 |
+
custom_structlog_logger = structlog.stdlib.get_logger(__name__)
|
8 |
+
router = APIRouter()
|
9 |
+
|
10 |
+
|
11 |
+
@router.get("/health")
def health():
    """Health-check endpoint.

    Imports torch and torchaudio on demand, logs their versions and
    returns a fixed confirmation string.
    """
    import torch
    import torchaudio

    status_message = f"Still alive, torch version:{torch.__version__}, torchaudio:{torchaudio.__version__} ..."
    custom_structlog_logger.info(status_message)
    return "Still alive!"
|
aip_trainer/utils/middlewares.py
ADDED
File without changes
|
app.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import os
|
3 |
+
import time
|
4 |
+
|
5 |
+
import gradio as gr
|
6 |
+
import structlog
|
7 |
+
import uvicorn
|
8 |
+
from aip_trainer.lambdas import lambdaSpeechToScore
|
9 |
+
from asgi_correlation_id import CorrelationIdMiddleware
|
10 |
+
from asgi_correlation_id.context import correlation_id
|
11 |
+
from dotenv import load_dotenv
|
12 |
+
from fastapi import FastAPI, Request, Response
|
13 |
+
from uvicorn.protocols.utils import get_path_with_query_string
|
14 |
+
|
15 |
+
from aip_trainer.utils.session_logger import setup_logging
|
16 |
+
from aip_trainer.lambdas.routes import router
|
17 |
+
|
18 |
+
|
19 |
+
# Load variables from a local .env file (if any) before reading the settings below.
load_dotenv()

# Environment values are strings and bool() of any non-empty string is True,
# so "false" or "0" used to switch JSON logs on; accept only explicit truthy spellings.
LOG_JSON_FORMAT = os.getenv("LOG_JSON_FORMAT", "").strip().lower() in {"1", "true", "yes", "on"}
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
setup_logging(json_logs=LOG_JSON_FORMAT, log_level=LOG_LEVEL)
logger = structlog.stdlib.get_logger(__name__)
app = FastAPI(title="Example API", version="1.0.0")
|
26 |
+
|
27 |
+
|
28 |
+
@app.middleware("http")
async def logging_middleware(request: Request, call_next) -> Response:
    """Emit one structured access-log entry per HTTP request and time it.

    Binds the current correlation id to structlog's context variables, runs
    the downstream handler and logs method, URL, status code, client address
    and duration (in nanoseconds). The elapsed time in seconds is also
    exposed through the ``X-Process-Time`` response header.

    Args:
        request: incoming request.
        call_next: callable that forwards the request to the next handler.

    Returns:
        The downstream response, or a bare 500 response when the handler raised.
    """
    structlog.contextvars.clear_contextvars()
    # These context vars will be added to all log entries emitted during the request
    request_id = correlation_id.get()
    structlog.contextvars.bind_contextvars(request_id=request_id)

    start_time = time.perf_counter_ns()
    # If the call_next raises an error, we still want to return our own 500 response,
    # so we can add headers to it (process time, request ID...)
    response = Response(status_code=500)
    try:
        response = await call_next(request)
    except Exception:
        # TODO: Validate that we don't swallow exceptions (unit test?)
        structlog.stdlib.get_logger("api.error").exception("Uncaught exception")
        raise
    finally:
        process_time = time.perf_counter_ns() - start_time
        status_code = response.status_code
        url = get_path_with_query_string(request.scope)
        # request.client can be None (no peer address available); guard the
        # attribute access so the access log cannot fail with AttributeError.
        client = request.client
        client_host = client.host if client is not None else None
        client_port = client.port if client is not None else None
        http_method = request.method
        http_version = request.scope["http_version"]
        # Recreate the Uvicorn access log format, but add all parameters as structured information
        logger.info(
            f"""{client_host}:{client_port} - "{http_method} {url} HTTP/{http_version}" {status_code}""",
            http={
                "url": str(request.url),
                "status_code": status_code,
                "method": http_method,
                "request_id": request_id,
                "version": http_version,
            },
            network={"client": {"ip": client_host, "port": client_port}},
            duration=process_time,
        )
        response.headers["X-Process-Time"] = str(process_time / 10 ** 9)
        # NOTE: returning from `finally` discards the exception re-raised above,
        # so the caller receives the fallback 500 response instead.
        return response
|
69 |
+
|
70 |
+
|
71 |
+
# Register the routes exposed by aip_trainer.lambdas.routes on the FastAPI app.
app.include_router(router)
logger.info("routes included, creating gradio app")
# URL path passed to gr.mount_gradio_app when the Gradio UI is mounted below.
CUSTOM_GRADIO_PATH = "/"
|
74 |
+
|
75 |
+
|
76 |
+
def get_gradio_app():
    """Build the Gradio Blocks UI for the speech-scoring demo.

    The layout holds a Markdown header, two text inputs (learner
    transcription and language), an audio input delivered as a file path,
    a one-line text output and a button. Clicking the button calls
    ``lambdaSpeechToScore.get_speech_to_score`` with the transcription, the
    recording path and the language, and writes the result to the text output.

    Returns:
        The assembled ``gr.Blocks`` instance.
    """
    with gr.Blocks() as gradio_app:
        logger.info("start gradio app building...")
        gr.Markdown(
            """
            # Hello World!

            Start typing below to _see_ the *output*.

            Here a [link](https://huggingface.co/spaces/aletrn/gradio_with_fastapi).
            """
        )
        learner_transcription = gr.Textbox(
            label="Learner Transcription",
            placeholder="It is nice to wreck a nice beach",
        )
        language = gr.Textbox(
            label="language",
            placeholder="en",
        )
        # type="filepath": the click handler receives a path on disk, not raw bytes
        learner_recording = gr.Audio(
            label="Learner Recording",
            sources=["microphone", "upload"],
            type="filepath"
        )
        text_output = gr.Textbox(lines=1, placeholder=None, label="Text Output")
        btn = gr.Button(value="get speech score")
        # The string below is an inert expression statement (never executed as code);
        # it keeps a lambda_handler-based call sequence for reference.
        """
        event = {'body': json.dumps(request.get_json(force=True))}
        lambda_correct_output = lambdaSpeechToScore.lambda_handler(event, [])
        """
        # Inputs are passed positionally: real_text, file_bytes_or_audiotmpfile, language
        btn.click(
            lambdaSpeechToScore.get_speech_to_score,
            inputs=[learner_transcription, learner_recording, language],
            outputs=[text_output]
        )
    return gradio_app
|
113 |
+
|
114 |
+
|
115 |
+
logger.info("mounting gradio app within FastAPI...")
gradio_app_md = get_gradio_app()
# NOTE(review): presumably added after the logging middleware so that it wraps it and
# correlation_id is already set when logging_middleware reads it — confirm against
# Starlette's middleware ordering.
app.add_middleware(CorrelationIdMiddleware)
# Rebind `app` to the object returned by mount_gradio_app; "app:app" in the
# __main__ block below refers to this name.
app = gr.mount_gradio_app(app, gradio_app_md, path=CUSTOM_GRADIO_PATH)
logger.info("gradio app mounted")
|
120 |
+
|
121 |
+
|
122 |
+
if __name__ == "__main__":
    # Script entry point: serve this module's `app` on localhost with auto-reload.
    server_options = {
        "host": "127.0.0.1",
        "port": 7860,
        "log_config": None,
        "reload": True,
    }
    try:
        uvicorn.run("app:app", **server_options)
    except Exception as run_error:
        logging.error(f"ex:{run_error}.")
        raise run_error
|
tests/events/test_de.wav
ADDED
Binary file (259 kB). View file
|
|
tests/events/test_en.wav
ADDED
Binary file (196 kB). View file
|
|
tests/test_GetAccuracyFromRecordedAudio.py
CHANGED
@@ -1,10 +1,19 @@
|
|
1 |
import json
|
|
|
|
|
2 |
import unittest
|
3 |
|
|
|
4 |
from aip_trainer.lambdas import lambdaSpeechToScore
|
5 |
from tests import EVENTS_FOLDER
|
6 |
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
def check_output_by_field(output, key, match, expected_output):
|
9 |
import re
|
10 |
|
@@ -17,10 +26,48 @@ def check_output_by_field(output, key, match, expected_output):
|
|
17 |
return output
|
18 |
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
class TestGetAccuracyFromRecordedAudio(unittest.TestCase):
|
21 |
-
def
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
|
|
24 |
with open(EVENTS_FOLDER / "GetAccuracyFromRecordedAudio.json", "r") as src:
|
25 |
inputs_outputs = json.load(src)
|
26 |
inputs = inputs_outputs["inputs"]
|
@@ -29,23 +76,63 @@ class TestGetAccuracyFromRecordedAudio(unittest.TestCase):
|
|
29 |
expected_output = outputs[event_name]
|
30 |
output = lambdaSpeechToScore.lambda_handler(event_content, [])
|
31 |
output = json.loads(output)
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
unittest.main()
|
|
|
1 |
import json
|
2 |
+
import os
|
3 |
+
import platform
|
4 |
import unittest
|
5 |
|
6 |
+
from aip_trainer import app_logger
|
7 |
from aip_trainer.lambdas import lambdaSpeechToScore
|
8 |
from tests import EVENTS_FOLDER
|
9 |
|
10 |
|
11 |
+
# Reference sentence per language code; the get_speech_to_score tests pass it as
# `real_text` and reuse it for the expected "real_transcript" / "real_transcripts".
text_dict = {
    "de": "Ich bin Alex, wer bist du?",
    "en": "Hi there, how are you?"
}
|
15 |
+
|
16 |
+
|
17 |
def check_output_by_field(output, key, match, expected_output):
|
18 |
import re
|
19 |
|
|
|
26 |
return output
|
27 |
|
28 |
|
29 |
+
def check_output(self, output, expected_output):
    """Compare a speech-to-score result with its expected value.

    The transcript fields must be non-blank, and a set of fields is run
    through ``check_output_by_field`` with a regex for their format. The
    transcript, accuracy and category fields are then copied from
    ``expected_output`` so they cannot cause a mismatch, and the two
    dictionaries are finally compared with ``assertDictEqual``.

    Args:
        self: the running ``unittest.TestCase`` (used for its assert helpers).
        output: dictionary produced by the code under test; modified in place.
        expected_output: reference dictionary.

    Raises:
        AssertionError: when a check fails (logged before being re-raised).
        KeyError: when a required key is missing from either dictionary.
    """
    self.maxDiff = None
    try:
        # unittest assertions are not stripped under `python -O`, unlike bare `assert`
        for key in (
            "matched_transcripts",
            "matched_transcripts_ipa",
            "ipa_transcript",
            "real_transcripts_ipa",
        ):
            self.assertTrue(len(output[key].strip()) > 0)
        output = check_output_by_field(
            output, "is_letter_correct_all_words", "[01]+", expected_output
        )
        # raw strings: "\d" in a normal literal is an invalid escape sequence
        output = check_output_by_field(output, "end_time", r"\d+\.\d+", expected_output)
        output = check_output_by_field(
            output, "start_time", r"\d+\.\d+", expected_output
        )
        output = check_output_by_field(
            output, "pronunciation_accuracy", r"\d+", expected_output
        )
        # these fields are taken from the expectation, so only the remaining keys are compared
        for key in (
            "matched_transcripts",
            "matched_transcripts_ipa",
            "pronunciation_accuracy",
            "pair_accuracy_category",
            "ipa_transcript",
            "real_transcript",
            "real_transcripts_ipa",
        ):
            output[key] = expected_output[key]
        self.assertDictEqual(expected_output, output)
    except Exception as e:
        app_logger.error(f"e:{e}.")
        raise
|
57 |
+
|
58 |
+
|
59 |
class TestGetAccuracyFromRecordedAudio(unittest.TestCase):
|
60 |
+
def setUp(self):
    """Set PYTHONUTF8=1 in the process environment when running on Windows."""
    if platform.system() in ("Windows", "Win32"):
        os.environ["PYTHONUTF8"] = "1"
|
63 |
+
|
64 |
+
def tearDown(self):
    """Remove PYTHONUTF8 from the process environment when running on Windows."""
    running_on_windows = platform.system() in ("Windows", "Win32")
    if running_on_windows and "PYTHONUTF8" in os.environ:
        del os.environ["PYTHONUTF8"]
|
69 |
|
70 |
+
def test_GetAccuracyFromRecordedAudio(self):
|
71 |
with open(EVENTS_FOLDER / "GetAccuracyFromRecordedAudio.json", "r") as src:
|
72 |
inputs_outputs = json.load(src)
|
73 |
inputs = inputs_outputs["inputs"]
|
|
|
76 |
expected_output = outputs[event_name]
|
77 |
output = lambdaSpeechToScore.lambda_handler(event_content, [])
|
78 |
output = json.loads(output)
|
79 |
+
app_logger.info(
|
80 |
+
f"output type:{type(output)}, expected_output type:{type(expected_output)}."
|
81 |
+
)
|
82 |
+
check_output(self, output, expected_output)
|
83 |
+
|
84 |
+
def test_get_speech_to_score_en_ok(self):
    """Score the English sample recording and check it against the stored expectation.

    The fixture path is passed with ``remove_random_file=False`` so the
    sample file is not deleted after scoring.
    """
    from aip_trainer.lambdas import lambdaSpeechToScore

    language = "en"
    reference_text = text_dict[language]
    expected_output = {
        "real_transcript": reference_text,
        "ipa_transcript": "ha\u026a ha\u028a \u0259r ju",
        "pronunciation_accuracy": "69",
        "real_transcripts": reference_text,
        "matched_transcripts": "hi - how are you",
        "real_transcripts_ipa": "ha\u026a \u00f0\u025br, ha\u028a \u0259r ju?",
        "matched_transcripts_ipa": "ha\u026a ha\u028a \u0259r ju",
        "pair_accuracy_category": "0 2 0 0 0",
        "start_time": "0.2245625 1.3228125 0.852125 1.04825 1.3228125",
        "end_time": "0.559875 1.658125 1.14825 1.344375 1.658125",
        "is_letter_correct_all_words": "11 000001 111 111 1111 ",
    }
    raw_output = lambdaSpeechToScore.get_speech_to_score(
        real_text=reference_text,
        file_bytes_or_audiotmpfile=EVENTS_FOLDER / f"test_{language}.wav",
        language=language,
        remove_random_file=False,
    )
    check_output(self, json.loads(raw_output), expected_output)
|
109 |
+
|
110 |
+
def test_get_speech_to_score_de_ok(self):
    """Score the German sample recording and check it against the stored expectation.

    The fixture path is passed with ``remove_random_file=False`` so the
    sample file is not deleted after scoring.
    """
    from aip_trainer.lambdas import lambdaSpeechToScore

    language = "de"
    reference_text = text_dict[language]
    expected_output = {
        "real_transcript": reference_text,
        "ipa_transcript": "\u026a\u00e7 bi\u02d0n a\u02d0l\u025bksv\u025b\u02d0 b\u025bst\u025b\u02d0 du\u02d0",
        "pronunciation_accuracy": "63",
        "real_transcripts": reference_text,
        "matched_transcripts": "ich bin alexwe - beste du",
        "real_transcripts_ipa": "\u026a\u00e7 bi\u02d0n a\u02d0l\u025bks, v\u0250 b\u026ast du\u02d0?",
        "matched_transcripts_ipa": "\u026a\u00e7 bi\u02d0n a\u02d0l\u025bksv\u0259 - b\u0259st\u0259 du\u02d0",
        "pair_accuracy_category": "0 0 2 2 2 0",
        "start_time": "0.0 0.3075 0.62525 2.1346875 1.5785625 2.1346875",
        "end_time": "0.328 0.6458125 1.44025 2.4730625 2.15525 2.4730625",
        "is_letter_correct_all_words": "111 111 11111 000 1011 111 ",
    }
    raw_output = lambdaSpeechToScore.get_speech_to_score(
        real_text=reference_text,
        file_bytes_or_audiotmpfile=EVENTS_FOLDER / f"test_{language}.wav",
        language=language,
        remove_random_file=False,
    )
    check_output(self, json.loads(raw_output), expected_output)
|
135 |
+
|
136 |
+
|
137 |
+
if __name__ == "__main__":
|
138 |
unittest.main()
|
tests/test_data_de_en_2.pickle
CHANGED
File without changes
|