Spaces:

k2-fsa
/

automatic-speech-recognition

Running

App Files Files Community

csukuangfj committed on Jul 18, 2022

Commit

074cf4f

•

1 Parent(s): 0eef9b6

small fixes

Browse files

Files changed (1) hide show

app.py +21 -35

app.py CHANGED Viewed

@@ -19,6 +19,7 @@
 # References:
 # https://gradio.app/docs/#dropdown
 import os
 import time
 from datetime import datetime
@@ -34,7 +35,7 @@ languages = sorted(language_to_models.keys())
 def convert_to_wav(in_filename: str) -> str:
     """Convert the input audio file to a wave file"""
     out_filename = in_filename + ".wav"
-    print(f"Converting '{in_filename}' to '{out_filename}'")
     _ = os.system(f"ffmpeg -hide_banner -i '{in_filename}' '{out_filename}'")
     return out_filename
@@ -46,23 +47,23 @@ def process(
     decoding_method: str,
     num_active_paths: int,
 ) -> str:
-    print("in_filename", in_filename)
-    print("language", language)
-    print("repo_id", repo_id)
-    print("decoding_method", decoding_method)
-    print("num_active_paths", num_active_paths)
     filename = convert_to_wav(in_filename)
     now = datetime.now()
     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
-    print(f"Started at {date_time}")
     start = time.time()
     wave, wave_sample_rate = torchaudio.load(filename)
     if wave_sample_rate != sample_rate:
-        print(
             f"Expected sample rate: {sample_rate}. Given: {wave_sample_rate}. "
             f"Resampling to {sample_rate}."
         )
@@ -86,22 +87,12 @@ def process(
     duration = wave.shape[0] / sample_rate
     rtf = (end - start) / duration
-    print(f"Finished at {date_time} s. Elapsed: {end - start: .3f} s")
-    print(f"Duration {duration: .3f} s")
-    print(f"RTF {rtf: .3f}")
-    print("hyp")
-    print(hyp)
-    html_output = f"""
-    <div class='result'>
-      <div class='result_item result_item_success'>
-        {hyp}
-        <br/>
-      </div>
-    </div>
-    """
-    return html_output
 title = "# Automatic Speech Recognition with Next-gen Kaldi"
@@ -125,16 +116,7 @@ def update_model_dropdown(language: str):
     raise ValueError(f"Unsupported language: {language}")
-# The css style is copied from
-# https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L112
-demo = gr.Blocks(
-    css="""
-    .result {display:flex;flex-direction:column}
-    .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
-    .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
-    .result_item_error {background-color:#ff7070;color:white;align-self:start}
-    """,
-)
 with demo:
     gr.Markdown(title)
@@ -178,8 +160,8 @@ with demo:
                 optional=False,
                 label="Upload from disk",
             )
-            uploaded_output = gr.HTML(label="Recognized speech from uploaded file")
             upload_button = gr.Button("Submit for recognition")
         with gr.TabItem("Record from microphone"):
             microphone = gr.Audio(
@@ -190,7 +172,7 @@ with demo:
             )
             record_button = gr.Button("Submit for recognition")
-            recorded_output = gr.HTML(label="Recognized speech from recordings")
         upload_button.click(
             process,
@@ -217,4 +199,8 @@ with demo:
     gr.Markdown(description)
 if __name__ == "__main__":
     demo.launch()

 # References:
 # https://gradio.app/docs/#dropdown
+import logging
 import os
 import time
 from datetime import datetime
 def convert_to_wav(in_filename: str) -> str:
     """Convert the input audio file to a wave file"""
     out_filename = in_filename + ".wav"
+    logging.info(f"Converting '{in_filename}' to '{out_filename}'")
     _ = os.system(f"ffmpeg -hide_banner -i '{in_filename}' '{out_filename}'")
     return out_filename
     decoding_method: str,
     num_active_paths: int,
 ) -> str:
+    logging.info(f"in_filename: {in_filename}")
+    logging.info(f"language: {language}")
+    logging.info(f"repo_id: {repo_id}")
+    logging.info(f"decoding_method: {decoding_method}")
+    logging.info(f"num_active_paths: {num_active_paths}")
     filename = convert_to_wav(in_filename)
     now = datetime.now()
     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
+    logging.info(f"Started at {date_time}")
     start = time.time()
     wave, wave_sample_rate = torchaudio.load(filename)
     if wave_sample_rate != sample_rate:
+        logging.info(
             f"Expected sample rate: {sample_rate}. Given: {wave_sample_rate}. "
             f"Resampling to {sample_rate}."
         )
     duration = wave.shape[0] / sample_rate
     rtf = (end - start) / duration
+    logging.info(f"Finished at {date_time} s. Elapsed: {end - start: .3f} s")
+    logging.info(f"Duration {duration: .3f} s")
+    logging.info(f"RTF {rtf: .3f}")
+    logging.info(f"hyp:\n{hyp}")
+    return hyp
 title = "# Automatic Speech Recognition with Next-gen Kaldi"
     raise ValueError(f"Unsupported language: {language}")
+demo = gr.Blocks()
 with demo:
     gr.Markdown(title)
                 optional=False,
                 label="Upload from disk",
             )
             upload_button = gr.Button("Submit for recognition")
+            uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
         with gr.TabItem("Record from microphone"):
             microphone = gr.Audio(
             )
             record_button = gr.Button("Submit for recognition")
+            recorded_output = gr.Textbox(label="Recognized speech from recordings")
         upload_button.click(
             process,
     gr.Markdown(description)
 if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
     demo.launch()