smajumdar committed on
Commit cceae86
1 Parent(s): 0bbd007

Fix issue with single uploaded file not being transcribed if too long

Files changed (2)
  1. README.md +1 -1
  2. app.py +33 -9
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🐠
 colorFrom: blue
 colorTo: gray
 sdk: gradio
-sdk_version: 3.7
+sdk_version: 3.12
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py CHANGED
@@ -1,10 +1,12 @@
 import os
 import json
+import shutil
 import uuid
 import tempfile
 import subprocess
 import re
 import time
+import traceback
 
 import gradio as gr
 import pytube as pt
@@ -24,6 +26,8 @@ os.environ[constants.NEMO_ENV_CACHE_DIR] = "/tmp/nemo/"
 
 SAMPLE_RATE = 16000 # Default sample rate for ASR
 BUFFERED_INFERENCE_DURATION_THRESHOLD = 60.0 # 60 second and above will require chunked inference.
+CHUNK_LEN_IN_SEC = 20.0 # Chunk size
+BUFFER_LEN_IN_SEC = 30.0 # Total buffer size
 
 TITLE = "NeMo ASR Inference on Hugging Face"
 DESCRIPTION = "Demo of all languages supported by NeMo ASR"
@@ -184,11 +188,14 @@ def convert_audio(audio_filepath):
         return audio_filepath
 
     out_filename = os.path.join(filedir, filename + '.wav')
+
     process = subprocess.Popen(
-        ['ffmpeg', '-i', audio_filepath, '-ac', '1', '-ar', str(SAMPLE_RATE), out_filename],
+        ['ffmpeg', '-y', '-i', audio_filepath, '-ac', '1', '-ar', str(SAMPLE_RATE), out_filename],
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
+        close_fds=True,
     )
+
     stdout, stderr = process.communicate()
 
     if os.path.exists(out_filename):
@@ -368,6 +375,7 @@ def infer_audio(model_name: str, audio_file: str) -> str:
 
 def transcribe(microphone, audio_file, model_name):
 
+    audio_data = None
     warn_output = ""
     if (microphone is not None) and (audio_file is not None):
         warn_output = (
@@ -384,15 +392,32 @@ def transcribe(microphone, audio_file, model_name):
     else:
         audio_data = audio_file
 
+    if audio_data is not None:
+        audio_duration = parse_duration(audio_data)
+    else:
+        audio_duration = None
+
     time_diff = None
     try:
-        # Use HF API for transcription
-        start = time.time()
-        transcriptions = infer_audio(model_name, audio_data)
-        end = time.time()
-        time_diff = end - start
+        with tempfile.TemporaryDirectory() as tempdir:
+            filename = os.path.split(audio_data)[-1]
+            new_audio_data = os.path.join(tempdir, filename)
+            shutil.copy2(audio_data, new_audio_data)
+
+            if os.path.exists(audio_data):
+                os.remove(audio_data)
+
+            audio_data = new_audio_data
+
+            # Use HF API for transcription
+            start = time.time()
+            transcriptions = infer_audio(model_name, audio_data)
+            end = time.time()
+            time_diff = end - start
 
     except Exception as e:
+        print(traceback.print_exc())
+
         transcriptions = ""
         warn_output = warn_output
 
@@ -412,8 +437,6 @@ def transcribe(microphone, audio_file, model_name):
     if transcriptions.startswith("Error:-"):
         html_output = build_html_output(transcriptions, style="result_item_error")
     else:
-        audio_duration = parse_duration(audio_data)
-
        output = f"Successfully transcribed on {get_device()} ! <br>" f"Transcription Time : {time_diff: 0.3f} s"
 
         if audio_duration > BUFFERED_INFERENCE_DURATION_THRESHOLD:
@@ -533,10 +556,11 @@ with demo:
 
     lang_selector, models_in_lang = create_lang_selector_component()
 
+    run = gr.components.Button('Transcribe')
+
     transcript = gr.components.Label(label='Transcript')
     audio_html_output = gr.components.HTML()
 
-    run = gr.components.Button('Transcribe')
     run.click(
         transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript, audio_html_output]
    )
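
The substance of the fix is the copy-before-transcribe pattern added to transcribe(): the uploaded file is copied into a private TemporaryDirectory before the long-running inference call, presumably so the transcription never depends on Gradio's own upload path staying alive for the duration of a long buffered-inference run. A minimal standalone sketch of that pattern follows; transcribe_with_stable_copy and run_inference are illustrative names, not functions from app.py (run_inference stands in for infer_audio(model_name, audio_data)).

import os
import shutil
import tempfile

def transcribe_with_stable_copy(audio_filepath, run_inference):
    # Copy the upload into a private temporary directory so that a
    # long-running inference call never depends on the framework's
    # upload path staying alive.
    with tempfile.TemporaryDirectory() as tempdir:
        stable_path = os.path.join(tempdir, os.path.basename(audio_filepath))
        shutil.copy2(audio_filepath, stable_path)  # copy2 preserves metadata

        # The original upload is no longer needed; remove it defensively.
        if os.path.exists(audio_filepath):
            os.remove(audio_filepath)

        # Everything that touches the audio happens while tempdir exists;
        # the directory is deleted automatically on exit from the block.
        return run_inference(stable_path)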
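The convert_audio() change is a separate hardening. Without '-y', ffmpeg asks before overwriting an existing output file, and with stdout/stderr captured and no input stream wired up, that prompt can stall the conversion when the same file is transcribed twice. A self-contained sketch of the conversion step, assuming ffmpeg is on PATH (convert_to_mono_wav and the error handling are illustrative, not from app.py):

import subprocess

SAMPLE_RATE = 16000  # target rate expected by the ASR models

def convert_to_mono_wav(audio_filepath, out_filename):
    # '-y' overwrites out_filename unconditionally instead of prompting;
    # '-ac 1' downmixes to mono; '-ar' resamples to SAMPLE_RATE.
    process = subprocess.Popen(
        ['ffmpeg', '-y', '-i', audio_filepath,
         '-ac', '1', '-ar', str(SAMPLE_RATE), out_filename],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        close_fds=True,  # do not leak parent file descriptors to ffmpeg
    )
    stdout, _ = process.communicate()
    if process.returncode != 0:
        raise RuntimeError(f"ffmpeg failed:\n{stdout.decode(errors='ignore')}")
    return out_filename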
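The new CHUNK_LEN_IN_SEC / BUFFER_LEN_IN_SEC constants feed the buffered-inference path that files longer than BUFFERED_INFERENCE_DURATION_THRESHOLD take. The sketch below only illustrates the usual geometry of such a sliding window, not the implementation in app.py: each 20 s chunk of new transcript is decoded inside a 30 s buffer, i.e. with (30 - 20) / 2 = 5 s of acoustic context on either side.

CHUNK_LEN_IN_SEC = 20.0   # new transcript produced per step
BUFFER_LEN_IN_SEC = 30.0  # audio the model actually sees per step

def buffered_windows(duration_sec,
                     chunk_len=CHUNK_LEN_IN_SEC,
                     buffer_len=BUFFER_LEN_IN_SEC):
    # Yield (start, end) times of each buffer; the middle chunk_len
    # seconds of every window is kept, giving (buffer_len - chunk_len) / 2
    # seconds of context on either side.
    context = (buffer_len - chunk_len) / 2.0
    start = -context
    while start + context < duration_sec:
        yield max(0.0, start), min(duration_sec, start + buffer_len)
        start += chunk_len

# For a 60 s file this yields (0, 25), (15, 45), (35, 60), whose kept
# middles tile the audio as 0-20 s, 20-40 s, 40-60 s.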