smajumdar committed on
Commit cceae86
1 Parent(s): 0bbd007

Fix issue with single uploaded file not being transcribed if too long

Files changed (2)
  1. README.md +1 -1
  2. app.py +33 -9
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🐠
 colorFrom: blue
 colorTo: gray
 sdk: gradio
-sdk_version: 3.7
+sdk_version: 3.12
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py CHANGED
@@ -1,10 +1,12 @@
 import os
 import json
+import shutil
 import uuid
 import tempfile
 import subprocess
 import re
 import time
+import traceback
 
 import gradio as gr
 import pytube as pt
@@ -24,6 +26,8 @@ os.environ[constants.NEMO_ENV_CACHE_DIR] = "/tmp/nemo/"
 
 SAMPLE_RATE = 16000 # Default sample rate for ASR
 BUFFERED_INFERENCE_DURATION_THRESHOLD = 60.0 # 60 second and above will require chunked inference.
+CHUNK_LEN_IN_SEC = 20.0 # Chunk size
+BUFFER_LEN_IN_SEC = 30.0 # Total buffer size
 
 TITLE = "NeMo ASR Inference on Hugging Face"
 DESCRIPTION = "Demo of all languages supported by NeMo ASR"
@@ -184,11 +188,14 @@ def convert_audio(audio_filepath):
         return audio_filepath
 
     out_filename = os.path.join(filedir, filename + '.wav')
+
     process = subprocess.Popen(
-        ['ffmpeg', '-i', audio_filepath, '-ac', '1', '-ar', str(SAMPLE_RATE), out_filename],
+        ['ffmpeg', '-y', '-i', audio_filepath, '-ac', '1', '-ar', str(SAMPLE_RATE), out_filename],
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
+        close_fds=True,
     )
+
     stdout, stderr = process.communicate()
 
     if os.path.exists(out_filename):
@@ -368,6 +375,7 @@ def infer_audio(model_name: str, audio_file: str) -> str:
 
 def transcribe(microphone, audio_file, model_name):
 
+    audio_data = None
     warn_output = ""
     if (microphone is not None) and (audio_file is not None):
         warn_output = (
@@ -384,15 +392,32 @@ def transcribe(microphone, audio_file, model_name):
     else:
         audio_data = audio_file
 
+    if audio_data is not None:
+        audio_duration = parse_duration(audio_data)
+    else:
+        audio_duration = None
+
     time_diff = None
     try:
-        # Use HF API for transcription
-        start = time.time()
-        transcriptions = infer_audio(model_name, audio_data)
-        end = time.time()
-        time_diff = end - start
+        with tempfile.TemporaryDirectory() as tempdir:
+            filename = os.path.split(audio_data)[-1]
+            new_audio_data = os.path.join(tempdir, filename)
+            shutil.copy2(audio_data, new_audio_data)
+
+            if os.path.exists(audio_data):
+                os.remove(audio_data)
+
+            audio_data = new_audio_data
+
+            # Use HF API for transcription
+            start = time.time()
+            transcriptions = infer_audio(model_name, audio_data)
+            end = time.time()
+            time_diff = end - start
 
     except Exception as e:
+        print(traceback.print_exc())
+
         transcriptions = ""
         warn_output = warn_output
 
@@ -412,8 +437,6 @@ def transcribe(microphone, audio_file, model_name):
     if transcriptions.startswith("Error:-"):
         html_output = build_html_output(transcriptions, style="result_item_error")
     else:
-        audio_duration = parse_duration(audio_data)
-
        output = f"Successfully transcribed on {get_device()} ! <br>" f"Transcription Time : {time_diff: 0.3f} s"
 
         if audio_duration > BUFFERED_INFERENCE_DURATION_THRESHOLD:
@@ -533,10 +556,11 @@ with demo:
 
     lang_selector, models_in_lang = create_lang_selector_component()
 
+    run = gr.components.Button('Transcribe')
+
     transcript = gr.components.Label(label='Transcript')
     audio_html_output = gr.components.HTML()
 
-    run = gr.components.Button('Transcribe')
     run.click(
         transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript, audio_html_output]
    )
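
The substance of the fix is the copy-before-transcribe pattern added to transcribe(): the uploaded file is copied into a private TemporaryDirectory before the long-running inference call, presumably so the transcription never depends on Gradio's own upload path staying alive for the duration of a long buffered-inference run. A minimal standalone sketch of that pattern follows; transcribe_with_stable_copy and run_inference are illustrative names, not functions from app.py (run_inference stands in for infer_audio(model_name, audio_data)).

import os
import shutil
import tempfile

def transcribe_with_stable_copy(audio_filepath, run_inference):
    # Copy the upload into a private temporary directory so that a
    # long-running inference call never depends on the framework's
    # upload path staying alive.
    with tempfile.TemporaryDirectory() as tempdir:
        stable_path = os.path.join(tempdir, os.path.basename(audio_filepath))
        shutil.copy2(audio_filepath, stable_path)  # copy2 preserves metadata

        # The original upload is no longer needed; remove it defensively.
        if os.path.exists(audio_filepath):
            os.remove(audio_filepath)

        # Everything that touches the audio happens while tempdir exists;
        # the directory is deleted automatically on exit from the block.
        return run_inference(stable_path)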
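The convert_audio() change is a separate hardening. Without '-y', ffmpeg asks before overwriting an existing output file, and with stdout/stderr captured and no input stream wired up, that prompt can stall the conversion when the same file is transcribed twice. A self-contained sketch of the conversion step, assuming ffmpeg is on PATH (convert_to_mono_wav and the error handling are illustrative, not from app.py):

import subprocess

SAMPLE_RATE = 16000  # target rate expected by the ASR models

def convert_to_mono_wav(audio_filepath, out_filename):
    # '-y' overwrites out_filename unconditionally instead of prompting;
    # '-ac 1' downmixes to mono; '-ar' resamples to SAMPLE_RATE.
    process = subprocess.Popen(
        ['ffmpeg', '-y', '-i', audio_filepath,
         '-ac', '1', '-ar', str(SAMPLE_RATE), out_filename],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        close_fds=True,  # do not leak parent file descriptors to ffmpeg
    )
    stdout, _ = process.communicate()
    if process.returncode != 0:
        raise RuntimeError(f"ffmpeg failed:\n{stdout.decode(errors='ignore')}")
    return out_filename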
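The new CHUNK_LEN_IN_SEC / BUFFER_LEN_IN_SEC constants feed the buffered-inference path that files longer than BUFFERED_INFERENCE_DURATION_THRESHOLD take. The sketch below only illustrates the usual geometry of such a sliding window, not the implementation in app.py: each 20 s chunk of new transcript is decoded inside a 30 s buffer, i.e. with (30 - 20) / 2 = 5 s of acoustic context on either side.

CHUNK_LEN_IN_SEC = 20.0   # new transcript produced per step
BUFFER_LEN_IN_SEC = 30.0  # audio the model actually sees per step

def buffered_windows(duration_sec,
                     chunk_len=CHUNK_LEN_IN_SEC,
                     buffer_len=BUFFER_LEN_IN_SEC):
    # Yield (start, end) times of each buffer; the middle chunk_len
    # seconds of every window is kept, giving (buffer_len - chunk_len) / 2
    # seconds of context on either side.
    context = (buffer_len - chunk_len) / 2.0
    start = -context
    while start + context < duration_sec:
        yield max(0.0, start), min(duration_sec, start + buffer_len)
        start += chunk_len

# For a 60 s file this yields (0, 25), (15, 45), (35, 60), whose kept
# middles tile the audio as 0-20 s, 20-40 s, 40-60 s.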