awacke1 committed
Commit eebc5c8
1 Parent(s): bfb646b

Update app.py

Files changed (1)
app.py +1 -14
app.py CHANGED
@@ -9,51 +9,38 @@ import os
 import uuid
 
 SAMPLE_RATE = 16000
-
 model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/stt_en_conformer_transducer_xlarge")
 model.change_decoding_strategy(None)
 model.eval()
 
-
 def process_audio_file(file):
     data, sr = librosa.load(file)
-
     if sr != SAMPLE_RATE:
         data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
-
     # monochannel
     data = librosa.to_mono(data)
     return data
 
-
 def transcribe(audio, state=""):
     # Grant additional context
     # time.sleep(1)
-
     if state is None:
         state = ""
-
     audio_data = process_audio_file(audio)
-
     with tempfile.TemporaryDirectory() as tmpdir:
         # Filepath transcribe
         audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
         soundfile.write(audio_path, audio_data, SAMPLE_RATE)
         transcriptions = model.transcribe([audio_path])
-
-        # Direct transcribe
+        # Direct transcribe
         # transcriptions = model.transcribe([audio])
-
         # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis
         if type(transcriptions) == tuple and len(transcriptions) == 2:
             transcriptions = transcriptions[0]
-
         transcriptions = transcriptions[0]
-
     state = state + transcriptions + " "
     return state, state
 
-
 iface = gr.Interface(
     fn=transcribe,
     inputs=[
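
A side note on the preprocessing path above: process_audio_file loads the clip, resamples anything that is not 16 kHz, and downmixes to mono before the model sees it. librosa can fold all three steps into the load call itself; a minimal equivalent sketch (illustration only, not part of this commit):

import librosa

SAMPLE_RATE = 16000

def process_audio_file(file):
    # sr= resamples on load and mono=True downmixes, matching the
    # three-step load / resample / to_mono sequence in the diff above.
    data, _ = librosa.load(file, sr=SAMPLE_RATE, mono=True)
    return data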
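
On the double indexing after model.transcribe: for RNNT models, NeMo's transcribe() can return a (best_hypotheses, all_hypotheses) tuple, each element a list with one entry per input file, which is why the code first strips the tuple and then indexes the list. A minimal sketch of that unpacking, assuming the same return shape ("clip.wav" is a placeholder path):

# Assumed return shape for a NeMo RNNT model: (best_hypotheses, all_hypotheses),
# each a list with one entry per input file.
result = model.transcribe(["clip.wav"])
if isinstance(result, tuple) and len(result) == 2:
    result = result[0]  # keep only the best hypotheses
text = result[0]        # one input file -> one transcript string

The sketch uses isinstance where the committed code compares type(...) == tuple; both behave the same here, isinstance is simply the more idiomatic check.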
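
The hunk ends partway through the gr.Interface call, so the remaining arguments are not shown. For orientation, a typical completion for a live microphone demo in Gradio 3.x is sketched below; everything after inputs is an assumption about the unchanged remainder of app.py, not something this commit touches:

# Hypothetical completion (assumed, not shown in the diff): common wiring
# for a streaming-microphone ASR demo in Gradio 3.x.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",    # carries the running transcript between calls
    ],
    outputs=[
        "textbox",  # shows the accumulated transcript
        "state",    # feeds the transcript back on the next chunk
    ],
    live=True,      # re-run transcribe as audio streams in
)
iface.launch()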