SheldonYC committed
Commit 70ae40c
Parent: a7863c7

update asr model

Files changed (2):
  1. app.py +10 -10
  2. requirements.txt +3 -1
app.py CHANGED
@@ -8,6 +8,8 @@ import nemo.collections.asr as nemo_asr
 from transformers import pipeline
 import numpy as np
 import gradio as gr
+import librosa
+from scipy.io.wavfile import write
 
 def respond(message, chat_history):
     bot_message = message
@@ -16,15 +18,13 @@ def respond(message, chat_history):
 
 def transcribe(audio):
     sr, y = audio
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
-    result = asr_model({"sampling_rate": sr, "raw": y})["text"]
+    audio_name = "resampled_audio.wav"
+    resampled_audio = librosa.resample(y=y.astype("float"), orig_sr=sr, target_sr=16000)
+    write(audio_name, 16000, resampled_audio)
+    result = asr_model.transcribe([f"./{audio_name}"])
     return result
 
-# asr_model_id = "openai/whisper-small.en"
-# asr_model = pipeline("automatic-speech-recognition", model=asr_model_id)
 asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="nvidia/parakeet-ctc-0.6b")
-text = asr_model.transcribe(["./Samples/Sample_audios/test.wav"])
 
 with gr.Blocks() as demo:
     with gr.Column():
@@ -32,22 +32,22 @@ with gr.Blocks() as demo:
         """
         # HKU Canteen VA
         """)
-        gr.Markdown(f"{text}")
         va = gr.Chatbot(container=False)
 
         with gr.Row(): # text input
             text_input = gr.Textbox(placeholder="Ask me anything...", container=False, scale=1)
             submit_btn = gr.Button("Submit", scale=0)
 
-        # with gr.Row(): # audio input
-        #     recording = gr.Microphone(show_download_button=False, container=False)
+        with gr.Row(): # audio input
+            recording = gr.Microphone(show_download_button=False, container=False)
 
         with gr.Row(): # button toolbar
             clear = gr.ClearButton([text_input, va])
 
     text_input.submit(respond, [text_input, va], [text_input, va], queue=False)
     submit_btn.click(respond, [text_input, va], [text_input, va], queue=False)
-    # recording.stop_recording(transcribe, [recording], [text_input]).then(respond, [text_input, va], [text_input, va], queue=False)
+    # recording.stop_recording(transcribe, [recording], [text_input]).then(respond,s [text_input, va], [text_input, va], queue=False)
+    recording.stop_recording(transcribe, [recording], [text_input])
 
 if __name__ == "__main__":
     demo.launch()
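
The rewritten transcribe() resamples the microphone capture to 16 kHz, writes it to disk, and points the NeMo model at the file, since Parakeet-CTC expects 16 kHz mono input and EncDecCTCModel.transcribe() takes a list of audio file paths. Below is a minimal standalone sketch of that flow, assuming Gradio's default numpy microphone output (a (sample_rate, int16 array) tuple); the peak normalization and the [0] index on the returned transcript list are additions not present in the commit.

import numpy as np
import librosa
import nemo.collections.asr as nemo_asr
from scipy.io.wavfile import write

# Parakeet-CTC is trained on 16 kHz mono audio.
asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained(
    model_name="nvidia/parakeet-ctc-0.6b"
)

def transcribe(audio):
    sr, y = audio  # Gradio Microphone yields (sample rate, int16 samples)
    # Assumed addition: scale to [-1, 1] so the float WAV stays in range;
    # the commit resamples the raw sample values cast to float instead.
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))
    resampled = librosa.resample(y=y, orig_sr=sr, target_sr=16000)
    write("resampled_audio.wav", 16000, resampled)
    # transcribe() returns one transcript per input file, so unwrap the
    # single-element list instead of returning the list itself.
    return asr_model.transcribe(["./resampled_audio.wav"])[0]

Wired up as in the commit, recording.stop_recording(transcribe, [recording], [text_input]) drops the transcript into the textbox when the user stops recording.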
requirements.txt CHANGED
@@ -1,3 +1,5 @@
 torch
 transformers
-numpy
+numpy
+librosa
+scipy
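
app.py also imports nemo.collections.asr and gradio, which the updated requirements.txt does not list; if the Space's base image does not preinstall them, a fuller dependency set might look like the following (the last two entries are assumptions beyond the diff):

torch
transformers
numpy
librosa
scipy
gradio
nemo_toolkit[asr]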