gdnartea commited on
Commit
9bebeaf
1 Parent(s): 13c0360

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -23
app.py CHANGED
@@ -1,31 +1,16 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM, VitsModel
4
- from nemo.collections.asr.models import EncDecMultiTaskModel
5
 
 
6
 
7
- # load speech to text model
8
- canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
9
- canary_model.eval()
10
- canary_model.to('cpu')
 
11
 
12
- # update decode params
13
- canary_model.change_decoding_strategy(None)
14
- decode_cfg = canary_model.cfg.decoding
15
- decode_cfg.beam.beam_size = 1
16
- canary_model.change_decoding_strategy(decode_cfg)
17
 
18
-
19
-
20
- def convert_speech(speech):
21
- # Convert the speech to text
22
- transcription = canary_model.transcribe(
23
- speech,
24
- logprobs=False,
25
- )
26
-
27
- return transcription
28
-
29
- iface = gr.Interface(fn=convert_speech, inputs=gr.Audio(source="microphone"), outputs="textbox")
30
 
31
  iface.launch()
 
1
  import gradio as gr
2
  import torch
3
+ from transformers import pipeline
 
4
 
5
# Load the speech-to-text model once at module import, via the transformers
# ASR pipeline abstraction.
# NOTE(review): nvidia/canary-1b is published as a NeMo checkpoint (the
# previous revision loaded it with nemo EncDecMultiTaskModel) — confirm it
# actually loads through `transformers.pipeline`; if not, this raises at startup.
canary_pipe = pipeline("automatic-speech-recognition", model="nvidia/canary-1b")
6
 
7
def convert_speech(audio):
    """Transcribe microphone audio to text.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray]
        ``(sample_rate, samples)`` pair as produced by ``gr.Audio``
        (samples are typically int16 PCM).

    Returns
    -------
    str
        The transcription produced by the module-level ASR pipeline.
    """
    # numpy is used below but never imported at module level in this file;
    # a function-scope import keeps this block self-contained.
    import numpy as np

    sr, y = audio
    # Normalize to float32 in [-1, 1], which the ASR pipeline expects.
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # guard: an all-silent clip would divide by zero
        y /= peak
    # Bug fix: the original called an undefined name `transcriber`;
    # the pipeline object is bound to `canary_pipe` at module level.
    return canary_pipe({"sampling_rate": sr, "raw": y})["text"]
12
 
 
 
 
 
 
13
 
14
# Build the web UI: microphone audio in, transcribed text out.
# Gradio 4.x documents `sources` as a list of source names; pass the
# documented list form rather than relying on bare-string leniency.
iface = gr.Interface(
    fn=convert_speech,
    inputs=gr.Audio(sources=["microphone"]),
    outputs="textbox",
)

iface.launch()