rensdimmendaal committed on
Commit
574db49
1 Parent(s): 9679a3e
Files changed (2) hide show
  1. app.py +33 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import gradio as gr
3
+ import numpy as np
4
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
5
+ import soundfile as sf
6
+ import torch
7
+
8
# Load the pretrained processor and CTC model once at import time; both come
# from the same checkpoint, so its name is spelled out in a single place.
_CHECKPOINT = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)
11
+
12
def speech2text(audio):
    """Transcribe a microphone recording to text with the Wav2Vec2 CTC model.

    Args:
        audio: A ``(sample_rate, samples)`` pair, or ``None`` when nothing
            was recorded. ``samples`` may be a 1-D (mono) array or a 2-D
            array indexed as ``[frame, channel]``; only the first channel of
            multi-channel input is used.

    Returns:
        The decoded transcription string, or ``""`` when ``audio`` is
        ``None``.
    """
    # Nothing was recorded: return an empty transcript instead of failing on
    # the tuple unpacking below.
    if audio is None:
        return ""

    sr, data = audio
    target_sr = 16000  # rate the audio is resampled to before tokenizing

    samples = np.asarray(data, dtype=np.float32)
    # Mono recordings arrive as a 1-D array, for which ``[:, 0]`` would raise
    # IndexError; only slice when a channel axis is actually present.
    if samples.ndim > 1:
        samples = samples[:, 0]

    # Resample to 16 kHz. The rates are passed by keyword because newer
    # librosa releases no longer accept them positionally.
    data_16khz = librosa.resample(samples, orig_sr=sr, target_sr=target_sr)

    # Tokenize (batch size 1); state the sampling rate explicitly so the
    # processor can check it against what it was configured for.
    input_values = processor(
        [data_16khz],
        sampling_rate=target_sr,
        return_tensors="pt",
        padding="longest",
    ).input_values

    # Inference only: skip autograd bookkeeping while retrieving the logits.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: take the argmax per frame, then decode the ids.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

    return transcription[0]  # batch size 1
29
+
30
# Wire the transcription function into a Gradio UI (microphone input, text
# output) and start serving it.
iface = gr.Interface(fn=speech2text, inputs="microphone", outputs="text")
iface.launch()
33
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ soundfile
4
+ torch
5
+ librosa