mobinln committed on
Commit
128581e
·
1 Parent(s): 460d74e

working version

Browse files
Files changed (2) hide show
  1. app.py +10 -23
  2. requirements.txt +5 -3
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import gradio as gr
 
2
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
3
  import librosa
4
 
@@ -11,37 +12,23 @@ def transcribe(audio):
11
  if audio is None:
12
  return "No audio input provided. Please record or upload an audio file."
13
 
14
- sample_rate, array = audio
 
15
  sr = 16000
16
  array = librosa.to_mono(array)
17
- array = librosa.resample(array, orig_sr=sample_rate, target_sr=16000)
18
  input_features = processor(array, sampling_rate=sr, return_tensors="pt").input_features
19
 
20
- # generate token ids
21
  predicted_ids = model.generate(input_features)
22
- # decode token ids to text
23
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
24
- return transcription
25
-
26
-
27
- # input_audio = gr.Audio(
28
- # sources=["microphone"],
29
- # waveform_options=gr.WaveformOptions(
30
- # waveform_color="#01C6FF",
31
- # waveform_progress_color="#0066B4",
32
- # skip_length=2,
33
- # show_controls=True,
34
- # ),
35
- # )
36
- # demo = gr.Interface(
37
- # fn=reverse_audio,
38
- # inputs=input_audio,
39
- # outputs="text"
40
- # )
41
  demo = gr.Interface(
42
  fn=transcribe,
43
- inputs=[gr.Audio(sources=["microphone"])],
44
- outputs="text"
 
45
  )
46
 
47
  if __name__ == "__main__":
 
1
  import gradio as gr
2
+ import numpy as np
3
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
4
  import librosa
5
 
 
12
  if audio is None:
13
  return "No audio input provided. Please record or upload an audio file."
14
 
15
+ array, sample_rate = librosa.load(audio)
16
+ array = array.astype(np.float32)
17
  sr = 16000
18
  array = librosa.to_mono(array)
19
+ array = librosa.resample(array, orig_sr=sample_rate, target_sr=sr)
20
  input_features = processor(array, sampling_rate=sr, return_tensors="pt").input_features
21
 
 
22
  predicted_ids = model.generate(input_features)
 
23
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
24
+ return transcription[0]
25
+
26
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  demo = gr.Interface(
28
  fn=transcribe,
29
+ inputs=[gr.Audio(sources=["microphone"], type='filepath')],
30
+ outputs="text",
31
+ allow_flagging="never",
32
  )
33
 
34
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -1,6 +1,8 @@
1
  huggingface_hub==0.22.2
2
- transformers
3
- librosa
4
  torch
5
  torchvision
6
- torchaudio
 
 
 
1
  huggingface_hub==0.22.2
2
+ transformers~=4.42.3
3
+ librosa~=0.10.2.post1
4
  torch
5
  torchvision
6
+ torchaudio
7
+ gradio~=4.36.1
8
+ numpy~=1.24.3