junzhaosun committed on
Commit
21147ce
1 Parent(s): 3bb42a7

fixed bugs

Browse files
Files changed (2) hide show
  1. app.py +50 -4
  2. requirements.txt +3 -0
app.py CHANGED
@@ -1,6 +1,50 @@
1
  #!/usr/local/bin/python3
2
  #-*- coding:utf-8 -*-
3
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  title = "OpenAI Whisper Large v2"
6
 
@@ -40,15 +84,17 @@ examples = [
40
  ["examples/see_in_eyes.wav", None],
41
  ]
42
 
43
- gr.load(
44
- "models/openai/whisper-large-v2",
45
  inputs=[
46
  gr.Audio(label="上传语音", source="upload", type="numpy"),
47
  gr.Audio(label="录制语音", source="microphone", type="numpy"),
48
  ],
49
- outputs=gr.Text(label="识别出的文字"),
 
 
50
  title=title,
51
  description=description,
52
  article=article,
53
- examples=examples
54
  ).launch()
 
1
  #!/usr/local/bin/python3
2
  #-*- coding:utf-8 -*-
3
  import gradio as gr
4
+ import librosa
5
+ import torch
6
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
7
+
8
+ checkpoint = "openai/whisper-large-v2"
9
+ processor = AutoProcessor.from_pretrained(checkpoint)
10
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(checkpoint)
11
+
12
+
13
def process_audio(sampling_rate, waveform):
    """Prepare raw audio samples for the Whisper feature extractor.

    Args:
        sampling_rate: Sample rate of ``waveform`` in Hz.
        waveform: Array of audio samples, either ``(frames,)`` or
            ``(frames, channels)``. Assumed to hold int16 PCM values
            (TODO confirm: other integer widths or float input would be
            scaled incorrectly).

    Returns:
        A 1-D ``torch.Tensor`` of mono samples at 16 kHz, truncated to at
        most 30 seconds.
    """
    # convert from int16 to floating point; int16 full scale is 2**15 = 32768
    # (the previous divisor, 32678, was a digit transposition)
    waveform = waveform / 32768.0

    # convert to mono if stereo; librosa expects (channels, frames)
    if len(waveform.shape) > 1:
        waveform = librosa.to_mono(waveform.T)

    # resample to 16 kHz if necessary
    if sampling_rate != 16000:
        waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=16000)

    # limit to 30 seconds
    waveform = waveform[:16000 * 30]

    # make PyTorch tensor
    waveform = torch.tensor(waveform)
    return waveform
31
+
32
+
33
def predict(audio, mic_audio=None):
    """Transcribe one audio clip and return the recognised text.

    Args:
        audio: Uploaded clip as a ``(sample_rate, samples)`` tuple, where
            ``samples`` is ``(frames,)`` or ``(frames, channels)``, or
            ``None`` when nothing was uploaded.
        mic_audio: Microphone clip in the same tuple form, or ``None``.

    Returns:
        The first decoded transcription, or a prompt string when neither
        input was supplied.
    """
    # A microphone recording takes precedence over an uploaded file.
    chosen = mic_audio if mic_audio is not None else audio
    if chosen is None:
        return "(please provide audio)"

    sampling_rate, waveform = chosen
    samples = process_audio(sampling_rate, waveform)

    # process_audio always hands back 16 kHz audio, hence the fixed rate here.
    features = processor(audio=samples, sampling_rate=16000, return_tensors="pt")
    token_ids = model.generate(**features, max_length=400)
    texts = processor.batch_decode(token_ids, skip_special_tokens=True)
    return texts[0]
47
+
48
 
49
  title = "OpenAI Whisper Large v2"
50
 
 
84
  ["examples/see_in_eyes.wav", None],
85
  ]
86
 
87
+ gr.Interface(
88
+ fn=predict,
89
  inputs=[
90
  gr.Audio(label="上传语音", source="upload", type="numpy"),
91
  gr.Audio(label="录制语音", source="microphone", type="numpy"),
92
  ],
93
+ outputs=[
94
+ gr.Text(label="识别出的文字"),
95
+ ],
96
  title=title,
97
  description=description,
98
  article=article,
99
+ examples=examples,
100
  ).launch()
requirements.txt CHANGED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ git+https://github.com/huggingface/transformers.git
2
+ torch
3
+ librosa