aliosha committed
Commit e6e142f · 1 Parent(s): 9bd076a

segmented audio + multilang + "base" model

Files changed (1): app.py (+25 -8)
app.py CHANGED
@@ -4,8 +4,10 @@ import os
 os.system("pip install git+https://github.com/openai/whisper.git")
 import whisper
 
-# model = whisper.load_model("small")
-model = whisper.load_model("medium")
+model = whisper.load_model("small")
+model_en = whisper.load_model("small.en")
+
+# model = whisper.load_model("medium")
 
 
 
@@ -16,12 +18,25 @@ def inference(audio):
     mel = whisper.log_mel_spectrogram(audio).to(model.device)
 
     _, probs = model.detect_language(mel)
+
+    if max(probs, key=probs.get) == "en":
+        _model = model_en
+    else:
+        _model = model
 
     options = whisper.DecodingOptions(fp16=False)
-    result = whisper.decode(model, mel, options)
+    result = whisper.decode(_model, mel, options)
+
+    segmented_text_list = []
+
+    for segment in result["segments"]:
+        segmented_text_list.append(
+            f'{segment["start"]:.4f} - {segment["end"]:.4f}: {segment["text"]}')
+    segmented_text = "\n".join(segmented_text_list)
 
-    print(result.text)
-    return result.text
+    # print(result.text)
+    # return result.text
+    return segmented_text
 
 
 title = "Transcribe using Whisper"
@@ -140,11 +155,12 @@ with block:
           <rect x="23" y="69" width="23" height="23" fill="black"></rect>
         </svg>
         <h1 style="font-weight: 900; margin-bottom: 7px;">
-          Whisper
+          Audio Transcription using OpenAI Whisper
        </h1>
      </div>
      <p style="margin-bottom: 10px; font-size: 94%">
-        Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.
+        Whisper is a general-purpose speech recognition model.
+        Simple wrapping to be used as an API.
      </p>
    </div>
  """
@@ -162,7 +178,8 @@ with block:
         btn = gr.Button("Transcribe")
         text = gr.Textbox(show_label=False)
 
-        btn.click(inference, inputs=[audio], outputs=[text])
+        btn.click(inference, inputs=[audio], outputs=[
+                  text], api_name="transcription")
 
     gr.HTML('''
         <div class="footer">
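
Note on the new decode path: in openai/whisper, whisper.decode() returns a DecodingResult object, so the committed result["segments"] lookup would raise a TypeError; segment-level timestamps come from model.transcribe(), which returns a dict with a "segments" list. Below is a minimal sketch of the intended segmented, language-routed inference, assuming the Gradio audio component hands the function a file path and that the load/pad steps (not shown in this diff) follow the standard Whisper demo:

    import whisper

    model = whisper.load_model("small")
    model_en = whisper.load_model("small.en")

    def inference(audio):
        # Language detection runs on a 30-second log-Mel window of the input.
        sample = whisper.pad_or_trim(whisper.load_audio(audio))
        mel = whisper.log_mel_spectrogram(sample).to(model.device)
        _, probs = model.detect_language(mel)

        # Route English audio to the English-only checkpoint, as in the diff.
        _model = model_en if max(probs, key=probs.get) == "en" else model

        # transcribe() returns {"text": ..., "segments": [...]}; each segment
        # carries "start", "end", and "text" fields.
        result = _model.transcribe(audio, fp16=False)
        return "\n".join(
            f'{seg["start"]:.4f} - {seg["end"]:.4f}: {seg["text"]}'
            for seg in result["segments"]
        )

Using transcribe() also removes the 30-second cap that pad_or_trim imposes on decode(), which is what makes per-segment output useful for longer clips.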
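
With api_name="transcription", the Space exposes a named endpoint that can be called programmatically, which matches the "to be used as an API" wording in the new description. A hypothetical caller using gradio_client (the Space id below is a placeholder, and recent client versions expect files wrapped with handle_file):

    from gradio_client import Client, handle_file

    client = Client("user/whisper-space")  # placeholder Space id
    segmented = client.predict(handle_file("sample.wav"),
                               api_name="/transcription")
    print(segmented)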