segmented audio + multilang + "base" model
app.py CHANGED
@@ -4,8 +4,10 @@ import os
 os.system("pip install git+https://github.com/openai/whisper.git")
 import whisper
 
-
-
+model = whisper.load_model("small")
+model_en = whisper.load_model("small.en")
+
+# model = whisper.load_model("medium")
 
 
 
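The hunk above loads two checkpoints up front: the multilingual "small" model and the English-only "small.en" model, which is generally more accurate on English speech at the same size. A minimal sketch of the same startup step with an explicit device choice, so the app behaves predictably on CPU-only Spaces hardware; the torch import and the device argument are additions for illustration, not part of the commit:

    import torch
    import whisper

    # Load both checkpoints once at startup; pinning the device explicitly
    # makes the CPU fallback visible instead of implicit.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model("small", device=device)        # multilingual
    model_en = whisper.load_model("small.en", device=device)  # English-only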
@@ -16,12 +18,25 @@ def inference(audio):
     mel = whisper.log_mel_spectrogram(audio).to(model.device)
 
     _, probs = model.detect_language(mel)
+
+    if max(probs, key=probs.get) == "en":
+        _model = model_en
+    else:
+        _model = model
 
     options = whisper.DecodingOptions(fp16=False)
-    result = whisper.decode(model, mel, options)
+    result = whisper.decode(_model, mel, options)
+
+    segmented_text_list = []
+
+    for segment in result["segments"]:
+        segmented_text_list.append(
+            f'{segment["start"]:.4f} - {segment["end"]:.4f}: {segment["text"]}')
+    segmented_text = "\n".join(segmented_text_list)
 
-    print(result.text)
-    return result.text
+    # print(result.text)
+    # return result.text
+    return segmented_text
 
 
 title = "Transcribe using Whisper"
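One caveat in the new segment loop: whisper.decode() returns a DecodingResult dataclass, which exposes result.text but is not subscriptable, so result["segments"] raises a TypeError at runtime. Segment-level timestamps come from model.transcribe() instead, which returns a plain dict whose "segments" entries each carry "start", "end", and "text". A minimal sketch of that variant, assuming a local file sample.wav (a hypothetical path):

    import whisper

    model = whisper.load_model("small")

    # transcribe() detects the language itself and returns a dict with a
    # "segments" list; each segment has "start"/"end" times in seconds.
    result = model.transcribe("sample.wav", fp16=False)
    lines = [
        f'{seg["start"]:.4f} - {seg["end"]:.4f}: {seg["text"]}'
        for seg in result["segments"]
    ]
    print("\n".join(lines))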
@@ -140,11 +155,12 @@ with block:
               <rect x="23" y="69" width="23" height="23" fill="black"></rect>
             </svg>
             <h1 style="font-weight: 900; margin-bottom: 7px;">
-              Whisper
+              Audio Transcription using OpenAI Whisper
             </h1>
           </div>
           <p style="margin-bottom: 10px; font-size: 94%">
-            Whisper is a general-purpose speech recognition model.
+            Whisper is a general-purpose speech recognition model.
+            Simple wrapping to be used as an API.
           </p>
         </div>
 """
@@ -162,7 +178,8 @@ with block:
         btn = gr.Button("Transcribe")
         text = gr.Textbox(show_label=False)
 
-        btn.click(inference, inputs=[audio], outputs=[text])
+        btn.click(inference, inputs=[audio], outputs=[
+                  text], api_name="transcription")
 
         gr.HTML('''
             <div class="footer">
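Setting api_name="transcription" is what makes the Space usable "as an API": Gradio exposes the click handler as a named endpoint that the official gradio_client package can call. A hedged usage sketch; "user/whisper-space" is a placeholder for the actual Space id, and handle_file is the file wrapper in recent gradio_client releases:

    from gradio_client import Client, handle_file  # pip install gradio_client

    # "user/whisper-space" is a placeholder Space id, not the real one.
    client = Client("user/whisper-space")
    text = client.predict(handle_file("sample.wav"), api_name="/transcription")
    print(text)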