updating app + requirements

app.py +19 -58
requirements.txt +1 -0
app.py CHANGED
@@ -1,35 +1,40 @@
 
 import gradio as gr
 import os
-os.system("pip install git+https://github.com/openai/whisper.git")
+# os.system("pip install git+https://github.com/openai/whisper.git")
 import whisper
 
 model = whisper.load_model("small")
 model_en = whisper.load_model("small.en")
+current_size = 'base'
 
 # model = whisper.load_model("medium")
 
 
+def change_model(size):
+    if size == current_size:
+        return
+    model = whisper.load_model(size)
+    model_en = whisper.load_model(f"{size}.en")
+    current_size = size
 
-def inference(audio):
-    audio = whisper.load_audio(audio)
+def inference(audio_file):
+    audio = whisper.load_audio(audio_file)
     audio = whisper.pad_or_trim(audio)
-
     mel = whisper.log_mel_spectrogram(audio).to(model.device)
-
     _, probs = model.detect_language(mel)
-
     if max(probs, key=probs.get) == "en":
         _model = model_en
     else:
         _model = model
 
-    options = whisper.DecodingOptions(fp16=False)
-    result = whisper.decode(_model, mel, options)
+    # options = whisper.DecodingOptions(fp16=False)
+    # result = whisper.decode(_model, mel, options)
 
+    result = _model.transcribe(audio_file)
     segmented_text_list = []
 
-    for segment in result:
+    for segment in result.result:
         segmented_text_list.append(
             f'{segment["start"]:.4f} - {segment["end"]:.4f}: {segment["text"]}')
     segmented_text = "\n".join(segmented_text_list)
@@ -112,61 +117,20 @@ block = gr.Blocks(css=css)
 with block:
     gr.HTML(
         """
-        <div style="text-align: center; max-width: 650px; margin: 0 auto;">
-          <div
-            style="
-              display: inline-flex;
-              align-items: center;
-              gap: 0.8rem;
-              font-size: 1.75rem;
-            "
-          >
-            <svg
-              width="0.65em"
-              height="0.65em"
-              viewBox="0 0 115 115"
-              fill="none"
-              xmlns="http://www.w3.org/2000/svg"
-            >
-              <rect width="23" height="23" fill="white"></rect>
-              <rect y="69" width="23" height="23" fill="white"></rect>
-              <rect x="23" width="23" height="23" fill="#AEAEAE"></rect>
-              <rect x="23" y="69" width="23" height="23" fill="#AEAEAE"></rect>
-              <rect x="46" width="23" height="23" fill="white"></rect>
-              <rect x="46" y="69" width="23" height="23" fill="white"></rect>
-              <rect x="69" width="23" height="23" fill="black"></rect>
-              <rect x="69" y="69" width="23" height="23" fill="black"></rect>
-              <rect x="92" width="23" height="23" fill="#D9D9D9"></rect>
-              <rect x="92" y="69" width="23" height="23" fill="#AEAEAE"></rect>
-              <rect x="115" y="46" width="23" height="23" fill="white"></rect>
-              <rect x="115" y="115" width="23" height="23" fill="white"></rect>
-              <rect x="115" y="69" width="23" height="23" fill="#D9D9D9"></rect>
-              <rect x="92" y="46" width="23" height="23" fill="#AEAEAE"></rect>
-              <rect x="92" y="115" width="23" height="23" fill="#AEAEAE"></rect>
-              <rect x="92" y="69" width="23" height="23" fill="white"></rect>
-              <rect x="69" y="46" width="23" height="23" fill="white"></rect>
-              <rect x="69" y="115" width="23" height="23" fill="white"></rect>
-              <rect x="69" y="69" width="23" height="23" fill="#D9D9D9"></rect>
-              <rect x="46" y="46" width="23" height="23" fill="black"></rect>
-              <rect x="46" y="115" width="23" height="23" fill="black"></rect>
-              <rect x="46" y="69" width="23" height="23" fill="black"></rect>
-              <rect x="23" y="46" width="23" height="23" fill="#D9D9D9"></rect>
-              <rect x="23" y="115" width="23" height="23" fill="#AEAEAE"></rect>
-              <rect x="23" y="69" width="23" height="23" fill="black"></rect>
-            </svg>
             <h1 style="font-weight: 900; margin-bottom: 7px;">
               Audio Transcription using OpenAI Whisper
             </h1>
-          </div>
           <p style="margin-bottom: 10px; font-size: 94%">
             Whisper is a general-purpose speech recognition model.
             Simple wrapping to be used as an API.
           </p>
-        </div>
         """
     )
     with gr.Group():
         with gr.Box():
+            sz = gr.Dropdown(label="Model Size", choices=[
+                'base', 'small', 'medium', 'large'], value='base')
+
             with gr.Row().style(mobile_collapse=False, equal_height=True):
                 audio = gr.Audio(
                     label="Input Audio",
@@ -180,11 +144,8 @@ with block:
 
     btn.click(inference, inputs=[audio], outputs=[
               text], api_name="transcription")
+
+    sz.change(change_model, inputs=[sz], outputs=[])
 
-    gr.HTML('''
-        <div class="footer">
-        </p>
-        </div>
-    ''')
 
 block.launch()
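A note on the new code paths: `change_model` assigns to `model`, `model_en`, and `current_size` without declaring them `global`, so the module-level models that `inference` reads are never actually swapped, and Whisper's `transcribe()` returns a plain dict whose timestamped segments live under `result["segments"]`, so iterating `result.result` will raise. A minimal sketch of how these two functions could look, assuming the same module-level globals as app.py:

import whisper

model = whisper.load_model("small")
model_en = whisper.load_model("small.en")
current_size = "small"  # size of the checkpoints loaded above


def change_model(size):
    # Without `global`, these assignments would only create locals and
    # the models used by inference() would never change.
    # Note: English-only checkpoints exist for tiny/base/small/medium,
    # so "large" would need special-casing (there is no "large.en").
    global model, model_en, current_size
    if size == current_size:
        return
    model = whisper.load_model(size)
    model_en = whisper.load_model(f"{size}.en")
    current_size = size


def inference(audio_file):
    audio = whisper.load_audio(audio_file)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Route English audio to the English-only checkpoint.
    _, probs = model.detect_language(mel)
    _model = model_en if max(probs, key=probs.get) == "en" else model

    # transcribe() returns a dict; the timestamped pieces are under
    # result["segments"], each with "start", "end" and "text" keys.
    result = _model.transcribe(audio_file)
    return "\n".join(
        f'{seg["start"]:.4f} - {seg["end"]:.4f}: {seg["text"]}'
        for seg in result["segments"]
    )

Loaded models could also be cached per size (for example, a dict keyed by size) so that switching back and forth in the dropdown does not reload checkpoints each time.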
requirements.txt ADDED
@@ -0,0 +1 @@
+git+https://github.com/openai/whisper.git
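With the dependency declared in requirements.txt, the Space installs Whisper at build time, which is presumably why the runtime `os.system("pip install ...")` call at the top of app.py is now commented out: build-time installation avoids re-installing on every restart.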