import os
import subprocess
import sys

# Install Whisper straight from its GitHub repository before importing it.
# Running pip as `sys.executable -m pip` installs into the interpreter that is
# executing this script, and passing an argument list avoids spawning a shell.
# The exit status is deliberately not checked (the previous `os.system` call
# ignored it as well), so an already-installed copy keeps working if pip fails.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "git+https://github.com/openai/whisper.git"],
    check=False,
)

# These imports must stay below the install step above.
import gradio as gr
import whisper

from share_btn import community_icon_html, loading_icon_html, share_js

# Load the "small" Whisper checkpoint once at import time; `inference` reuses it.
model = whisper.load_model("small")
def inference(audio):
    """Transcribe an audio file with the module-level Whisper model.

    The waveform is padded or trimmed to a single fixed-length window before
    decoding, so only the beginning of a long recording is transcribed.

    Args:
        audio: Path of the audio file to transcribe, as accepted by
            ``whisper.load_audio``.

    Returns:
        A 4-tuple: the decoded text followed by three
        ``gr.update(visible=True)`` values, one for each extra output wired
        to the click handler.

    Raises:
        ValueError: If ``audio`` is ``None`` (nothing was provided).
    """
    if audio is None:
        # Fail with a clear message instead of an opaque error from the
        # audio loader when there is nothing to transcribe.
        raise ValueError("No audio provided: record or upload a clip before transcribing.")

    # Load the waveform and fit it to the fixed window size Whisper decodes.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    # No separate language-detection pass here: its result was never used,
    # and decoding with no language set performs the detection itself.
    options = whisper.DecodingOptions(fp16=False)  # half precision disabled
    result = whisper.decode(model, mel, options)

    return result.text, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
# Build the demo UI.
# NOTE(review): `css` is not defined in the visible part of this file —
# confirm it is declared before this line.
block = gr.Blocks(css=css)
with block:
    # Introductory text shown above the controls.
    gr.Markdown(
        "Whisper is a general-purpose speech recognition model. "
        "It is trained on a large dataset of diverse audio and is also a "
        "multi-task model that can perform multilingual speech recognition "
        "as well as speech translation and language identification. "
        "This demo cuts audio after around 30 secs."
    )
    # NOTE(review): this sentence ends with a colon, so a Colab link presumably
    # followed it; the link target is not visible here — restore it.
    gr.Markdown("You can skip the queue by using google colab for the space:")

    audio = gr.Audio(
        label="Input Audio",
        # `inference` passes this value to `whisper.load_audio`, which opens a
        # file by path, so ask Gradio for a path rather than raw sample data.
        type="filepath",
    )
    btn = gr.Button("Transcribe")
    text = gr.Textbox(show_label=False, elem_id="result-textarea")

    # Share widgets start hidden; `inference` returns `gr.update(visible=True)`
    # for each of them once a transcription is available.
    community_icon = gr.HTML(community_icon_html, visible=False)
    loading_icon = gr.HTML(loading_icon_html, visible=False)
    # NOTE(review): the button label and elem_id are assumptions — confirm
    # them against the selectors used by `share_js` in share_btn.
    share_button = gr.Button("Share to community", elem_id="share-btn", visible=False)

    btn.click(inference, inputs=[audio], outputs=[text, community_icon, loading_icon, share_button])
    # Sharing runs entirely in the browser: no Python callback, only the JS snippet.
    share_button.click(None, [], [], _js=share_js)

    # Footer credit.
    gr.Markdown("Model by OpenAI - Gradio Demo by 🤗 Hugging Face")