import os

# Install Whisper from GitHub at startup (a common pattern for Hugging Face Spaces).
os.system("pip install git+https://github.com/openai/whisper.git")

import gradio as gr
import whisper

from share_btn import community_icon_html, loading_icon_html, share_js

model = whisper.load_model("medium")

# Map long language names to Whisper's short codes. "auto" maps to None,
# which makes Whisper detect the spoken language itself.
languages = {'auto': None} | {
    long_name: short_name
    for short_name, long_name in whisper.tokenizer.LANGUAGES.items()
}


def inference(audio, language_long_name):
    # Load the recording and pad/trim it to the 30-second window Whisper decodes.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Compute the log-Mel spectrogram on the same device as the model.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # _, probs = model.detect_language(mel)

    options = whisper.DecodingOptions(fp16=False, language=languages[language_long_name])
    result = whisper.decode(model, mel, options)

    print(result.text)
    # Reveal the community/share widgets alongside the transcription.
    return result.text, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)


css = """
        .gradio-container {
            font-family: 'IBM Plex Sans', sans-serif;
        }
        .gr-button {
            color: white;
            border-color: black;
            background: black;
        }
        input[type='range'] {
            accent-color: black;
        }
        .dark input[type='range'] {
            accent-color: #dfdfdf;
        }
        .container {
            max-width: 730px;
            margin: auto;
            padding-top: 1.5rem;
        }
        .details:hover {
            text-decoration: underline;
        }
        .gr-button {
            white-space: nowrap;
        }
        .gr-button:focus {
            border-color: rgb(147 197 253 / var(--tw-border-opacity));
            outline: none;
            box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
            --tw-border-opacity: 1;
            --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
            --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px + var(--tw-ring-offset-width)) var(--tw-ring-color);
            --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
            --tw-ring-opacity: .5;
        }
        .footer {
            margin-bottom: 45px;
            margin-top: 35px;
            text-align: center;
            border-bottom: 1px solid #e5e5e5;
        }
        .footer>p {
            font-size: .8rem;
            display: inline-block;
            padding: 0 10px;
            transform: translateY(10px);
            background: white;
        }
        .dark .footer {
            border-color: #303030;
        }
        .dark .footer>p {
            background: #0b0f19;
        }
        .prompt h4 {
            margin: 1.25em 0 .25em 0;
            font-weight: bold;
            font-size: 115%;
        }
        .animate-spin {
            animation: spin 1s linear infinite;
        }
        @keyframes spin {
            from { transform: rotate(0deg); }
            to { transform: rotate(360deg); }
        }
        #share-btn-container {
            display: flex;
            margin-top: 1.5rem !important;
            padding-left: 0.5rem !important;
            padding-right: 0.5rem !important;
            background-color: #000000;
            justify-content: center;
            align-items: center;
            border-radius: 9999px !important;
            width: 13rem;
        }
        #share-btn {
            all: initial;
            color: #ffffff;
            font-weight: 600;
            cursor: pointer;
            font-family: 'IBM Plex Sans', sans-serif;
            margin-left: 0.5rem !important;
            padding-top: 0.25rem !important;
            padding-bottom: 0.25rem !important;
        }
        #share-btn * {
            all: unset;
        }
"""

block = gr.Blocks(css=css)

with block:
    gr.HTML(
        """
            <div style="text-align: center; max-width: 650px; margin: 0 auto;">
                <h1>🤫 Whisper demo</h1>

                <p>
                    Whisper is a general-purpose speech recognition model from OpenAI.
                    It is trained on a large and diverse dataset of audio, and can
                    transcribe speech, identify the spoken language, and translate it.
                    This demo truncates audio after roughly 30 seconds.
                </p>
            </div>

""" ) with gr.Group(): with gr.Box(): with gr.Row().style(mobile_collapse=False, equal_height=True): audio = gr.Audio( label="Input Audio", show_label=False, source="microphone", type="filepath" ) language_long_name = gr.Dropdown( list(languages.keys()), value="auto", label="Taal van gesproken tekst", info="Taal van de gesproken tekst. Kies auto voor automatische detectie." ) btn = gr.Button("Transcribeer") text = gr.Textbox(show_label=False, elem_id="result-textarea") with gr.Group(elem_id="share-btn-container"): community_icon = gr.HTML(community_icon_html, visible=False) loading_icon = gr.HTML(loading_icon_html, visible=False) share_button = gr.Button("Share to community", elem_id="share-btn", visible=False) btn.click(inference, inputs=[audio, language_long_name], outputs=[text, community_icon, loading_icon, share_button]) share_button.click(None, [], [], _js=share_js) gr.HTML(''' ''') block.launch()