Spaces:
Runtime error
Runtime error
File size: 2,803 Bytes
01c308f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
import os
import gradio as gr
import whisper
from whisper import tokenizer
import time
current_size = 'base'
model = whisper.load_model(current_size)
AUTO_DETECT_LANG = "Auto Detect"
def transcribe(audio, state={}, model_size='base', delay=1.2, lang=None, translate=False):
time.sleep(delay - 1)
global current_size
global model
if model_size != current_size:
current_size = model_size
model = whisper.load_model(current_size)
transcription = model.transcribe(
audio,
language = lang if lang != AUTO_DETECT_LANG else None
)
state['transcription'] += transcription['text'] + " "
if translate:
x = whisper.load_audio(audio)
x = whisper.pad_or_trim(x)
mel = whisper.log_mel_spectrogram(x).to(model.device)
options = whisper.DecodingOptions(task = "translation")
translation = whisper.decode(model, mel, options)
state['translation'] += translation.text + " "
return state['transcription'], state['translation'], state, f"detected language: {transcription['language']}"
title = "OpenAI's Whisper Real-time Demo"
description = "A simple demo of OpenAI's [**Whisper**](https://github.com/openai/whisper) speech recognition model. This demo runs on a CPU. For faster inference choose 'tiny' model size and set the language explicitly."
model_size = gr.Dropdown(label="Model size", choices=['base', 'tiny', 'small', 'medium', 'large'], value='base')
delay_slider = gr.inputs.Slider(minimum=1, maximum=5, default=1.2, label="Rate of transcription")
available_languages = sorted(tokenizer.TO_LANGUAGE_CODE.keys())
available_languages = [lang.capitalize() for lang in available_languages]
available_languages = [AUTO_DETECT_LANG]+available_languages
lang_dropdown = gr.inputs.Dropdown(choices=available_languages, label="Language", default=AUTO_DETECT_LANG, type="value")
if lang_dropdown==AUTO_DETECT_LANG:
lang_dropdown=None
translate_checkbox = gr.inputs.Checkbox(label="Translate to English", default=False)
transcription_tb = gr.Textbox(label="Transcription", lines=10, max_lines=20)
translation_tb = gr.Textbox(label="Translation", lines=10, max_lines=20)
detected_lang = gr.outputs.HTML(label="Detected Language")
state = gr.State({"transcription": "", "translation": ""})
gr.Interface(
fn=transcribe,
inputs=[
gr.Audio(source="microphone", type="filepath", streaming=True),
state,
model_size,
delay_slider,
lang_dropdown,
translate_checkbox
],
outputs=[
transcription_tb,
translation_tb,
state,
detected_lang
],
live=True,
allow_flagging='never',
title=title,
description=description,
).launch(
# enable_queue=True,
# debug=True
) |