import gradio as gr
import torch
from faster_whisper import WhisperModel

model_size = 'large-v3'


def load_model(model_size):
    # Use the GPU with float16 when available; fall back to int8 on CPU.
    if torch.cuda.is_available():
        model = WhisperModel(model_size, device="cuda", compute_type="float16")
        # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
    else:
        model = WhisperModel(model_size, device="cpu", compute_type="int8")
    return model


def speech_to_text(audio_file, _model_size):
    global model_size, model
    # Reload the model only when a different size is selected in the dropdown.
    if model_size != _model_size:
        model_size = _model_size
        model = load_model(model_size)
    segments, info = model.transcribe(
        audio_file,
        language='ja',
        beam_size=5,
        vad_filter=True,
        without_timestamps=False,
    )
    text = ''
    for segment in segments:
        text += f"{segment.start:.2f}\t{segment.end:.2f}\t{segment.text}\n"
    return text


model = load_model(model_size)

gr.Interface(
    fn=speech_to_text,
    inputs=[
        gr.Audio(sources=["upload"], type="filepath"),
        gr.Dropdown(value=model_size, choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"]),
    ],
    outputs="text",
).launch()
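
# Usage notes for this sketch: it assumes `faster-whisper` and `gradio` are
# installed (pip install faster-whisper gradio); torch is used only for the
# torch.cuda.is_available() device check, since faster-whisper itself runs on
# CTranslate2 rather than PyTorch. The model is loaded once at startup and
# reloaded only when a different size is picked in the dropdown, so a large
# model is not re-initialized on every transcription request.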