import gradio as gr
#import whisper
from faster_whisper import WhisperModel

model_size = 'aka7774/whisper-large-v3-ct2'

#model = whisper.load_model(model_size)
#model = WhisperModel(model_size, device="cuda", compute_type="float16")
# or run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# or run on CPU with INT8
model = WhisperModel(model_size, device="cpu", compute_type="int8")

def speech_to_text(audio_file, _model_size):
    global model_size, model
    # Reload the model only when the user picks a different size.
    if model_size != _model_size:
        model_size = _model_size
        #model = whisper.load_model(model_size)
        # Match the initial load: float16 is not supported on CPU, so reload
        # with the same device/compute_type as above.
        model = WhisperModel(model_size, device="cpu", compute_type="int8")
    #result = model.transcribe(audio_file)
    segments, info = model.transcribe(audio_file, beam_size=5)
    #return result["text"]
    # segments is a generator; join the decoded text of every segment.
    return "".join(segment.text for segment in segments)

gr.Interface(
    fn=speech_to_text,
    inputs=[
        gr.Audio(source="upload", type="filepath"),  # note: Gradio 4.x renames this to sources=["upload"]
        gr.Dropdown(value=model_size, choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3", "aka7774/whisper-large-v3-ct2"]),
    ],
    outputs="text",
).launch()
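
# A minimal sketch (an assumption, not part of the original app): transcribe()
# also yields per-segment timestamps and language-detection info, which could
# be surfaced instead of plain text. The function name is hypothetical, and it
# is left commented out because launch() above blocks until the app exits.
#def speech_to_text_with_timestamps(audio_file):
#    segments, info = model.transcribe(audio_file, beam_size=5)
#    header = f"Detected language: {info.language} (p={info.language_probability:.2f})"
#    lines = [f"[{s.start:.1f}s -> {s.end:.1f}s] {s.text}" for s in segments]
#    return "\n".join([header] + lines)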