from faster_whisper import WhisperModel

# Shared lazily-loaded model instance and the size string it was loaded with.
model = None
model_size = None


def load_model(_model_size):
    """Load (or reload) the global WhisperModel when the requested size changes.

    Tries CUDA with float16 first and falls back to CPU with int8 if GPU
    initialization fails. Does nothing when ``_model_size`` is falsy or the
    model of that size is already loaded.

    :param _model_size: faster-whisper model size/name (e.g. "base", "large-v3").
    """
    global model_size, model
    if _model_size and model_size != _model_size:
        model_size = _model_size
        try:
            model = WhisperModel(model_size, device="cuda", compute_type="float16")
        except Exception:
            # GPU unavailable or CUDA init failed — fall back to CPU inference.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            model = WhisperModel(model_size, device="cpu", compute_type="int8")


def speech_to_text(audio_file, _model_size=None):
    """Transcribe a Japanese audio file with faster-whisper.

    :param audio_file: path (or file-like object) accepted by WhisperModel.transcribe.
    :param _model_size: optional model size; triggers a (re)load via ``load_model``.
    :returns: tuple ``(text_only, text_with_timestamps)`` — the plain transcript
        and a tab-separated ``start<TAB>end<TAB>text`` version, one segment per
        line, each line newline-terminated.
    :raises RuntimeError: if no model has ever been loaded.
    """
    load_model(_model_size)
    if model is None:
        # Fail loudly instead of an opaque AttributeError on `model.transcribe`.
        raise RuntimeError("No model loaded; pass _model_size on the first call.")
    segments, _info = model.transcribe(
        audio_file,
        language='ja',
        beam_size=5,
        vad_filter=True,
        without_timestamps=False,
    )
    # Accumulate lines and join once — avoids quadratic string concatenation.
    text_lines = []
    timestamped_lines = []
    for segment in segments:
        text_lines.append(f"{segment.text}\n")
        timestamped_lines.append(
            f"{segment.start:.2f}\t{segment.end:.2f}\t{segment.text}\n"
        )
    return ''.join(text_lines), ''.join(timestamped_lines)