import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from IPython.display import Audio as IPythonAudio
import os
import soundfile as sf
import io
import numpy
import librosa
from pydub import AudioSegment

# Distil-Whisper ASR pipeline; the model expects 16 kHz mono float audio.
asr = pipeline(task="automatic-speech-recognition",
               model="distil-whisper/distil-small.en")


def __convert_to_mp3(path: str, extension: str) -> str:
    """Convert the audio file at *path* (format *extension*) to MP3.

    Returns the path of the newly written ``.mp3`` file, which sits next to
    the original with the same base name.
    """
    print("converting audio...")
    audio = AudioSegment.from_file(path, format=extension)
    # BUGFIX: splitext keeps the base name; the old `path.split('.')[1]`
    # picked the EXTENSION instead (e.g. "audio.m4a" -> "m4a.mp3").
    mp3_path = os.path.splitext(path)[0] + ".mp3"
    audio.export(mp3_path, format="mp3")
    return mp3_path


def __convert_to_mono(audio, sr: int = 16000):
    """Downmix *audio* to mono and resample it to 16 kHz.

    *audio* is the (frames, channels) array returned by ``sf.read``;
    *sr* is its actual sample rate (defaults to 16000 for backward
    compatibility with the previous signature).
    """
    # librosa expects channels-first, soundfile returns channels-last.
    transposed_audio = numpy.transpose(audio)
    audio_mono = librosa.to_mono(transposed_audio)
    # BUGFIX: resample from the file's real rate; the old code used
    # orig_sr=16000 unconditionally, making the resample a no-op.
    audio_16KHz = librosa.resample(audio_mono, orig_sr=sr, target_sr=16000)
    return audio_16KHz


def run_asr_pipeline(audio_file_url: str, extension: str) -> str:
    """Transcribe the audio file at *audio_file_url* and return its text.

    Non-MP3 inputs are first converted to MP3 via pydub/ffmpeg.
    """
    if "mp3" not in extension.lower():
        audio_file_url = __convert_to_mp3(extension=extension,
                                          path=audio_file_url)
    # Keep the real sample rate so the resample step actually resamples.
    audio, sr = sf.read(audio_file_url)
    audio = __convert_to_mono(audio=audio, sr=sr)
    return asr(audio)["text"]


# print(run_asr_pipeline(audio_file_url="m4a.mp3",
#                        extension="mp3"
#                        ))