import librosa
import numpy as np
import gradio as gr
import soundfile as sf
from transformers import pipeline

# Load the ASR pipeline once at import time so every request reuses the
# same model instance instead of reloading weights per call.
model = pipeline(
    task="automatic-speech-recognition",
    model="distil-whisper/distil-small.en",
)

# distil-whisper models are trained on 16 kHz audio.
TARGET_SAMPLE_RATE = 16000


def transcribe_audio(filepath):
    """Transcribe an audio file to English text.

    Parameters
    ----------
    filepath : str or None
        Path to the audio file to transcribe. Gradio passes ``None`` when
        the user submits without recording or uploading anything, so that
        case is handled explicitly instead of crashing inside ``sf.read``.

    Returns
    -------
    str
        The transcription produced by the distil-whisper pipeline, or a
        human-readable message when no audio was provided.
    """
    if filepath is None:
        return "No audio provided. Please record or upload an audio file."

    audio, sample_rate = sf.read(filepath)
    # soundfile returns (frames, channels) for multi-channel input while
    # librosa expects (channels, frames) — transpose before downmixing.
    # (For mono files the array is 1-D and both calls are no-ops.)
    audio_mono = librosa.to_mono(np.transpose(audio))

    # Resample only when needed; skipping the identity resample avoids
    # pointless work for files already recorded at 16 kHz.
    if sample_rate != TARGET_SAMPLE_RATE:
        audio_mono = librosa.resample(
            audio_mono, orig_sr=sample_rate, target_sr=TARGET_SAMPLE_RATE
        )

    output = model(
        audio_mono,
        chunk_length_s=30,  # split long-form audio into 30 s chunks
        batch_size=4,
    )
    return output["text"]


mic_transcribe_interface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Textbox(label="Transcription", lines=3),
    allow_flagging="never",
    title="Transcribe Audio from your Microphone",
)

file_transcribe_interface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Textbox(label="Transcription", lines=3),
    allow_flagging="never",
    title="Transcribe Audio from a File",
    examples=["./examples/sample_audio_1.wav"],
)