import gradio as gr
from transformers import pipeline
import numpy as np
import librosa
import traceback

# Initialize the ASR pipeline
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="distil-whisper/distil-small.en",
)


def transcribe_long_form(file_info):
    if file_info is None:
        return "No audio file provided."
    try:
        # Gradio's "numpy" audio type yields a (sample_rate, audio_array) tuple
        sample_rate, audio_array = file_info

        # Convert integer PCM samples to float32 in [-1, 1]
        if np.issubdtype(audio_array.dtype, np.integer):
            audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max

        # Collapse multi-channel (stereo) audio to mono; Gradio delivers
        # shape (samples, channels) while librosa expects (channels, samples)
        if audio_array.ndim > 1:
            audio_mono = librosa.to_mono(audio_array.T)
        else:
            audio_mono = audio_array

        # Resample to 16 kHz, the rate the Whisper model expects
        if sample_rate != 16000:
            audio_mono = librosa.resample(audio_mono, orig_sr=sample_rate, target_sr=16000)

        # Transcribe the audio; the pipeline returns a dict with a 'text' field
        result = asr_pipeline({"raw": audio_mono, "sampling_rate": 16000})
        return result["text"]
    except Exception as e:
        traceback.print_exc()
        return f"An error occurred: {e}"


# Define Gradio interfaces for microphone and file upload
mic_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="microphone", type="numpy"),
    outputs=gr.Textbox(label="Transcription", lines=3),
    allow_flagging="never",
)

file_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="upload", type="numpy"),
    outputs=gr.Textbox(label="Transcription", lines=3),
    allow_flagging="never",
)

# Set up the main Gradio app with tabbed interfaces for the two input sources
with gr.Blocks() as demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

# Launch the Gradio app
demo.launch(share=True)
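
# Optional sanity check (a minimal sketch, not part of the original script):
# it assumes a local file named "sample.wav" exists and exercises
# transcribe_long_form() without the Gradio UI. If you want to try it,
# run these lines before demo.launch(), since launch() blocks the script.
#
#   audio, sr = librosa.load("sample.wav", sr=None, mono=False)
#   audio = audio.T if audio.ndim > 1 else audio  # match Gradio's (samples, channels) layout
#   print(transcribe_long_form((sr, audio)))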