import nemo.collections.asr as nemo_asr
from pydub import AudioSegment
from io import BytesIO
import tempfile
import os
import gradio as gr


def convert_to_mono(input_file):
    """Load an audio file (any format pydub/ffmpeg supports) and return it as
    a mono WAV wrapped in an in-memory BytesIO buffer.

    Args:
        input_file: Path to the audio file (mp3, wav, m4a, ...).

    Returns:
        BytesIO: WAV-encoded mono audio, with the read pointer rewound to 0.
    """
    sound = AudioSegment.from_file(input_file)
    # Down-mix to a single channel: the ASR model expects mono input.
    sound = sound.set_channels(1)
    converted = BytesIO()
    sound.export(converted, format="wav")
    converted.seek(0)  # Rewind so callers can read from the start
    return converted


# Load the pre-trained Georgian ASR model once at import time (model download
# / initialization is expensive, so it must not happen per request).
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
    model_name="nvidia/stt_ka_fastconformer_hybrid_large_pc"
)


def transcribe_audio(audio_file):
    """Transcribe an uploaded audio file and return the recognized text.

    Args:
        audio_file: Filesystem path to the uploaded audio, as provided by
            ``gr.Audio(type="filepath")``. May be None/empty if nothing
            was uploaded.

    Returns:
        str: The transcription, or "" when no file was provided.
    """
    if not audio_file:
        # Explicit empty result instead of an implicit None fall-through.
        return ""

    mono_audio = convert_to_mono(audio_file)

    # NeMo's transcribe() takes file paths, so spill the in-memory WAV to a
    # named temp file; delete=False lets us close it before NeMo reads it
    # (required on platforms where an open file can't be reopened, e.g. Windows).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_file.write(mono_audio.read())
        temp_file_path = temp_file.name

    try:
        res = asr_model.transcribe([temp_file_path])
    finally:
        # Always remove the temp file, even if transcription raises.
        os.remove(temp_file_path)

    # res is a list with one entry per input file; the first element of that
    # entry is the best hypothesis text for the hybrid RNNT/CTC model.
    return res[0][0]


# Gradio UI: upload an audio file, display the transcription as text.
interface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),  # Upload audio; handler receives a file path
    outputs="text",
    title="ASR Transcription",
    description="Upload an audio file (mp3, wav, or m4a) and get the transcription.",
)

# Guard the launch so importing this module doesn't start a web server.
if __name__ == "__main__":
    interface.launch()