import logging
import warnings

import gradio as gr
import librosa

# import torchaudio
from transformers import pipeline
from transformers.utils.logging import disable_progress_bar

warnings.filterwarnings("ignore")
disable_progress_bar()

logging.basicConfig(
    format="%(asctime)s [%(levelname)s] [%(name)s] %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%SZ",
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

MODEL_NAME = "bofenghuang/asr-wav2vec2-ctc-french"
SAMPLE_RATE = 16_000

pipe = pipeline(model=MODEL_NAME)
logger.info("ASR pipeline has been initialized")


def process_audio_file(audio_file):
    """Load an audio file as a mono waveform resampled to SAMPLE_RATE."""
    # torchaudio-based loading, kept for reference:
    # waveform, sample_rate = torchaudio.load(audio_file)
    # waveform = waveform.squeeze(axis=0)  # mono
    # # resample
    # if sample_rate != SAMPLE_RATE:
    #     resampler = torchaudio.transforms.Resample(sample_rate, SAMPLE_RATE)
    #     waveform = resampler(waveform)

    # sr=None keeps the file's native sample rate so the resampling check below is meaningful
    waveform, sample_rate = librosa.load(audio_file, sr=None, mono=True)

    # resample to the rate expected by the model
    if sample_rate != SAMPLE_RATE:
        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=SAMPLE_RATE)

    return waveform


def transcribe(microphone_audio_file, uploaded_audio_file):
    warning_message = ""
    if (microphone_audio_file is not None) and (uploaded_audio_file is not None):
        warning_message = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
        audio_file = microphone_audio_file
    elif (microphone_audio_file is None) and (uploaded_audio_file is None):
        return "ERROR: You have to either use the microphone or upload an audio file"
    elif microphone_audio_file is not None:
        audio_file = microphone_audio_file
    else:
        audio_file = uploaded_audio_file

    audio_data = process_audio_file(audio_file)

    # chunked inference with striding allows transcribing audio of arbitrary length
    # text = pipe(audio_data)["text"]
    text = pipe(audio_data, chunk_length_s=30, stride_length_s=5)["text"]
    logger.info(f"Transcription for {audio_file}: {text}")

    return warning_message + text


iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Record something...", optional=True),
        gr.Audio(source="upload", type="filepath", label="Upload some audio file...", optional=True),
    ],
    outputs="text",
    layout="horizontal",
    # theme="huggingface",
    title="Speech-to-Text in French",
    description=f"Realtime demo for French automatic speech recognition. Demo uses the fine-tuned checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of arbitrary length.",
    allow_flagging="never",
)

# iface.launch(server_name="0.0.0.0", debug=True, share=True)
iface.launch(enable_queue=True)