"""Gradio demo that transcribes Kinyarwanda speech with a SpeechBrain model.

The script loads the ``speechbrain/asr-wav2vec2-commonvoice-rw`` model once
at import time, builds a Gradio interface accepting either an uploaded audio
file or a microphone recording, and launches the web UI.
"""

import os
import warnings

import gradio as gr
import librosa
import soundfile as sf
import torch
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model

# Installed before the speechbrain import, as in the original script, so that
# warnings raised from that point on are silenced.
warnings.filterwarnings("ignore")

from speechbrain.pretrained import EncoderDecoderASR

# Loaded once at module level so every request reuses the same model instance.
asr_model = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-wav2vec2-commonvoice-rw",
    savedir="pretrained_models/asr-wav2vec2-commonvoice-rw",
)

# Shown in the output textbox when neither input carries any audio.
_NO_AUDIO_MESSAGE = (
    "Please provide audio by uploading a file or by recording audio using "
    "microphone by pressing Record (And allow usage of microphone)"
)


def asr_transcript(audio, audio_microphone, model_params):
    """Transcribe the provided audio and return the recognized text.

    Args:
        audio: Uploaded audio file object exposing a ``.name`` path, or None.
        audio_microphone: Microphone recording file object exposing a
            ``.name`` path, or None. Takes precedence over ``audio`` when
            truthy.
        model_params: Value of the model dropdown. Currently unused; only a
            single model is loaded by this script.

    Returns:
        A single string: the transcription, or a user-facing message when no
        usable audio was supplied. Exactly one value is returned because the
        interface declares exactly one output component.
    """
    selected = audio_microphone if audio_microphone else audio

    # Both inputs missing: ask the user to supply audio. The original returned
    # this message twice as a tuple, which does not match the single Textbox
    # output declared on the interface.
    if selected is None and audio_microphone is None:
        return _NO_AUDIO_MESSAGE

    # A falsy-but-not-None input (nothing usable to read a path from).
    if not selected:
        return "File not valid"

    return asr_model.transcribe_file(selected.name)


gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Kinyarwanda Speech Recognition",
    description="Upload an audio clip or record from browser using microphone, and let AI do the hard work of transcribing.",
    article=""" This demo showcases the pretrained model from deepspeech. """,
    inputs=[
        gr.inputs.Audio(label="Upload Audio File", type="file", optional=True),
        gr.inputs.Audio(
            source="microphone",
            type="file",
            optional=True,
            label="Record from microphone",
        ),
        gr.inputs.Dropdown(
            choices=["deepspeech", "coqui (soon)"],
            type="value",
            default="deepspeech",
            label="Select speech recognition model ",
            optional=False,
        ),
    ],
    outputs=[gr.outputs.Textbox(label="Recognized speech")],
    examples=[
        ["sample_1.wav", "sample_1.wav", "deepspeech"],
        ["sample_2.wav", "sample_2.wav", "deepspeech"],
    ],
)

gradio_ui.launch(enable_queue=True)