"""Gradio demo that transcribes Kinyarwanda speech with three STT back-ends.

One microphone recording is sent to a SpeechBrain wav2vec2 model, to the
project's ``SpeechToTextEngine`` and to the project's NeMo ``transcribe``
helper; the three transcripts are shown side by side.
"""

import gradio as gr
import librosa
import soundfile as sf
import torch
import warnings
import os
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model
from engine import SpeechToTextEngine
import wave
from nemo_asr import transcribe

# Installed before the speechbrain import, as in the original ordering.
warnings.filterwarnings("ignore")
from speechbrain.pretrained import EncoderDecoderASR

# Loaded once at import time so every request reuses the same model instance.
asr_model = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-wav2vec2-commonvoice-rw",
    savedir="pretrained_models/asr-wav2vec2-commonvoice-rw",
)
# asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")

# Name of the NeMo model handed to the project's ``transcribe`` helper.
NEMO_MODEL_NAME = "stt_rw_conformer_transducer_large"

# The interface declares this many output textboxes; every return path of
# ``asr_transcript`` must produce exactly this many values.
_OUTPUT_COUNT = 3

_NO_AUDIO_MESSAGE = "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)"
_INVALID_FILE_MESSAGE = "File not valid"


def _message_for_all_outputs(message):
    """Return ``message`` repeated once per declared output textbox."""
    return (message,) * _OUTPUT_COUNT


def asr_transcript(audio):
    """Transcribe one recording with all three speech-to-text models.

    Args:
        audio: The value Gradio passes for the audio input, or ``None`` when
            nothing was recorded. It must expose a ``.name`` attribute holding
            the path of the recorded file (presumably a temp-file object, given
            ``type="file"`` on the input component -- confirm against the
            installed Gradio version).

    Returns:
        A 3-tuple ``(speechbrain_text, coqui_text, nemo_text)``. When no audio
        was supplied, or the value is falsy, the same explanatory message is
        returned in all three positions so the result always matches the three
        declared outputs.

    Raises:
        ValueError: If the recorded file exists but contains no bytes.
    """
    if audio is None:
        return _message_for_all_outputs(_NO_AUDIO_MESSAGE)
    if not audio:
        return _message_for_all_outputs(_INVALID_FILE_MESSAGE)

    # Read the raw bytes first so an empty recording is rejected before any
    # of the (slow) models is run.
    with open(audio.name, 'rb') as f:
        audio_proper = f.read()
    if not audio_proper:
        raise ValueError('Audio not provided')

    text_asr = asr_model.transcribe_file(audio.name)

    # NOTE(review): the helper receives the file object itself rather than
    # ``audio.name`` -- kept as in the original; confirm against nemo_asr.
    text_nemo_transducer = transcribe(audio, NEMO_MODEL_NAME)

    # NOTE(review): a new engine is built per request, as before; if its
    # constructor loads a model this could be hoisted to module level.
    stt_engine = SpeechToTextEngine()
    text_coqui = stt_engine.run(audio_proper)

    return text_asr.lower(), text_coqui, text_nemo_transducer


gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Kinyarwanda Speech Recognition",
    description="Record an audio clip from browser using microphone, and let AI do the hard work of transcribing.",
    article=""" This demo showcases two pretrained STT models the first model from speechbrain(wave2vec+CTC models)(1,2gb) is 30 times larger compared to the coqui STT (deepspeech model)(45mb). 
""",
    inputs=[
        gr.inputs.Audio(source="microphone", type="file", optional=False, label="Record from microphone"),
    ],
    outputs=[
        gr.outputs.Textbox(label="Recognized speech from speechbrain model"),
        gr.outputs.Textbox(label="Recognized speech from coqui STT model"),
        gr.outputs.Textbox(label="Recognized speech from NVIDIA Conformer transduver large model"),
    ],
    # examples = [["sample_1.wav"],["sample_2.wav"]]
)

gradio_ui.launch(enable_queue=True)